#########################
#
# R code for the study:
# Naturally-diverse airborne environmental microbial exposures modulate the gut microbiome
# and may provide anxiolytic benefits in mice
# 
# Craig Liddicoat, Harrison Sydnor, Christian Cando-Dumancela, Romy Dresken, Jiajun Liu, Nicholas J.C. Gellie,
# Jacob G. Mills, Jennifer M. Young, Laura S. Weyrich, Mark R. Hutchinson, Philip Weinstein, Martin F. Breed
#
#########################

.libPaths() # "C:/R/R-3.5.1/library"

R.Version()
# "R version 3.5.1 (2018-07-02)"
citation()
# R Core Team (2018). R: A language and environment for statistical computing. R Foundation for Statistical
# Computing, Vienna, Austria. URL https://www.R-project.org/



library(readxl); packageVersion("readxl") # '1.1.0'
library(readr); packageVersion("readr") # '1.1.1'
library(plyr); packageVersion("plyr") # '1.8.4'
library(dplyr); packageVersion("dplyr") # '0.7.6'
library(vegan);packageVersion("vegan") # '2.5.2'
library(phyloseq); packageVersion("phyloseq") # '1.24.0'
library(ggplot2); packageVersion("ggplot2") # '3.0.0'
library(ggrepel); packageVersion("ggrepel") #  '0.8.0'
library(grid); packageVersion("grid") #  '3.5.1'
library(reshape2); packageVersion("reshape2") # '1.4.3'
library(ggsignif); packageVersion("ggsignif") # '0.4.0'
library(moments); packageVersion("moments") # '0.14'
library(doParallel); packageVersion("doParallel") # '1.0.11'
library(tidyr); packageVersion("tidyr") # '0.8.1'
library(DESeq2); packageVersion("DESeq2") # '1.20.0'
library(FSA); packageVersion("FSA") # '0.8.20'
library(rcompanion); packageVersion("rcompanion") # '1.13.2'
library(gridExtra); packageVersion("gridExtra") # '2.3'
library(VennDiagram); packageVersion("VennDiagram") # '1.6.20'
library(eulerr); packageVersion("eulerr") # '5.0.0'
library(gplots); packageVersion("gplots") # '3.0.1'
library(decontam); packageVersion("decontam") #  '1.1.2'
library(rBLAST); packageVersion("rBLAST") #  '0.99.1'   loads Biostrings, masks strsplit() from base package
library(stringr); packageVersion("stringr") # '1.3.1'
library(seqinr); packageVersion("seqinr") # '3.4.5'



workdir <- "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling"
setwd(workdir)
getwd() #

datadir <- "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling/data"
datadir

par.default <- par()

#########################





#### Animal details
#------------------------

mice_info <- read_excel(path= paste0(datadir,"/","Animal-details-main-study.xlsx"),
                        sheet=1, range="A1:N55", col_names = TRUE)

mice_info <- as.data.frame(mice_info)
str(mice_info)

# change char to factor
mice_info[ , c("mouseID", "Sex", "Litter", "Cage Name", "Treatment", "Ear notch pos", "Rack_position" )] <- 
  lapply(mice_info[ , c("mouseID", "Sex", "Litter", "Cage Name", "Treatment", "Ear notch pos", "Rack_position" )], 
         FUN = factor)
str(mice_info)
# 'data.frame':	54 obs. of  14 variables:
# $ mouseID                       : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 37 38 39 19 20 21 22 ...
# $ Animal no                     : num  1 2 3 4 5 6 7 8 9 10 ...
# $ Sex                           : Factor w/ 2 levels "female","male": 2 2 2 1 1 1 2 2 2 2 ...
# $ Litter                        : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 3 11 2 3 11 11 12 13 12 ...
# $ Date of Birth                 : POSIXct, format: "2018-08-02" "2018-08-04" "2018-08-02" "2018-08-02" ...
# $ Cage no                       : num  1 1 1 2 2 2 3 3 3 4 ...
# $ Cage Name                     : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 13 13 13 7 7 7 8 ...
# $ Treatment                     : Factor w/ 3 levels "Control","High",..: 1 1 1 3 3 3 2 2 2 2 ...
# $ Rack                          : num  1 1 1 1 1 1 1 1 1 2 ...
# $ Ear notch ID                  : num  1 2 3 1 2 3 1 2 3 1 ...
# $ Ear notch pos                 : Factor w/ 3 levels "(-) No Notch",..: 1 3 2 1 3 2 1 3 2 1 ...
# $ Age_received_20_Aug           : num  2.6 2.3 2.6 2.6 2.3 2.6 2.6 2.6 2.4 2.6 ...
# $ Age_first_soil_exposure_27_Aug: num  3.6 3.3 3.6 3.6 3.3 3.6 3.6 3.6 3.4 3.6 ...
# $ Rack_position                 : Factor w/ 3 levels "low","middle",..: 3 3 3 2 2 2 1 1 1 3 ...


table(mice_info$Litter)
# Blue 20   Blue 24   Blue 27  Green 13  Green 14  Green 18  Green 21  Green 30 Orange 15 Orange 19   Pink 23   Pink 24 
#       5         2         2         4         2         4         1         2         7         5         4         6 
# Purple 26    Red 22    Red 23 
#         4         2         4


## read-in mouse weights
mice_wt <- read_excel(path= paste0(datadir,"/","Weight-record-sheet-main-study.xlsx"),
                       sheet=1, range="E6:X60", col_names = TRUE)

mice_wt <- as.data.frame(mice_wt)
str(mice_wt)

# remove NA cols
sel.na <- which(!complete.cases(t(mice_wt)))
mice_wt <- mice_wt[ ,-sel.na]

# set mouseID as row name
row.names(mice_wt) <- mice_wt[,1]
mice_wt <- mice_wt[,-1]

temp <- mice_wt

# experiment days
days <- as.numeric(names(mice_wt))
length(days) # 16

# days of treatment
days.treatment <- days - 43339
days.treatment # -6 -5  1  4  8 11 15 17 22 25 29 32 36 39 43 46  # = Treatment days
names(mice_wt)
# [1] "43333" "43334" "43340" "43343" "43347" "43350" "43354" "43356" "43361" "43364" "43368" "43371" "43375" "43378" "43382"
# [16] "43385"

## remove weight of measuring tubs
## read-in tub weights
tub_wt <- read_excel(path= paste0(datadir,"/","Weight-record-sheet-main-study.xlsx"),
                      sheet=1, range="AE7:AF16", col_names = TRUE)

tub_wt <- as.data.frame(tub_wt)
str(tub_wt)
# 'data.frame':	9 obs. of  2 variables:
# $ tub   : chr  "C1" "C2" "C3" "L1" ...
# $ mass_g: num  34.2 34.8 34.5 34.5 33.9 34.1 33.9 33.4 33.6

row.names(tub_wt) <- tub_wt$tub


mice_wt.fix <- mice_wt
mice_wt.fix$tub <- paste0(substring(row.names(mice_wt.fix),first=1,last=1) , substring(row.names(mice_wt.fix),first=nchar(row.names(mice_wt.fix)),last=nchar(row.names(mice_wt.fix))) )

## subtract tub weights from recorded weights

names(mice_wt.fix)
# [1] "43333" "43334" "43340" "43343" "43347" "43350" "43354" "43356" "43361" "43364" "43368" "43371" "43375" "43378" "43382"
# [16] "43385" "tub"


for (d in 1:(length(names(mice_wt.fix))-1) ) {
  #d<-1
  print(paste0("calculating for day: ",names(mice_wt.fix)[d]))
  
  for (m in 1:length(row.names(mice_wt.fix))) {
    #m<-1
    print(paste0("calculating for mouse no: ",m,", ID = ",row.names(mice_wt.fix)[m]))
    
    this_tub_wt <- tub_wt[ mice_wt.fix$tub[m] , "mass_g" ]
    
    mice_wt.fix[m,d] <- mice_wt.fix[m,d] - this_tub_wt
    
    print(paste0("tub mass for ",mice_wt.fix$tub[m]," is ",tub_wt[ mice_wt.fix$tub[m] , "mass_g" ]))
    print(paste0("adjusted mouse weight is ",mice_wt.fix[m,d]))
    print(paste0("------------------------"))
  }
}

## remove last column $tub
mice_wt.fix <- mice_wt.fix[ , -dim(mice_wt.fix)[2] ]

## transform and prepare for ggplot
wts <- t(mice_wt.fix)

melt.wts <- melt(wts, id.vars=names(wts))
head(melt.wts)
#    Var1 Var2 value
# 1 43333 C1m1  12.3
# 2 43334 C1m1  12.8
# 3 43340 C1m1  16.0
# 4 43343 C1m1  16.9
# 5 43347 C1m1  19.0
# 6 43350 C1m1  18.4

names(melt.wts) # "Var1"  "Var2"  "value"

names(melt.wts) <- c( "Day", "mouseID", "weight_g")
melt.wts$Treatment <- substring( melt.wts$mouseID, 1, 1 )

melt.wts$Treatment <- factor(melt.wts$Treatment, levels = c("C","L","H"),
                             labels = c("Control","Low","High"))

head(melt.wts)
#     Day mouseID weight_g Treatment
# 1 43333    C1m1     12.3   Control
# 2 43334    C1m1     12.8   Control
# 3 43340    C1m1     16.0   Control
# 4 43343    C1m1     16.9   Control
# 5 43347    C1m1     19.0   Control
# 6 43350    C1m1     18.4   Control

## join to other mouse info: Sex, Litter, Age received, Rack position

dat_wts <- melt.wts

names(dat_wts)
names(mice_info)

dat_wts <- merge(x = dat_wts, y = mice_info[ ,c("mouseID","Sex","Litter","Cage Name","Date of Birth","Rack_position")], by = "mouseID", all.x=TRUE)

sel <- which(names(dat_wts)=="Cage Name")
names(dat_wts)[sel] <- "Cage_Name"

sel <- which(names(dat_wts)=="Date of Birth")
names(dat_wts)[sel] <- "Date_of_Birth"

names(dat_wts)
# [1] "mouseID"       "Day"           "weight_g"      "Treatment"     "Sex"           "Litter"        "Cage_Name"    
# [8] "Date_of_Birth" "Rack_position"    

## re-order
dat_wts <- dat_wts[ ,c( "Day", "mouseID", "Treatment", "Cage_Name", "Sex", "Litter", 
                        "Date_of_Birth", "Rack_position", "weight_g" 
   )]

dat_wts$Treatment <- factor(dat_wts$Treatment, levels = c("Control","Low","High"), ordered = TRUE) 
dat_wts$Rack_position <- factor(dat_wts$Rack_position, levels = c("low", "middle", "top"), ordered = TRUE)
dat_wts$Cage_Name <- factor(dat_wts$Cage_Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
dat_wts$Sex <- factor(dat_wts$Sex, levels = c("female", "male"), labels = c("Female","Male"))

str(dat_wts)
# 'data.frame':	864 obs. of  9 variables:
# $ Day          : int  43333 43334 43340 43343 43347 43350 43354 43356 43361 43364 ...
# $ mouseID      : Factor w/ 54 levels "C1m1","C1m2",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Treatment    : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Cage_Name    : Ord.factor w/ 18 levels "C1"<"C2"<"C3"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
# $ Litter       : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 2 2 2 2 2 2 2 2 2 ...
# $ Date_of_Birth: POSIXct, format: "2018-08-02" "2018-08-02" "2018-08-02" "2018-08-02" ...
# $ Rack_position: Ord.factor w/ 3 levels "low"<"middle"<..: 3 3 3 3 3 3 3 3 3 3 ...
# $ weight_g     : num  12.3 12.8 16 16.9 19 18.4 19.3 19.4 21 21.1 ...


p <- ggplot(dat_wts, aes(x=Day,y=weight_g,color=Treatment)) + geom_line(aes(group = mouseID))
p

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

## 18 shapes for cages
shapes <- c(1:18)
## 15 shapes for litters
#shapes <- c(1:15)

p <- ggplot(dat_wts, aes(x=Day,y=weight_g,color=Treatment, shape=Cage_Name)) +
  geom_line(aes(group = mouseID))
p

p$data$Cage_Name

pp <- p + theme_bw() +
  facet_grid(rows=vars(Sex),cols=vars(Treatment)) +
  geom_point(aes(shape = Cage_Name), data=dat_wts) +
  #geom_point(shape = p$data$`Cage Name`) +
  #geom_point(shape = p$data$Litter) +
  scale_colour_manual(values = cols, guide = FALSE) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  labs(x = "Day", y = "Weight (g)") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks.length=unit(-5, "pt"),
    axis.text.x = element_text(margin=margin(t = 10,r = 5,b = 2,l = 5,"pt")),
    axis.text.y = element_text(margin=margin(t = 5,r = 10,b = 10,l = 10,"pt")),
    #legend.position = "none",
    strip.background = element_rect(fill="white")
    ) # +
  #guides( shape = guide_legend(ncol = 2) )
pp



### add age to dataframe 'dat_wts'

head( dat_wts$Date_of_Birth)
# [1] "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC"
class( dat_wts$Date_of_Birth ) # "POSIXct" "POSIXt" 

head(  as.Date(dat_wts$Day, origin = "1899-12-30")  ) # "2018-08-21" "2018-08-22" "2018-08-28" "2018-08-31" "2018-09-04" "2018-09-07"
head(  dat_wts$Date_of_Birth  ) #  "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC" "2018-08-02 UTC"
head(  as.Date(dat_wts$Date_of_Birth)  ) # "2018-08-02" "2018-08-02" "2018-08-02" "2018-08-02" "2018-08-02" "2018-08-02"
head(  as.Date(dat_wts$Day, origin = "1899-12-30")  ) - head(  as.Date(dat_wts$Date_of_Birth)  )
# Time differences in days
# [1] 19 20 26 29 33 36

dat_wts$Age_days <- as.Date(dat_wts$Day, origin = "1899-12-30") - as.Date( dat_wts$Date_of_Birth )
dat_wts$Age_days <- as.numeric(dat_wts$Age_days)
dat_wts$Age_weeks <- dat_wts$Age_days/7

# also add experimental day. Noting that Treatments commenced on Day 7
dat_wts$Exp_days <- as.Date(dat_wts$Day, origin = "1899-12-30") - as.Date( "2018-08-20" )
dat_wts$Exp_days <- as.numeric(dat_wts$Exp_days)
dat_wts$Exp_days
unique(dat_wts$Exp_days) # [1]  1  2  8 11 15 18 22 24 29 32 36 39 43 46 50 53

names(dat_wts)

dat <- dat_wts[ , c("weight_g", "Exp_days", "Age_days", "mouseID", "Cage_Name", "Treatment", "Sex", "Litter", "Rack_position")]

str(dat)
# 'data.frame':	864 obs. of  9 variables:
# $ weight_g     : num  12.3 12.8 16 16.9 19 18.4 19.3 19.4 21 21.1 ...
# $ Exp_days     : num  1 2 8 11 15 18 22 24 29 32 ...
# $ Age_days     : num  19 20 26 29 33 36 40 42 47 50 ...
# $ mouseID      : Factor w/ 54 levels "C1m1","C1m2",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Cage_Name    : Ord.factor w/ 18 levels "C1"<"C2"<"C3"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Treatment    : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
# $ Litter       : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 2 2 2 2 2 2 2 2 2 ...
# $ Rack_position: Ord.factor w/ 3 levels "low"<"middle"<..: 3 3 3 3 3 3 3 3 3 3 ...

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

## 18 shapes for cages
shapes <- c(1:18)
## 15 shapes for litters
#shapes <- c(1:15)

p <- ggplot(dat_wts, aes(x=Age_days,y=weight_g,color=Treatment, shape=Cage_Name)) +
  geom_line(aes(group = mouseID))
p

p$data$Cage_Name

pp <- p + theme_bw() +
  facet_grid(rows=vars(Sex),cols=vars(Treatment)) +
  geom_point(aes(shape = Cage_Name), data=dat_wts) +
  #geom_point(shape = p$data$`Cage Name`) +
  #geom_point(shape = p$data$Litter) +
  scale_colour_manual(values = cols, guide = FALSE) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  labs(x = "Age (days)", y = "Weight (g)") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.ticks.length=unit(-5, "pt"),
    axis.text.x = element_text(margin=margin(t = 10,r = 5,b = 2,l = 5,"pt")),
    axis.text.y = element_text(margin=margin(t = 5,r = 10,b = 10,l = 10,"pt")),
    #legend.position = "none",
    strip.background = element_rect(fill="white")
  ) # +
#guides( shape = guide_legend(ncol = 2) )
pp

ggsave(plot=pp, filename = paste0("plots/","growth-curves-by-Treatment-and-Sex.tiff"), width = 16, height = 12, units = "cm", dpi = 600, compression = "lzw")

#------------------------





#### Exploratory model of mice growth curves
#------------------------

# ... from above

str(dat)
# 'data.frame':	864 obs. of  9 variables:
# $ weight_g     : num  12.3 12.8 16 16.9 19 18.4 19.3 19.4 21 21.1 ...
# $ Exp_days     : num  1 2 8 11 15 18 22 24 29 32 ...
# $ Age_days     : num  19 20 26 29 33 36 40 42 47 50 ...
# $ mouseID      : Factor w/ 54 levels "C1m1","C1m2",..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Cage_Name    : Ord.factor w/ 18 levels "C1"<"C2"<"C3"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Treatment    : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
# $ Litter       : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 2 2 2 2 2 2 2 2 2 ...
# $ Rack_position: Ord.factor w/ 3 levels "low"<"middle"<..: 3 3 3 3 3 3 3 3 3 3 ...

# library(caret);packageVersion("caret") # '6.0.80'
# library(randomForest);packageVersion("randomForest") # '4.6.14'
# library(boot);packageVersion("boot") # '1.3.20'


## for exploratory model, strip the pre-exposure/pre-treatment days out of data record

dat.mod <- dat
unique(dat.mod$Exp_days) # [1]  1  2  8 11 15 18 22 24 29 32 36 39 43 46 50 53
sel <- which(dat.mod$Exp_days %in% c(1,2)) # qty 108
dat.mod <- dat.mod[-sel, ]
dim(dat.mod) #  756   9

names(dat.mod)
# [1] "weight_g"      "Exp_days"      "Age_days"      "mouseID"       "Cage_Name"     "Treatment"     "Sex"          
# [8] "Litter"        "Rack_position"



## fit Random Forest regression model and check variable importance plot - to assess importance of treatment?
library(randomForest);packageVersion("randomForest") # '4.6.14'

## don't include factor mouseID - as modelling tool will not handle categorical
## predictor with such a large number of categories

dat1 <- dat.mod[ , c( "weight_g", "Age_days", "Cage_Name", "Treatment",
                      "Sex", "Litter", "Rack_position")]
names(dat1)
#[1]  "weight_g"  "Age_days"      "Cage_Name"     "Treatment"     "Sex"           "Litter"        "Rack_position"

model1 <- randomForest(weight_g ~ ., data = dat1, importance = TRUE)
model1
# Call:
#   randomForest(formula = weight_g ~ ., data = dat1, importance = TRUE) 
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 2
# 
# Mean of squared residuals: 0.5773832
# % Var explained: 93.29

varImpPlot(model1)  

varImpPlot(model1,type=2) # Treatment has only marginal importance, lowest compared to others

importanceOrder=order(-model1$importance)
names=rownames(model1$importance)[importanceOrder]
names<-na.omit(names)

par(mfrow=c(2, 3),mar=c(4,3,2,1)+0.1)
for (name in names) {
  partialPlot(model1, dat1, eval(name), main="", xlab=name)
}

par(par.default)

dat1$Treatment     # Levels: Control < Low < High
dat1$Rack_position # Levels: low < middle < top


grDevices::tiff(file=paste0("plots/","RF-PartialPlots-all-data-in-vFINAL.tif"), width = 16, height = 14, units = "cm", res = 600, compression = "lzw")
par(mfrow=c(2, 3),mar=c(4,3,2,1)+0.1)
for (name in names) {
  partialPlot(model1, dat1, eval(name), main="", xlab=name)
}
dev.off()
par(par.default)

# Mouse weights:
# - increase with age
# - are higher in males
# - vary randomly with litters
# - vary randomly with cages
# - decrease with increasing height on the rack (low < middle < top)
# - increase with Treatment biodiversity (Control < Low < High) 
# (Higher effect/important at top, towards marginal effect/importance for lower variables)

## detach library to avoid conflict with margin() calls in ggplot
detach("package:randomForest", unload=TRUE)

#------------------------





#### Behaviour testing
#------------------------

### Open Field

of.post <- read_excel(path= paste0(datadir,"/","Open-field-post-exposure.xlsx"),
                     sheet=1, range="A1:AE55", col_names = TRUE)

of.post <- as.data.frame(of.post)
str(of.post)

of.post$Treatment <- factor(of.post$Treatment, levels = c("Control", "Low" , "High"), ordered = TRUE )

# change char to factor
of.post[ , c( "Code", "Mouse", "ID", "Sex", "Litter" )] <- 
  lapply(of.post[ , c("Code", "Mouse", "ID", "Sex", "Litter" )], 
         FUN = factor)

names(mice_info)
# [1] "mouseID"                        "Animal no"                      "Sex"                           
# [4] "Litter"                         "Date of Birth"                  "Cage no"                       
# [7] "Cage Name"                      "Treatment"                      "Rack"                          
# [10] "Ear notch ID"                   "Ear notch pos"                  "Age_received_20_Aug"           
# [13] "Age_first_soil_exposure_27_Aug" "Rack_position"      


of.post <- merge(x = of.post, y = mice_info[ ,c( "mouseID", "Date of Birth",
                                                 "Cage no", "Cage Name", "Rack",
                                                 "Age_received_20_Aug", 
                                                 "Age_first_soil_exposure_27_Aug",
                                                 "Rack_position")], by.x = "ID", by.y = "mouseID", all.x = TRUE )

str(of.post)
# 'data.frame':	54 obs. of  38 variables:
# $ ID                            : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ Test_no                       : num  1 2 3 13 14 15 25 26 27 34 ...
# $ Treatment                     : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Code                          : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Mouse                         : Factor w/ 3 levels "m1","m2","m3": 1 2 3 1 2 3 1 2 3 1 ...
# $ Sex                           : Factor w/ 2 levels "female","male": 2 2 2 1 1 1 2 2 2 1 ...
# $ Litter                        : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 3 11 11 12 14 9 10 1 15 ...
# $ Stage                         : chr  "First stage" "First stage" "First stage" "First stage" ...
# $ Trial                         : num  1 1 1 1 1 1 1 1 1 1 ...
# $ Apparatus                     : chr  "Open field 1" "Open field 2" "Open field 3" "Open field 1" ...
# $ Duration                      : num  600 600 600 600 600 600 600 600 600 600 ...
# $ Distance                      : num  15.87 23.61 16.8 7.71 5.25 ...
# $ Dist_Corrected                : num  15.43 22.95 16.33 7.5 5.11 ...
# $ Corners_entries               : num  72 73 56 33 60 50 55 35 82 70 ...
# $ Corners_time                  : num  352 174 162 533 182 ...
# $ Corners_distance              : num  6.68 4.04 3.06 3.77 1.34 ...
# $ Corners_dist_Corrected        : num  6.5 3.93 2.98 3.66 1.3 ...
# $ Corners_head_time             : num  254.4 145.1 131.6 462.9 80.6 ...
# $ Corners_head_entries          : num  103 113 89 81 57 74 65 72 89 107 ...
# $ Walls_entries                 : num  74 148 97 38 118 86 82 72 106 74 ...
# $ Walls_time                    : num  235 296.8 366.5 56.4 255.8 ...
# $ Walls_distance                : num  8.91 12.31 10.26 3.5 2.65 ...
# $ Walls_dist_Corrected          : num  8.66 11.97 9.97 3.41 2.58 ...
# $ Walls_head_time               : num  329 283 344 119 231 ...
# $ Walls_head_entries            : num  111 210 162 97 163 111 171 164 134 135 ...
# $ Centre_entries                : num  4 79 43 6 60 36 26 37 23 4 ...
# $ Centre_time                   : num  13.2 128.8 71.6 10.4 162.5 ...
# $ Centre_distance               : num  0.484 7.744 3.599 0.441 1.966 ...
# $ Centre_dist_Corrected         : num  0.471 7.529 3.499 0.429 1.911 ...
# $ Centre_head_time              : num  16.7 157.3 123.9 17.9 288.4 ...
# $ Centre_head_entries           : num  9 107 86 19 107 43 113 96 50 29 ...
# $ Date of Birth                 : POSIXct, format: "2018-08-02" "2018-08-04" "2018-08-02" ...
# $ Cage no                       : num  1 1 1 5 5 5 9 9 9 12 ...
# $ Cage Name                     : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Rack                          : num  1 1 1 2 2 2 3 3 3 4 ...
# $ Age_received_20_Aug           : num  2.6 2.3 2.6 2.6 2.6 3 4.1 4.1 4.1 3.7 ...
# $ Age_first_soil_exposure_27_Aug: num  3.6 3.3 3.6 3.6 3.6 4 5.1 5.1 5.1 4.7 ...
# $ Rack_position                 : Factor w/ 3 levels "low","middle",..: 3 3 3 2 2 2 1 1 1 1 ...


## Note that 'Corrected' distances are (35/36)*distances reported by the ANY-maze software.
## This is due to setting the electronic template as measuring 36 cm across to facilitate mapping of 4 x 4 
## equal zones in the Open Field ANY-maze template. However, the apparatus actually measured 35 cm across.
## Therefore software-reported distances require correction by a factor of x (35/36).


p <- ggplot(data = of.post, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot() +
  facet_grid( ~ Sex) +
  theme_bw()
p

getwd() #  "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling"

ggsave(plot=p, filename = paste0("plots/","boxplots-Open-field--Centre-time--by-sex.tiff"), width = 12, height = 6, units = "cm", dpi = 600, compression = "lzw")


## isolate data for females
names(of.post)
# [1] "ID"                             "Test_no"                        "Treatment"                     
# [4] "Code"                           "Mouse"                          "Sex"                           
# [7] "Litter"                         "Stage"                          "Trial"                         
# [10] "Apparatus"                      "Duration"                       "Distance"                      
# [13] "Dist_Corrected"                 "Corners_entries"                "Corners_time"                  
# [16] "Corners_distance"               "Corners_dist_Corrected"         "Corners_head_time"             
# [19] "Corners_head_entries"           "Walls_entries"                  "Walls_time"                    
# [22] "Walls_distance"                 "Walls_dist_Corrected"           "Walls_head_time"               
# [25] "Walls_head_entries"             "Centre_entries"                 "Centre_time"                   
# [28] "Centre_distance"                "Centre_dist_Corrected"          "Centre_head_time"              
# [31] "Centre_head_entries"            "Date of Birth"                  "Cage no"                       
# [34] "Cage Name"                      "Rack"                           "Age_received_20_Aug"           
# [37] "Age_first_soil_exposure_27_Aug" "Rack_position"

sel <- which(of.post$Sex == "female")

of_ct_fem <- of.post[sel, c("ID", "Treatment", "Centre_time")]

## Kruskal-Wallis multiple comparison test 
## with post-hoc Dunn test

str(of_ct_fem)
# 'data.frame':	27 obs. of  3 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  10.4 162.5 88.7 12.1 3.3 ...

of_ct_fem$group <- as.character(of_ct_fem$Treatment)

str(of_ct_fem)
# 'data.frame':	27 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  10.4 162.5 88.7 12.1 3.3 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...

p <- ggplot(data = of_ct_fem, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot() +
  theme_bw()
p



## Exclude outliers from significance testing - using default outlier classification
## outlier is defined as values outside 1.5 times the interquartile range above the upper quartile or below the lower quartile.

bp <- boxplot(of_ct_fem$Centre_time ~ of_ct_fem$group)
bp$out #  162.5  88.7

sel <- which(of_ct_fem$Centre_time %in% bp$out)
of_ct_fem$Centre_time[sel] # 162.5  88.7
of_ct_fem[sel, ]
#     ID Treatment Centre_time   group
# 5 C2m2   Control       162.5 Control
# 6 C2m3   Control        88.7 Control

# exclude outliers
of_ct_fem__ex_out <- of_ct_fem[-sel, ]

str(of_ct_fem__ex_out)
# 'data.frame':	25 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 10 11 12 13 14 15 25 26 27 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 3 3 3 ...
# $ Centre_time: num  10.4 12.1 3.3 34.8 10.6 27.1 25.5 33.5 28.5 61.4 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...

table(of_ct_fem__ex_out$Treatment)
# Control     Low    High 
#       7       9       9

# Kruskal-Wallis test
kt <- kruskal.test( Centre_time ~ factor(group), of_ct_fem__ex_out) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  Centre_time by factor(group)
# Kruskal-Wallis chi-squared = 8.0832, df = 2, p-value = 0.01757


## Dunn Test uses factor vector or non-numeric vector that can be coerced to a factor vector

pt <- dunnTest( Centre_time ~ group, data= of_ct_fem__ex_out,
                method = "bonferroni" )
# Warning message:
# group was coerced to a factor. 

pt
# Dunn (1964) Kruskal-Wallis multiple comparison
# p-values adjusted with the Bonferroni method.
# 
#       Comparison          Z     P.unadj      P.adj
# 1 Control - High -2.7303879 0.006325984 0.01897795
# 2  Control - Low -2.1911577 0.028440381 0.08532114
# 3     High - Low  0.5764614 0.564303384 1.00000000

pt$dtres
pt <- pt$res
pt

cldList(comparison = pt$Comparison,
        p.value    = pt$P.adj,
        threshold  = 0.05)

#     Group Letter MonoLetter
# 1 Control      a         a 
# 2    High      b          b
# 3     Low     ab         ab


## Include significance letters in plot

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

set.seed(4321)
p <- ggplot(data = of_ct_fem, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot() +
  #geom_point(aes(colour = Treatment)) +
  geom_jitter(data=of_ct_fem__ex_out, aes(colour = Treatment), width=0.3, height=0) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Time in centre (s)", x=NULL) +
  annotate(geom="text", x= 1.1, y= 50, label = "a", hjust=0, vjust=0, size = 3) +
  annotate(geom="text", x= 2.1, y= 100, label = "ab", hjust=0, vjust=0, size = 3) +
  annotate(geom="text", x= 3.1, y= 70, label = "b", hjust=0, vjust=0, size = 3) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 6,r = 5,b = 2,l = 5,"pt"), size = rel(0.85)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.75)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8))
    
  )

p

ggsave(plot=p, filename = paste0("plots/","Open-Field-Centre-Time-Female-Only-with-signif-vFINAL.tiff"), width = 5.5, height = 4, units = "cm", dpi = 600, compression = "lzw")



# # # # # # # # #

## isolate males and subset data
names(of.post)

sel <- which(of.post$Sex == "male")

of_ct_male <- of.post[sel, c("ID", "Treatment", "Centre_time")]

of_ct_male$group <- as.character(of_ct_male$Treatment)

str(of_ct_male)
# 'data.frame':	27 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 7 8 9 16 17 18 19 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  13.2 128.8 71.6 46.9 208.3 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...


## Exclude boxplot outliers from Kruskal Wallis test

bp <- boxplot(of_ct_male$Centre_time ~ of_ct_male$group)
bp$out #   208.3 289.9 279.0

sel <- which(of_ct_male$Centre_time %in% bp$out)
of_ct_male$Centre_time[sel] # 208.3 289.9 279.0
of_ct_male[sel, ]
#      ID Treatment Centre_time   group
# 8  C3m2   Control       208.3 Control
# 23 H2m2      High       289.9    High
# 33 H5m3      High       279.0    High

# exclude outliers
of_ct_male__ex_out <- of_ct_male[-sel, ]

str(of_ct_male__ex_out)
# 'data.frame':	24 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 7 9 16 17 18 19 20 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 3 3 ...
# $ Centre_time: num  13.2 128.8 71.6 46.9 68.2 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...


# Kruskal-Wallis test
kt <- kruskal.test( Centre_time ~ factor(group), of_ct_male__ex_out) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  Centre_time by factor(group)
# Kruskal-Wallis chi-squared = 2.4427, df = 2, p-value = 0.2948

## i.e. no significant difference



# # # # # # # # #
# # # # # # # # #

## Plot Open Field - Time in Centre - Female & Male


## plot using facet_grid

# don't colour outliers
bp <- boxplot(of.post$Centre_time ~ of.post$Treatment + of.post$Sex)
bp$out #   162.5  88.7 208.3 289.9 279.0

sel <- which(of.post$Centre_time %in% bp$out)
of.post$Centre_time[sel] # 162.5  88.7 208.3 289.9 279.0

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

of.post.COPY <- of.post
of.post.COPY$Sex <- factor(of.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

set.seed(1234)
p <- ggplot(data=of.post.COPY, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot(outlier.size=1) + 
  facet_grid(~Sex ) +
  geom_jitter(data = of.post.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Time in centre (s)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p


# add significance letters
grid.text(label = "a", x = unit(0.25, "npc") , y = unit(0.35,"npc"), gp=gpar(fontsize=10.5) )
grid.text(label = "ab", x = unit(0.39, "npc") , y = unit(0.45,"npc"), gp=gpar(fontsize=10.5) )
grid.text(label = "b", x = unit(0.51, "npc") , y = unit(0.40,"npc"), gp=gpar(fontsize=10.5) )

dev.print(tiff, file = paste0("plots/","Open-Field-Time-in-Centre-Results-Female-and-Male-vFINAL.tiff"), width = 8.7, height = 6, units = "cm", res=600, compression="lzw")



# # # # # # # # #
# # # # # # # # #
# # # # # # # # #

### Compare pre- to post- Open Fields results

of.pre <- read_excel(path= paste0(datadir,"/","Open-field-pre-exposure.xlsx"),
                      sheet=1, range="A1:AE55", col_names = TRUE)

of.pre <- as.data.frame(of.pre)
str(of.pre)

of.pre$Treatment <- factor(of.pre$Treatment, levels = c("Control", "Low" , "High"), ordered = TRUE )

# change char to factor
of.pre[ , c( "Code", "Mouse", "ID", "Sex", "Litter" )] <- 
  lapply(of.pre[ , c("Code", "Mouse", "ID", "Sex", "Litter" )], 
         FUN = factor)

names(mice_info)
# [1] "mouseID"                        "Animal no"                      "Sex"                           
# [4] "Litter"                         "Date of Birth"                  "Cage no"                       
# [7] "Cage Name"                      "Treatment"                      "Rack"                          
# [10] "Ear notch ID"                   "Ear notch pos"                  "Age_received_20_Aug"           
# [13] "Age_first_soil_exposure_27_Aug" "Rack_position"    


of.pre <- merge(x = of.pre, y = mice_info[ ,c( "mouseID", "Date of Birth",
                                                 "Cage no", "Cage Name", "Rack",
                                                 "Age_received_20_Aug", 
                                                 "Age_first_soil_exposure_27_Aug",
                                                 "Rack_position")], by.x = "ID", by.y = "mouseID", all.x = TRUE )

str(of.pre)
# 'data.frame':	54 obs. of  38 variables:
# $ ID                            : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ Test_no                       : num  1 2 3 13 14 15 25 26 27 34 ...
# $ Treatment                     : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Code                          : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Mouse                         : Factor w/ 3 levels "m1","m2","m3": 1 2 3 1 2 3 1 2 3 1 ...
# $ Sex                           : Factor w/ 2 levels "female","male": 2 2 2 1 1 1 2 2 2 1 ...
# $ Litter                        : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 3 11 11 12 14 9 10 1 15 ...
# $ Stage                         : chr  "First stage" "First stage" "First stage" "First stage" ...
# $ Trial                         : num  1 1 1 1 1 1 1 1 1 1 ...
# $ Apparatus                     : chr  "Open field 1" "Open field 3" "Open field 2" "Open field 1" ...
# $ Duration                      : num  600 600 600 600 600 600 600 600 600 600 ...
# $ Distance                      : num  29.02 22.24 8.25 21.07 14.46 ...
# $ Dist_Corrected                : num  28.21 21.63 8.02 20.49 14.06 ...
# $ Corners_entries               : num  88 53 33 66 63 66 93 88 62 80 ...
# $ Corners_time                  : num  192 161 487 416 387 ...
# $ Corners_distance              : num  6.75 4.36 3.81 8.11 6.36 ...
# $ Corners_dist_Corrected        : num  6.56 4.24 3.7 7.89 6.18 ...
# $ Corners_head time             : num  174 154 424 378 309 ...
# $ Corners_head entries          : num  104 58 63 101 81 74 103 117 75 91 ...
# $ Walls_entries                 : num  138 97 32 79 66 117 149 135 132 125 ...
# $ Walls_time                    : num  339 342 113 159 206 ...
# $ Walls_distance                : num  16.19 10.96 4.5 10.94 7.82 ...
# $ Walls_dist_Corrected          : num  15.74 10.65 4.37 10.64 7.6 ...
# $ Walls_head time               : num  339 323 168 193 259 ...
# $ Walls_head entries            : num  173 96 57 118 85 135 208 213 149 163 ...
# $ Centre_entries                : num  54 49 0 14 4 55 62 53 76 52 ...
# $ Centre_time                   : num  68.9 96.3 0 24.8 6.8 ...
# $ Centre_distance               : num  6.3 7.088 0 2.075 0.283 ...
# $ Centre_dist_Corrected         : num  6.125 6.891 0 2.017 0.275 ...
# $ Centre_head time              : num  86.8 122.7 0 29.1 8.9 ...
# $ Centre_head entries           : num  74 49 0 20 11 76 117 107 90 82 ...
# $ Date of Birth                 : POSIXct, format: "2018-08-02" "2018-08-04" "2018-08-02" "2018-08-02" ...
# $ Cage no                       : num  1 1 1 5 5 5 9 9 9 12 ...
# $ Cage Name                     : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Rack                          : num  1 1 1 2 2 2 3 3 3 4 ...
# $ Age_received_20_Aug           : num  2.6 2.3 2.6 2.6 2.6 3 4.1 4.1 4.1 3.7 ...
# $ Age_first_soil_exposure_27_Aug: num  3.6 3.3 3.6 3.6 3.6 4 5.1 5.1 5.1 4.7 ...
# $ Rack_position                 : Factor w/ 3 levels "low","middle",..: 3 3 3 2 2 2 1 1 1 1 ...

# Note that 'Corrected' distances are (35/36)*distances reported by the ANY-maze software.
# This is due to setting the electronic template as measuring 36 cm across to facilitate mapping of 4 x 4 
# equal zones in the Open Field ANY-maze template. However, the apparatus actually measured 35 cm across.
# Therefore software-reported distances require correction by a factor of x (35/36).

p <- ggplot(data = of.pre, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot() +
  facet_grid( ~ Sex) +
  theme_bw()
p


## isolate data for females and males respectively - to exclude outliers and test for difference
names(of.pre)
# [1] "ID"                             "Test_no"                        "Treatment"                     
# [4] "Code"                           "Mouse"                          "Sex"                           
# [7] "Litter"                         "Stage"                          "Trial"                         
# [10] "Apparatus"                      "Duration"                       "Distance"                      
# [13] "Dist_Corrected"                 "Corners_entries"                "Corners_time"                  
# [16] "Corners_distance"               "Corners_dist_Corrected"         "Corners_head time"             
# [19] "Corners_head entries"           "Walls_entries"                  "Walls_time"                    
# [22] "Walls_distance"                 "Walls_dist_Corrected"           "Walls_head time"               
# [25] "Walls_head entries"             "Centre_entries"                 "Centre_time"                   
# [28] "Centre_distance"                "Centre_dist_Corrected"          "Centre_head time"              
# [31] "Centre_head entries"            "Date of Birth"                  "Cage no"                       
# [34] "Cage Name"                      "Rack"                           "Age_received_20_Aug"           
# [37] "Age_first_soil_exposure_27_Aug" "Rack_position" 

sel <- which(of.pre$Sex == "female")

of_ct_fem_pre <- of.pre[sel, c("ID", "Treatment", "Centre_time")]

## Kruskal-Wallis multiple comparison test 
## with post-hoc Dunn test

str(of_ct_fem_pre)
# 'data.frame':	27 obs. of  3 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  24.8 6.8 87.9 73.5 2.8 ...

of_ct_fem_pre$group <- as.character(of_ct_fem_pre$Treatment)

str(of_ct_fem_pre)
# 'data.frame':	27 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  24.8 6.8 87.9 73.5 2.8 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...

p <- ggplot(data = of_ct_fem_pre, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot() +
  theme_bw()
p



## Exclude outliers from significance testing - using default outlier classification
## outlier is defined as values outside 1.5 times the interquartile range above the upper quartile or below the lower quartile.

bp <- boxplot(of_ct_fem_pre$Centre_time ~ of_ct_fem_pre$group)
bp$out #  110

sel <- which(of_ct_fem_pre$Centre_time %in% bp$out)
of_ct_fem_pre$Centre_time[sel] # 110
of_ct_fem_pre[sel, ]
#      ID Treatment Centre_time group
# 39 L1m3       Low         110   Low

# exclude outliers
of_ct_fem_pre__ex_out <- of_ct_fem_pre[-sel, ]

str(of_ct_fem_pre__ex_out)
# 'data.frame':	26 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  24.8 6.8 87.9 73.5 2.8 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...

table(of_ct_fem_pre__ex_out$Treatment)
# Control     Low    High 
#       9       8       9

# Kruskal-Wallis test
kt <- kruskal.test( Centre_time ~ factor(group), of_ct_fem_pre__ex_out) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  Centre_time by factor(group)
# Kruskal-Wallis chi-squared = 0.52137, df = 2, p-value = 0.7705



# # # # # # # # #

## isolate males and subset data
names(of.pre)

sel <- which(of.pre$Sex == "male")

of_ct_male_pre <- of.pre[sel, c("ID", "Treatment", "Centre_time")]

of_ct_male_pre$group <- as.character(of_ct_male_pre$Treatment)

str(of_ct_male_pre)
# 'data.frame':	27 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 7 8 9 16 17 18 19 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ Centre_time: num  68.9 96.3 0 139.1 96.2 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...


## Exclude boxplot outliers from Kruskal Wallis test

bp <- boxplot(of_ct_male_pre$Centre_time ~ of_ct_male_pre$group)
bp$out #   0.0 139.1 183.4 181.0

sel <- which(of_ct_male_pre$Centre_time %in% bp$out)
of_ct_male_pre$Centre_time[sel] #  0.0 139.1 183.4 181.0
of_ct_male_pre[sel, ]
#      ID Treatment Centre_time   group
# 3  C1m3   Control         0.0 Control
# 7  C3m1   Control       139.1 Control
# 9  C3m3   Control       183.4 Control
# 40 L2m1       Low       181.0     Low

# exclude outliers
of_ct_male_pre__ex_out <- of_ct_male_pre[-sel, ]

str(of_ct_male_pre__ex_out)
# 'data.frame':	23 obs. of  4 variables:
# $ ID         : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 8 16 17 18 19 20 21 22 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 3 3 3 3 ...
# $ Centre_time: num  68.9 96.3 96.2 70 92.9 45.8 34.7 4 71.1 3 ...
# $ group      : chr  "Control" "Control" "Control" "Control" ...


table(of_ct_male_pre__ex_out$Treatment)
# Control     Low    High 
#       6       8       9


# Kruskal-Wallis test
kt <- kruskal.test( Centre_time ~ factor(group), of_ct_male_pre__ex_out) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  Centre_time by factor(group)
# Kruskal-Wallis chi-squared = 2.4834, df = 2, p-value = 0.2889

## i.e. no significant difference



# # # # # # # # #
# # # # # # # # #

## Plot Open Field - BASELINE - Time in Centre - Female & Male


## plot using facet_grid

# don't colour outliers
bp <- boxplot(of.pre$Centre_time ~ of.pre$Treatment + of.pre$Sex)
bp$out #   110.0   0.0 139.1 183.4 181.0

sel <- which(of.pre$Centre_time %in% bp$out)
of.pre$Centre_time[sel] # 0.0 139.1 183.4 110.0 181.0

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

of.pre.COPY <- of.pre
of.pre.COPY$Sex <- factor(of.pre.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

set.seed(4321)
p <- ggplot(data=of.pre.COPY, aes(x = Treatment, y = Centre_time)) +
  geom_boxplot(outlier.size=1) + 
  facet_grid(~Sex ) +
  geom_jitter(data = of.pre.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Time in centre (s)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p


dev.print(tiff, file = paste0("plots/","Open-Field-BASELINE-Time-in-Centre-Results-Female-and-Male-vFINAL.tiff"), width = 8.7, height = 6, units = "cm", res=600, compression="lzw")





# # # # # # # # #
# # # # # # # # #

### Activity levels



## Distance travelled


## plot using facet_grid

# don't colour outliers
bp <- boxplot(of.post$Dist_Corrected ~ of.post$Treatment + of.post$Sex)
bp$out #   6.850278

sel <- which(of.post$Dist_Corrected %in% bp$out)
of.post$Dist_Corrected[sel] # 6.850278


# Kruskal-Wallis test
kt <- kruskal.test( Dist_Corrected ~ factor(Treatment), of.post[-sel, ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Dist_Corrected by factor(Treatment)
# Kruskal-Wallis chi-squared = 0.70209, df = 2, p-value = 0.704

# Males & exclude outlier
kt <- kruskal.test( Dist_Corrected ~ factor(Treatment), of.post[-which(of.post$Dist_Corrected %in% bp$out | of.post$Sex == "female"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Dist_Corrected by factor(Treatment)
# Kruskal-Wallis chi-squared = 4.0465, df = 2, p-value = 0.1322

# Females & exclude outlier
kt <- kruskal.test( Dist_Corrected ~ factor(Treatment), of.post[-which(of.post$Dist_Corrected %in% bp$out | of.post$Sex == "male"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Dist_Corrected by factor(Treatment)
# Kruskal-Wallis chi-squared = 1.4215, df = 2, p-value = 0.4913


cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

of.post.COPY <- of.post
of.post.COPY$Sex <- factor(of.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

bp <- boxplot(of.post$Dist_Corrected ~ of.post$Treatment + of.post$Sex)
sel <- which(of.post$Dist_Corrected %in% bp$out)

set.seed(4321)
p <- ggplot(data=of.post.COPY, aes(x = Treatment, y = Dist_Corrected)) +
  geom_boxplot( outlier.size=1) + 
  facet_grid(~Sex ) +
  geom_jitter(data = of.post.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Distance travelled (m)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p

grid.text(label = "A", x = unit(0.02, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Open-Field-Distance-Travelled-Female-and-Male-vFINAL.tiff"), width = 9, height = 6, units = "cm", res=600, compression="lzw")




# # # # # # # # #

## No of entries ?


## plot using facet_grid

# don't colour outliers
bp <- boxplot(of.post$Centre_entries ~ of.post$Treatment + of.post$Sex)
bp$out #   60 54 79 38

p <- ggplot(data=of.post, aes(x = Treatment, y = Centre_entries)) +
  geom_boxplot() +
  facet_grid(~Sex)
p


sel1 <- which(of.post$Sex == "female" & of.post$Treatment %in% c("Control","High") & of.post$Centre_entries %in% bp$out)
of.post$Centre_entries[sel1] #  60 54
sel2 <- which(of.post$Sex == "male" & of.post$Treatment %in% c("Control","Low") & of.post$Centre_entries %in% bp$out)
of.post$Centre_entries[sel2] # 79 38


# Kruskal-Wallis test
kt <- kruskal.test( Centre_entries ~ factor(Treatment), of.post[-c(sel1,sel2), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 1.2144, df = 2, p-value = 0.5449

# Females & exclude outlier
sel <- which(of.post$Sex == "female")
subsel.out <- which(of.post$Treatment[sel] %in% c("Control","High") & of.post$Centre_entries[sel] %in% bp$out)
of.post[sel[-subsel.out], ]
kt <- kruskal.test( Centre_entries ~ factor(Treatment), of.post[sel[-subsel.out], ] ) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 1.9772, df = 2, p-value = 0.3721

# Males & exclude outlier
sel <- which(of.post$Sex == "male")
subsel.out <- which(of.post$Treatment[sel] %in% c("Control","Low") & of.post$Centre_entries[sel] %in% bp$out)
of.post[sel[-subsel.out], ]
kt <- kruskal.test( Centre_entries ~ factor(Treatment), of.post[sel[-subsel.out], ] ) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 5.0461, df = 2, p-value = 0.08021


cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

of.post.COPY <- of.post
of.post.COPY$Sex <- factor(of.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

bp <- boxplot(of.post$Centre_entries ~ of.post$Treatment + of.post$Sex)
sel1 <- which(of.post$Sex == "female" & of.post$Treatment %in% c("Control","High") & of.post$Centre_entries %in% bp$out)
sel2 <- which(of.post$Sex == "male" & of.post$Treatment %in% c("Control","Low") & of.post$Centre_entries %in% bp$out)

set.seed(4321)
p <- ggplot(data=of.post.COPY, aes(x = Treatment, y = Centre_entries)) +
  geom_boxplot( outlier.size=1) + 
  facet_grid(~Sex ) +
  geom_jitter(data = of.post.COPY[-c(sel1,sel2), ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Centre entries (count)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p

grid.text(label = "B", x = unit(0.02, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Open-Field-Centre-Entries-Female-and-Male-vFINAL.tiff"), width = 9, height = 6, units = "cm", res=600, compression="lzw")



# # # # # # # # #
# # # # # # # # #
# # # # # # # # #
# # # # # # # # #





### Elevated Plus

ep.post <- read_excel(path= paste0(datadir,"/","Elev-plus-post-exposure.xlsx"),
                      sheet=1, range="A1:AA55", col_names = TRUE)

ep.post <- as.data.frame(ep.post)
str(ep.post)

ep.post$Treatment <- factor(ep.post$Treatment, levels = c("Control", "Low" , "High"), ordered = TRUE )


# change char to factor
ep.post[ , c( "Code", "Mouse", "ID", "Sex", "Litter" )] <- 
  lapply(ep.post[ , c("Code", "Mouse", "ID", "Sex", "Litter" )], 
         FUN = factor)

str(ep.post)
# 'data.frame':	54 obs. of  27 variables:
# $ Test_no                : num  1 2 3 4 5 6 7 8 9 10 ...
# $ Treatment              : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 2 2 2 3 3 3 3 ...
# $ Code                   : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 13 13 13 7 7 7 8 ...
# $ Mouse                  : Factor w/ 3 levels "m1","m2","m3": 1 2 3 1 2 3 1 2 3 1 ...
# $ ID                     : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 37 38 39 19 20 21 22 ...
# $ Sex                    : Factor w/ 2 levels "female","male": 2 2 2 1 1 1 2 2 2 2 ...
# $ Litter                 : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 3 11 2 3 11 11 12 13 12 ...
# $ Stage                  : chr  "First stage" "First stage" "First stage" "First stage" ...
# $ Trial                  : num  1 1 1 1 1 1 1 1 1 1 ...
# $ Apparatus              : chr  "plus maze 1" "plus maze 2" "plus maze 3" "plus maze 1" ...
# $ Duration               : num  600 600 600 600 600 600 600 600 600 600 ...
# $ Distance               : num  12.71 10.05 11.36 9.19 13.67 ...
# $ Centre_entries         : num  39 71 119 30 67 63 60 83 70 78 ...
# $ Centre_time            : num  29.6 240.9 142 45.9 93.4 ...
# $ Centre_distance        : num  1.275 2.455 3.541 0.921 2.077 ...
# $ Centre_head_time       : num  101.5 71.5 149.7 51.1 92.8 ...
# $ Centre_head_entries    : num  97 65 104 94 68 61 99 76 61 61 ...
# $ Open_arm_entries       : num  15 24 75 12 17 30 29 6 32 54 ...
# $ Open_arm_time          : num  19.9 62.8 145 39.3 17.8 ...
# $ Open_arm_distance      : num  0.138 0.27 1.94 0.263 0.32 0.806 0.727 0.107 0.794 0.725 ...
# $ Open_arm_head_time     : num  77.6 161.1 221.9 126.5 54.3 ...
# $ Open_arm_head_entries  : num  66 57 58 74 28 45 87 72 48 62 ...
# $ Closed_arm_entries     : num  37 47 82 25 54 49 49 77 56 28 ...
# $ Closed_arm_time        : num  550 296 313 515 489 ...
# $ Closed_arm_distance    : num  11.47 7.45 6.36 8.12 11.53 ...
# $ Closed_arm_head_time   : num  421 368 228 422 453 ...
# $ Closed_arm_head_entries: num  80 56 77 66 58 60 76 43 50 81 ...

ep.post$non_Open_arm_time <- ep.post$Closed_arm_time + ep.post$Centre_time

ep.post$time_centre_and_open_arm <- ep.post$Centre_time + ep.post$Open_arm_time



p <- ggplot(data = ep.post, aes(x = Treatment, y = Open_arm_time)) +
  geom_boxplot() +
  facet_grid(. ~ Sex)
p

p <- ggplot(data = ep.post, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot() +
  facet_grid(. ~ Sex)
p

# # # # # # ## # # # 

## isolate sexes and exclude outlier data

names(ep.post)
# [1] "Test_no"                  "Treatment"                "Code"                     "Mouse"                   
# [5] "ID"                       "Sex"                      "Litter"                   "Stage"                   
# [9] "Trial"                    "Apparatus"                "Duration"                 "Distance"                
# [13] "Centre_entries"           "Centre_time"              "Centre_distance"          "Centre_head_time"        
# [17] "Centre_head_entries"      "Open_arm_entries"         "Open_arm_time"            "Open_arm_distance"       
# [21] "Open_arm_head_time"       "Open_arm_head_entries"    "Closed_arm_entries"       "Closed_arm_time"         
# [25] "Closed_arm_distance"      "Closed_arm_head_time"     "Closed_arm_head_entries"  "non_Open_arm_time"       
# [29] "time_centre_and_open_arm"

sel <- which(ep.post$Sex == "female")

ep_ct_fem <- ep.post[sel, c("ID", "Treatment", "time_centre_and_open_arm")]

ep_ct_fem$group <- as.character(ep_ct_fem$Treatment)

ep_ct_fem$ID <- as.character(ep_ct_fem$ID)
ep_ct_fem$ID <- factor(ep_ct_fem$ID)

str(ep_ct_fem)
# 'data.frame':	27 obs. of  4 variables:
# $ ID                      : Factor w/ 27 levels "C2m1","C2m2",..: 19 20 21 1 2 3 22 23 24 10 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 2 2 2 1 1 1 2 2 2 3 ...
# $ time_centre_and_open_arm: num  85.2 111.2 120.6 50.5 589.5 ...
# $ group                   : chr  "Low" "Low" "Low" "Control" ...

p <- ggplot(data = ep_ct_fem, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot() +
  theme_bw()
p


## Exclude outliers for significance test - using default outlier classification
## outlier is defined as values outside 1.5 times the interquartile range above the upper quartile or below the lower quartile.


bp <- boxplot(ep_ct_fem$time_centre_and_open_arm ~ ep_ct_fem$group)
bp$out #  589.5  41.8 328.3

sel <- which(ep_ct_fem$time_centre_and_open_arm %in% bp$out)
ep_ct_fem$time_centre_and_open_arm[sel] # 589.5  41.8 328.3
ep_ct_fem[sel, ]
#      ID Treatment time_centre_and_open_arm   group
# 14 C2m2   Control                    589.5 Control
# 19 L3m1       Low                     41.8     Low
# 20 L3m2       Low                    328.3     Low

# exclude outliers
ep_ct_fem__ex_out <- ep_ct_fem[-sel, ]

str(ep_ct_fem__ex_out)
# 'data.frame':	24 obs. of  4 variables:
# $ ID                      : Factor w/ 27 levels "C2m1","C2m2",..: 19 20 21 1 3 24 10 11 12 13 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 2 2 2 1 1 2 3 3 3 3 ...
# $ time_centre_and_open_arm: num  85.2 111.2 120.6 50.5 91.5 ...
# $ group                   : chr  "Low" "Low" "Low" "Control" ...


# Kruskal-Wallis test
kt <- kruskal.test( time_centre_and_open_arm ~ factor(group), ep_ct_fem__ex_out) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  time_centre_and_open_arm by factor(group)
# Kruskal-Wallis chi-squared = 1.9346, df = 2, p-value = 0.3801

## i.e. No significant difference!





# # # # # # # # #

## Note there are no outliers in males for Elevated plus maze for variable: "time_centre_and_open_arm"
names(ep.post)

sel <- which(ep.post$Sex == "male")

ep_ct_male <- ep.post[sel, c("ID", "Treatment", "time_centre_and_open_arm")]

ep_ct_male$group <- as.character(ep_ct_male$Treatment)

ep_ct_male$ID <- as.character(ep_ct_male$ID)
ep_ct_male$ID <- factor(ep_ct_male$ID)

str(ep_ct_male)
# 'data.frame':	27 obs. of  4 variables:
# $ ID                      : Factor w/ 27 levels "C1m1","C1m2",..: 1 2 3 10 11 12 13 14 15 19 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 3 3 3 3 3 3 2 ...
# $ time_centre_and_open_arm: num  49.5 303.7 287 152 449.7 ...
# $ group                   : chr  "Control" "Control" "Control" "High" ...


## Exclude boxplot outliers from Kruskal Wallis test

bp <- boxplot(ep_ct_male$time_centre_and_open_arm ~ ep_ct_male$group)
bp$out #   numeric(0)

# i.e. no outliers to exclude


# Kruskal-Wallis test
kt <- kruskal.test( time_centre_and_open_arm ~ factor(group), ep_ct_male) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  time_centre_and_open_arm by factor(group)
# Kruskal-Wallis chi-squared = 0.46914, df = 2, p-value = 0.7909

## i.e. No significant differences



# # # # # # # # #

## plot Elev Plus data with both sexes

## plot using facet_grid

# don't colour outliers
bp <- boxplot(ep.post$time_centre_and_open_arm ~ ep.post$Treatment + ep.post$Sex)
bp$out #   589.5  41.8 328.3

sel <- which(ep.post$time_centre_and_open_arm %in% bp$out)
ep.post$time_centre_and_open_arm[sel] # 589.5  41.8 328.3

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

ep.post.COPY <- ep.post
ep.post.COPY$Sex <- factor(ep.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

set.seed(4321)
p <- ggplot(data=ep.post.COPY, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot(outlier.size = 1) + 
  facet_grid(~Sex ) +
  geom_jitter(data = ep.post.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Time in centre or open arms (s)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p


ggsave(plot=p, filename = paste0("plots/","Elev-Plus-Time-in-Centre-or-Open-Arm-Results-Female-and-Male-vFINAL.tiff"), width = 9, height = 6, units = "cm", dpi = 600, compression = "lzw")





### Compare pre- to post- Elev Plus results

ep.pre <- read_excel(path= paste0(datadir,"/","Elev-plus-pre-exposure.xlsx"),
                     sheet=1, range="A1:AA55", col_names = TRUE)

ep.pre <- as.data.frame(ep.pre)
str(ep.pre)

ep.pre$Treatment <- factor(ep.pre$Treatment, levels = c("Control", "Low" , "High"), ordered = TRUE )

# change char to factor
ep.pre[ , c( "Code", "Mouse", "ID", "Sex", "Litter" )] <- 
  lapply(ep.pre[ , c("Code", "Mouse", "ID", "Sex", "Litter" )], 
         FUN = factor)

names(mice_info)
# [1] "mouseID"                        "Animal no"                      "Sex"                           
# [4] "Litter"                         "Date of Birth"                  "Cage no"                       
# [7] "Cage Name"                      "Treatment"                      "Rack"                          
# [10] "Ear notch ID"                   "Ear notch pos"                  "Age_received_20_Aug"           
# [13] "Age_first_soil_exposure_27_Aug" "Rack_position"    


ep.pre <- merge(x = ep.pre, y = mice_info[ ,c( "mouseID", "Date of Birth",
                                               "Cage no", "Cage Name", "Rack",
                                               "Age_received_20_Aug", 
                                               "Age_first_soil_exposure_27_Aug",
                                               "Rack_position")], by.x = "ID", by.y = "mouseID", all.x = TRUE )

str(ep.pre)
# 'data.frame':	54 obs. of  34 variables:
# $ ID                            : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ Test_no                       : num  1 2 3 13 14 15 25 26 27 34 ...
# $ Treatment                     : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Code                          : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Mouse                         : Factor w/ 3 levels "m1","m2","m3": 1 2 3 1 2 3 1 2 3 1 ...
# $ Sex                           : Factor w/ 2 levels "female","male": 2 2 2 1 1 1 2 2 2 1 ...
# $ Litter                        : Factor w/ 15 levels "Blue 20","Blue 24",..: 2 3 11 11 12 14 9 10 1 15 ...
# $ Stage                         : chr  "First stage" "First stage" "First stage" "First stage" ...
# $ Trial                         : num  1 1 1 1 1 1 1 1 1 1 ...
# $ Apparatus                     : chr  "plus maze 1" "plus maze 2" "plus maze 3" "plus maze 1" ...
# $ Duration                      : num  600 600 600 600 600 600 600 600 600 600 ...
# $ Distance                      : num  13.45 3.24 9.82 13.59 13.75 ...
# $ Centre_entries                : num  48 34 77 69 83 65 61 79 90 30 ...
# $ Centre_time                   : num  43.8 88.9 205.9 151.5 142.8 ...
# $ Centre_distance               : num  1.723 0.585 2.187 1.971 2.609 ...
# $ Centre_head_time              : num  81.3 67.2 169.2 52.5 97.5 ...
# $ Centre_head_entries           : num  85 62 73 57 62 13 94 75 63 46 ...
# $ Open_arm_entries              : num  20 23 35 41 46 40 27 31 58 16 ...
# $ Open_arm_time                 : num  56.7 298 85.5 209.2 187.9 ...
# $ Open_arm_distance             : num  0.429 0.628 1.149 3.73 3.184 ...
# $ Open_arm_head_time            : num  82.5 271.2 119.6 250.4 218.3 ...
# $ Open_arm_head_entries         : num  50 45 49 44 46 10 80 50 54 35 ...
# $ Closed_arm_entries            : num  33 12 42 36 38 40 46 49 43 19 ...
# $ Closed_arm_time               : num  500 213 309 239 269 ...
# $ Closed_arm_distance           : num  11.45 2.13 6.71 8.12 8.43 ...
# $ Closed_arm_head_time          : num  436 262 311 297 284 ...
# $ Closed_arm_head_entries       : num  53 25 54 52 52 15 76 65 57 28 ...
# $ Date of Birth                 : POSIXct, format: "2018-08-02" "2018-08-04" "2018-08-02" "2018-08-02" ...
# $ Cage no                       : num  1 1 1 5 5 5 9 9 9 12 ...
# $ Cage Name                     : Factor w/ 18 levels "C1","C2","C3",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ Rack                          : num  1 1 1 2 2 2 3 3 3 4 ...
# $ Age_received_20_Aug           : num  2.6 2.3 2.6 2.6 2.6 3 4.1 4.1 4.1 3.7 ...
# $ Age_first_soil_exposure_27_Aug: num  3.6 3.3 3.6 3.6 3.6 4 5.1 5.1 5.1 4.7 ...
# $ Rack_position                 : Factor w/ 3 levels "low","middle",..: 3 3 3 2 2 2 1 1 1 1 ...


ep.pre$time_centre_and_open_arm <- ep.pre$Centre_time + ep.pre$Open_arm_time




p <- ggplot(data = ep.pre, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot() +
  facet_grid( ~ Sex) +
  theme_bw()
p


## isolate data for females and males respectively - to exclude outliers and test for difference
names(ep.pre)
# [1] "ID"                             "Test_no"                        "Treatment"                     
# [4] "Code"                           "Mouse"                          "Sex"                           
# [7] "Litter"                         "Stage"                          "Trial"                         
# [10] "Apparatus"                      "Duration"                       "Distance"                      
# [13] "Centre_entries"                 "Centre_time"                    "Centre_distance"               
# [16] "Centre_head_time"               "Centre_head_entries"            "Open_arm_entries"              
# [19] "Open_arm_time"                  "Open_arm_distance"              "Open_arm_head_time"            
# [22] "Open_arm_head_entries"          "Closed_arm_entries"             "Closed_arm_time"               
# [25] "Closed_arm_distance"            "Closed_arm_head_time"           "Closed_arm_head_entries"       
# [28] "Date of Birth"                  "Cage no"                        "Cage Name"                     
# [31] "Rack"                           "Age_received_20_Aug"            "Age_first_soil_exposure_27_Aug"
# [34] "Rack_position"   

sel <- which(ep.pre$Sex == "female")

#ep_ct_fem_pre <- ep.pre[sel, c("ID", "Treatment", "Centre_time")]

ep_ct_fem_pre <- ep.pre[sel, c("ID", "Treatment", "time_centre_and_open_arm")]




## Kruskal-Wallis multiple comparison test 

str(ep_ct_fem_pre)
# 'data.frame':	27 obs. of  3 variables:
# $ ID                      : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ time_centre_and_open_arm: num  360.7 330.7 271.3 84.8 97.2 ...

ep_ct_fem_pre$group <- as.character(ep_ct_fem_pre$Treatment)

str(ep_ct_fem_pre)
# 'data.frame':	27 obs. of  4 variables:
# $ ID                      : Factor w/ 54 levels "C1m1","C1m2",..: 4 5 6 10 11 12 13 14 15 25 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ time_centre_and_open_arm: num  360.7 330.7 271.3 84.8 97.2 ...
# $ group                   : chr  "Control" "Control" "Control" "Control" ...

p <- ggplot(data = ep_ct_fem_pre, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot() +
  theme_bw()
p



## Exclude outliers from significance testing - using default outlier classification
## outlier is defined as values outside 1.5 times the interquartile range above the upper quartile or below the lower quartile.

bp <- boxplot(ep_ct_fem_pre$time_centre_and_open_arm ~ ep_ct_fem_pre$group)
bp$out #  none

# sel <- which(ep_ct_fem_pre$time_centre_and_open_arm %in% bp$out)
# ep_ct_fem_pre$time_centre_and_open_arm[sel] #  
# ep_ct_fem_pre[sel, ]


# # exclude outliers
# ep_ct_fem_pre__ex_out <- ep_ct_fem_pre[-sel, ]

# str(ep_ct_fem_pre__ex_out)

# table(ep_ct_fem_pre__ex_out$Treatment)

table(ep_ct_fem_pre$Treatment)
# Control     Low    High 
#       9       9       9 

# Kruskal-Wallis test
#kt <- kruskal.test( time_centre_and_open_arm ~ factor(group), ep_ct_fem_pre__ex_out) # Kruskal Wallis test
kt <- kruskal.test( time_centre_and_open_arm ~ factor(group), ep_ct_fem_pre) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  time_centre_and_open_arm by factor(group)
# Kruskal-Wallis chi-squared = 3.3474, df = 2, p-value = 0.1875



# # # # # # # # #

## isolate males and subset data
names(ep.pre)

sel <- which(ep.pre$Sex == "male")

ep_ct_male_pre <- ep.pre[sel, c("ID", "Treatment", "time_centre_and_open_arm")]

ep_ct_male_pre$group <- as.character(ep_ct_male_pre$Treatment)

str(ep_ct_male_pre)
# 'data.frame':	27 obs. of  4 variables:
# $ ID                      : Factor w/ 54 levels "C1m1","C1m2",..: 1 2 3 7 8 9 16 17 18 19 ...
# $ Treatment               : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 3 ...
# $ time_centre_and_open_arm: num  100 387 291 158 246 ...
# $ group                   : chr  "Control" "Control" "Control" "Control" ...


## Exclude boxplot outliers from Kruskal Wallis test

bp <- boxplot(ep_ct_male_pre$time_centre_and_open_arm ~ ep_ct_male_pre$group)
bp$out #   none

# sel <- which(ep_ct_male_pre$time_centre_and_open_arm %in% bp$out)
# ep_ct_male_pre$time_centre_and_open_arm[sel] # 
# ep_ct_male_pre[sel, ]
# 
# 
# # exclude outliers
# ep_ct_male_pre__ex_out <- ep_ct_male_pre[-sel, ]

#str(ep_ct_male_pre__ex_out)


#table(ep_ct_male_pre__ex_out$Treatment)

table(ep_ct_male_pre$Treatment)
# Control     Low    High 
#       9       9       9


# Kruskal-Wallis test
kt <- kruskal.test( time_centre_and_open_arm ~ factor(group), ep_ct_male_pre) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# 
# data:  time_centre_and_open_arm by factor(group)
# Kruskal-Wallis chi-squared = 4.7019, df = 2, p-value = 0.09528

## i.e. no significant difference



# # # # # # # # #
# # # # # # # # #

## Plot Elev Plus - BASELINE - Time in Centre & open Arms - Female & Male


## plot using facet_grid

# don't colour outliers
bp <- boxplot(ep.pre$time_centre_and_open_arm ~ ep.pre$Treatment + ep.pre$Sex)
bp$out #   none

# sel <- which(ep.pre$time_centre_and_open_arm %in% bp$out)
# ep.pre$time_centre_and_open_arm[sel] # 

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

ep.pre.COPY <- ep.pre
ep.pre.COPY$Sex <- factor(ep.pre.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

set.seed(4321)
p <- ggplot(data=ep.pre.COPY, aes(x = Treatment, y = time_centre_and_open_arm)) +
  geom_boxplot(outlier.size=1) + 
  facet_grid(~Sex ) +
  #geom_jitter(data = ep.pre.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  geom_jitter(data = ep.pre.COPY, aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Time in centre or open arms (s)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p


dev.print(tiff, file = paste0("plots/","Elev-Plus-BASELINE-Time-in-Centre-and-Open-Arms-Results-Female-and-Male-vFINAL.tiff"), width = 8.7, height = 6, units = "cm", res=600, compression="lzw")





# # # # # # # # #


## Distance travelled


## plot using facet_grid

# don't colour outliers
bp <- boxplot(ep.post$Distance ~ ep.post$Treatment + ep.post$Sex)
bp$out #   5.789  2.624 16.023

sel <- which(ep.post$Distance %in% bp$out)
ep.post$Distance[sel] #  2.624  5.789 16.023


# Kruskal-Wallis test
kt <- kruskal.test( Distance ~ factor(Treatment), ep.post[-sel, ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Distance by factor(Treatment)
# Kruskal-Wallis chi-squared = 0.33697, df = 2, p-value = 0.8449

# Males & exclude outlier
kt <- kruskal.test( Distance ~ factor(Treatment), ep.post[-which(ep.post$Distance %in% bp$out | ep.post$Sex == "female"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Distance by factor(Treatment)
# Kruskal-Wallis chi-squared = 2.4744, df = 2, p-value = 0.2902

# Females & exclude outlier
kt <- kruskal.test( Distance ~ factor(Treatment), ep.post[-which(ep.post$Distance %in% bp$out | ep.post$Sex == "male"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Distance by factor(Treatment)
# Kruskal-Wallis chi-squared = 5.0123, df = 2, p-value = 0.08158


cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

ep.post.COPY <- ep.post
ep.post.COPY$Sex <- factor(ep.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

bp <- boxplot(ep.post$Distance ~ ep.post$Treatment + ep.post$Sex)
sel <- which(ep.post$Distance %in% bp$out)

set.seed(4321)
p <- ggplot(data=ep.post.COPY, aes(x = Treatment, y = Distance)) +
  geom_boxplot( outlier.size=1) +
  facet_grid(~Sex ) +
  geom_jitter(data = ep.post.COPY[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Distance travelled (m)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")

  )

p

grid.text(label = "A", x = unit(0.02, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Elev-Plus-Distance-Travelled-Female-and-Male-vFINAL.tiff"), width = 9, height = 6, units = "cm", res=600, compression="lzw")




# # # # # # # # #

## No of entries ?

## Use entries into centre or open arms

ep.post.COPY <- ep.post
ep.post.COPY$Sex <- factor(ep.post.COPY$Sex, levels = c("female", "male"), labels=c("Female", "Male"))

ep.post.COPY$Centre_and_open_arm_entries <- ep.post.COPY$Centre_entries + ep.post.COPY$Open_arm_entries

## plot using facet_grid

# don't colour outliers
bp <- boxplot(ep.post.COPY$Centre_and_open_arm_entries ~ ep.post.COPY$Treatment + ep.post.COPY$Sex)
bp$out #   144 177 132  50

p <- ggplot(data=ep.post.COPY, aes(x = Treatment, y = Centre_and_open_arm_entries)) +
  geom_boxplot() +
  facet_grid(~Sex)
p


sel1 <- which(ep.post.COPY$Sex == "Female" & ep.post.COPY$Treatment == "Control" & ep.post.COPY$Centre_and_open_arm_entries %in% bp$out)
ep.post.COPY$Centre_and_open_arm_entries[sel1] #  144
sel2 <- which(ep.post.COPY$Sex == "Male" & ep.post.COPY$Treatment %in% c("Low","High") & ep.post.COPY$Centre_and_open_arm_entries %in% bp$out)
ep.post.COPY$Centre_and_open_arm_entries[sel2] # 132  50 177


# Kruskal-Wallis test
kt <- kruskal.test( Centre_and_open_arm_entries ~ factor(Treatment), ep.post.COPY[-c(sel1,sel2), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_and_open_arm_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 2.027, df = 2, p-value = 0.3629

# Females & exclude outlier
sel <- which(ep.post.COPY$Sex == "Female")
subsel.out <- which(ep.post.COPY$Treatment[sel] == "Control" & ep.post.COPY$Centre_and_open_arm_entries[sel] %in% bp$out)
ep.post.COPY[sel[-subsel.out], ]
kt <- kruskal.test( Centre_and_open_arm_entries ~ factor(Treatment), ep.post.COPY[sel[-subsel.out], ] ) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_and_open_arm_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 3.4015, df = 2, p-value = 0.1825

# Males & exclude outlier
sel <- which(ep.post.COPY$Sex == "Male")
subsel.out <- which(ep.post.COPY$Treatment[sel] %in% c("Low","High") & ep.post.COPY$Centre_and_open_arm_entries[sel] %in% bp$out)
ep.post.COPY[sel[-subsel.out], ]
kt <- kruskal.test( Centre_and_open_arm_entries ~ factor(Treatment), ep.post.COPY[sel[-subsel.out], ] ) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  Centre_and_open_arm_entries by factor(Treatment)
# Kruskal-Wallis chi-squared = 0.21402, df = 2, p-value = 0.8985


cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

bp <- boxplot(ep.post.COPY$Centre_and_open_arm_entries ~ ep.post.COPY$Treatment + ep.post.COPY$Sex)
sel1 <- which(ep.post.COPY$Sex == "Female" & ep.post.COPY$Treatment == "Control" & ep.post.COPY$Centre_and_open_arm_entries %in% bp$out)
sel2 <- which(ep.post.COPY$Sex == "Male" & ep.post.COPY$Treatment %in% c("Low","High") & ep.post.COPY$Centre_and_open_arm_entries %in% bp$out)

set.seed(4321)
p <- ggplot(data=ep.post.COPY, aes(x = Treatment, y = Centre_and_open_arm_entries)) +
  geom_boxplot( outlier.size=1) +
  facet_grid(~Sex ) +
  geom_jitter(data = ep.post.COPY[-c(sel1,sel2), ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Centre + open arm entries (count)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")

  )

p

grid.text(label = "B", x = unit(0.02, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Elev-Plus-CENTRE-and-OPEN-Arm-Entries-Female-and-Male-vFINAL.tiff"), width = 9, height = 6, units = "cm", res=600, compression="lzw")


#------------------------





#### Dust quantification
#------------------------

dust <- read_excel(path= paste0(datadir,"/","Air-dust-weights-within-enclosures.xlsx"),
                   sheet=1, range="A3:D23", col_names = TRUE)
dust <- as.data.frame(dust)
str(dust)

dust$Treatment <- substring(dust$Enclosure,first = 1,last = 1)
levels(factor(dust$Treatment))
#[1] "C" "E" "H" "L"

unique( dust$Enclosure )
# [1] "C1"     "L1"     "H1"     "H2"     "C2"     "L2"     "L3"     "H3"     "C3"     "H4"     "L4"     "C4"     "C5"    
# [14] "H5"     "L5"     "L6"     "C6"     "H6"     "Empty1" "Empty2"

# ignore Empty measurements - these were checking +/- 0.05 mg precision of the scales
dim(dust) # 20 5
dust <- dust[-c(19,20), ]

dust
# Enclosure no_dust_mg with_dust_mg actual_dust_mg Treatment
# 1         C1    3311.61      3314.27           2.66         C
# 2         L1    3203.75      3212.71           8.96         L
# 3         H1    3181.73      3189.72           7.99         H
# 4         H2    3340.97      3354.52          13.55         H
# 5         C2    3380.20      3381.58           1.38         C
# 6         L2    3198.74      3210.03          11.29         L
# 7         L3    3279.20      3294.09          14.89         L
# 8         H3    3231.12      3248.87          17.75         H
# 9         C3    3473.82      3476.98           3.16         C
# 10        H4    3386.04      3400.12          14.08         H
# 11        L4    3345.59      3363.88          18.29         L
# 12        C4    3502.18      3505.10           2.92         C
# 13        C5    3338.23      3341.11           2.88         C
# 14        H5    3448.29      3460.54          12.25         H
# 15        L5    3368.33      3380.84          12.51         L
# 16        L6    3319.03      3332.96          13.93         L
# 17        C6    3458.19      3460.68           2.49         C
# 18        H6    3374.56      3382.22           7.66         H

dust$Treatment <- factor(dust$Treatment, levels = c("C","L","H"),
                         labels = c("Control", "Low", "High"),
                         ordered = TRUE)



## convert measured dust into dose per animal per week

# dust collected over 3 x 20 cm lengths of 2.5 cm wide tubing = area of 150 cm2.
# experiment had 3 mice per cage over 7 weeks.
# large mouse cage dimensions are 41 cm long x 25 cm wide = 1025 cm2
# convert mg to g

dust$actual_dust_permouse_perweek_g <- (1/150)*(1025)*(1/7)*(1/3)*(1/1000)*dust$actual_dust_mg

str(dust)
# 'data.frame':	18 obs. of  6 variables:
# $ Enclosure                      : chr  "C1" "L1" "H1" "H2" ...
# $ no_dust_mg                     : num  3312 3204 3182 3341 3380 ...
# $ with_dust_mg                   : num  3314 3213 3190 3355 3382 ...
# $ actual_dust_mg                 : num  2.66 8.96 7.99 13.55 1.38 ...
# $ Treatment                      : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 2 3 3 1 2 2 3 1 3 ...
# $ actual_dust_permouse_perweek_g: num  0.000866 0.002916 0.0026 0.004409 0.000449 ...

boxplot(dust$actual_dust_permouse_perweek_g ~ Treatment, data=dust)

p <- ggplot(data=dust, aes(x = Treatment, y = actual_dust_permouse_perweek_g)) +
  geom_boxplot()

pp <- p + theme_bw() +
  labs(y="Dust exposure (g.mouse-1.week-1)", x=NULL)

pp


## minimise plot

# don't colour outliers
bp <- boxplot(dust$actual_dust_permouse_perweek_g ~ dust$Treatment)
bp$out #  0.0004490476

sel <- which(dust$actual_dust_permouse_perweek_g %in% bp$out)
dust$actual_dust_permouse_perweek_g[sel] # 0.0004490476



# Kruskal-Wallis test
kt <- kruskal.test( actual_dust_permouse_perweek_g ~ factor(as.character(Treatment)), dust[-sel, ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  actual_dust_permouse_perweek_g by factor(Treatment)
# Kruskal-Wallis chi-squared = 10.209, df = 2, p-value = 0.006069


table( dust[-sel, ]$Treatment )
# Control     Low    High 
#       5       6       6 


## Dunn Test uses factor vector or non-numeric vector that can be coerced to a factor vector
pt <- dunnTest( actual_dust_permouse_perweek_g ~ factor(as.character(Treatment)), data= dust[-sel, ],
                method = "bonferroni" )
# Warning message:
# group was coerced to a factor. 

pt
# Dunn (1964) Kruskal-Wallis multiple comparison
# p-values adjusted with the Bonferroni method.
# 
#       Comparison          Z     P.unadj       P.adj
# 1 Control - High -2.5617739 0.010413908 0.031241724
# 2  Control - Low -2.9978206 0.002719177 0.008157532
# 3     High - Low -0.4573296 0.647434186 1.000000000

pt$dtres
pt <- pt$res
pt

cldList(comparison = pt$Comparison,
        p.value    = pt$P.adj,
        threshold  = 0.05)

# Group Letter MonoLetter
# 1 Control      a         a 
# 2    High      b          b
# 3     Low      b          b



cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue


set.seed(321)
p <- ggplot(data=dust, aes(x = Treatment, y = actual_dust_permouse_perweek_g)) +
  geom_boxplot(outlier.size=1) +
  geom_jitter(data = dust[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y= bquote('Dust exposure (g.'*mouse^-1~week^-1*')'), x = NULL ) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)) #,
    #strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p

# add significance letters
grid.text(label = "a", x = unit(0.4, "npc") , y = unit(0.3,"npc"), gp=gpar(fontsize=10.5) )
grid.text(label = "b", x = unit(0.65, "npc") , y = unit(0.88,"npc"), gp=gpar(fontsize=10.5) )
grid.text(label = "b", x = unit(0.9, "npc") , y = unit(0.85,"npc"), gp=gpar(fontsize=10.5) )

dev.print(tiff, file = paste0("plots/","Dust-quantification-boxplots-with-Signif-letters-vFINAL.tiff"), width = 7, height = 6, units = "cm", res=600, compression="lzw")




sel <- which(dust$Treatment == "Control")
summary(dust$actual_dust_permouse_perweek_g[sel])
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
# 0.0004490 0.0008241 0.0009013 0.0008401 0.0009469 0.0010283

sel <- which(dust$Treatment == "Low")
summary(dust$actual_dust_permouse_perweek_g[sel])
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
# 0.002916 0.003773 0.004302 0.004332 0.004767 0.005952 

sel <- which(dust$Treatment == "High")
summary(dust$actual_dust_permouse_perweek_g[sel])
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
# 0.002493 0.002946 0.004198 0.003974 0.004538 0.005776 





## Ottman et al 2018 have average soil exposures of ~ 32 g.mouse-1.week-1

32/0.004332 # Exposure ratio:  Ottman et soil / Low soil dust = 7386

32/0.003974 # Exposure ratio:  Ottman et soil / High soil dust = 8052.34

# Zhou et al 2015 also use similar quantities of exposure material directly into cage

#------------------------





#### Import microbiome data
#------------------------

raw.otu <- read_excel(path= paste0(datadir,"/","absoluteabundance.xlsx"),
                          sheet=1, range="A1:LN10485", col_names = TRUE)

raw.otu <- as.data.frame(raw.otu)
str(raw.otu)
# this is a combined OTU abundance table & taxonomy table

dim(raw.otu) # 10484  326

names(raw.otu)

# remove "-16S-V3-V4" from column names
gsub(pattern = "-16S-V3-V4",x = names(raw.otu), replacement = "")

names(raw.otu) <- gsub(pattern = "-16S-V3-V4",x = names(raw.otu), replacement = "")
names(raw.otu)
names(raw.otu)[1] # "#OTU ID"
names(raw.otu)[1] <- "OTU_ID"
names(raw.otu)


## split off taxonomy table ... then tidy up later
raw.tax <- raw.otu[ ,c("OTU_ID", "Consensus Lineage")]


## tidy up OTU table
row.names(raw.otu) <- raw.otu$OTU_ID
names(raw.otu)[-c(1, length(names(raw.otu)))]

temp <- raw.otu

raw.otu <- raw.otu[ , -c(1, length(names(raw.otu)))]
dim(raw.otu) # 10484   324
names(raw.otu)
# [1] "L3m2Ce"   "L3m3T01"  "C5a1w1"   "L1a2w1"   "H5s1w1"   "L2s1w7"   "L1m3T01"  "bat11blk" "L5m3T16"  "C3a1w6"   "H3m3Ce"   "Rem29"   
# [13] "H1a2w1"   "L5s1w1"   "Clr26"    "L3m3T16"  "H3m2Ce"   "L1m3T16"  "ABLK2w6"  "L3m3Ce"   "H2s1w7"   "L5m3T01"  "L2s2w7"   "bat8blk" 
# [25] "bat12blk" "C3a2w6"   "L3m1Ce"   "C5a2w1"   "L1a1w1"   "H5s2w1"   "ABLK1w6"  "H2s2w7"   "H1a1w1"   "L5s2w1"   "H3m1Ce"   "L6a1w6"  
# [37] "L3s2w7"   "H3m1T01"  "C2a2w6"   "bfre3T13" "H1m1T01"  "C4a2w1"   "L2m1Ce"   "C6m2Ce"   "H4s2w1"   "H5m1T16"  "H3m1T16"  "C6m3Ce"  
# [49] "H6a1w6"   "H3s2w7"   "L4s2w1"   "H1m1T16"  "H5m1T01"  "H2m1Ce"   "H1m2T01"  "C1m3T16"  "C4a1w1"   "L2m2Ce"   "C6m1Ce"   "C5m3T01" 
# [61] "H4s1w1"   "H5m2T16"  "L6a2w6"   "L3s1w7"   "H3m2T01"  "C3m3T16"  "H2m3Ce"   "C2a1w6"   "C1m3T01"  "L4s1w1"   "H1m2T16"  "bat14blk"
# [73] "H5m2T01"  "C5m3T16"  "H2m2Ce"   "C3m3T01"  "H3m2T16"  "L2m3Ce"   "Rem23"    "H6a2w6"   "H3s1w7"   "L2m2T16"  "bat4blk"  "L6m2T01" 
# [85] "Rem36"    "L1s2w7"   "L4a1w6"   "L2a1w1"   "H6s2w1"   "C4m2Ce"   "L4m2T01"  "C6a2w1"   "L2m2T01"  "H1s2w7"   "H4a1w6"   "C4m3Ce"  
# [97] "L6m2T16"  "Clr39"    "H2a1w1"   "L6s2w1"   "L4m2T16"  "L2a2w1"   "H6s1w1"   "C4m1Ce"   "Rem13"    "L4m1T01"  "C6a1w1"   "L2m1T16" 
# [109] "bat7blk"  "L1s1w7"   "L6m1T01"  "L4a2w6"   "H2a2w1"   "L6s1w1"   "L4m1T16"  "H1s1w7"   "L2m1T01"  "H4a2w6"   "L6m1T16"  "C5m1Ce"  
# [121] "L3a2w1"   "C2m1T01"  "C6m1T16"  "L1m2Ce"   "Clr16"    "H1m3Ce"   "C1a1w6"   "L5a2w6"   "C4m1T16"  "C2m1T16"  "H1m2Ce"   "Rem19"   
# [133] "H3a2w1"   "C6m1T01"  "H5a2w6"   "C4m1T01"  "L1m3Ce"   "C1a2w6"   "L5a1w6"   "C4m2T16"  "H4m3T01"  "C5m2Ce"   "H2m3T16"  "C2m2T01" 
# [145] "L3a1w1"   "C6m2T16"  "Clr33"    "H6m3T01"  "L1m1Ce"   "C5m3Ce"   "H5a1w6"   "H4m3T16"  "C4m2T01"  "C2m2T16"  "H2m3T01"  "H1m1Ce"  
# [157] "bat1blk"  "H6m3T16"  "C6m2T01"  "H3a1w1"   "C2a2w1"   "L4m1Ce"   "L3m2T01"  "H2s2w1"   "bfre2T5"  "L6a1w1"   "bat10blk" "L5m2T16" 
# [169] "L5s2w7"   "Rem16"    "C4a2w6"   "L1m2T01"  "L2s2w1"   "H6a1w1"   "Clr19"    "L3m2T16"  "H4m1Ce"   "L5m2T01"  "H5s2w7"   "L1m2T16" 
# [181] "bfre3T5"  "L5m1T16"  "bat13blk" "bat9blk"  "L5s1w7"   "H4m3Ce"   "C4a1w6"   "L1m1T01"  "C2a1w1"   "L4m2Ce"   "L3m1T01"  "Rem33"   
# [193] "H2s1w1"   "bfre1T5"  "L6a2w1"   "L4m3Ce"   "L5m1T01"  "L1m1T16"  "H5s1w7"   "L2s1w1"   "H6a2w1"   "L3m1T16"  "H4m2Ce"   "L4s1w7"  
# [205] "L1a2w6"   "C5a1w6"   "Clr36"    "H5m3Ce"   "C3m1T16"  "L5m2Ce"   "C5m1T01"  "C3a1w1"   "bfre2T13" "H3s1w1"   "C1m1T16"  "C1m1Ce"  
# [217] "L5m3Ce"   "C3m1T01"  "H4s1w7"   "H1a2w6"   "bat16blk" "L3s1w1"   "C5m1T16"  "Rem39"    "H5m2Ce"   "C1m1T01"  "ABLK2w1"  "Clr13"   
# [229] "L5m1Ce"   "C5m2T01"  "C3a2w1"   "H5m3T16"  "bfre1T13" "H1m3T01"  "C1m2T16"  "H3s2w1"   "C1m2Ce"   "L4s2w7"   "L1a1w6"   "C5a2w6"  
# [241] "H3m3T01"  "C3m2T16"  "H5m3T01"  "bat15blk" "C5m2T16"  "L3s2w1"   "H5m1Ce"   "C1m2T01"  "ABLK1w1"  "H1m3T16"  "C3m2T01"  "H3m3T16" 
# [253] "H4s2w7"   "H1a1w6"   "C1m3Ce"   "bat5blk"  "L6m3T01"  "L3a2w6"   "L2m3T16"  "L6s1w7"   "L4m3T01"  "C3m1Ce"   "H1s1w1"   "L5a2w1"  
# [265] "C1a1w1"   "H3a2w6"   "L6m3T16"  "H6s1w7"   "L2m3T01"  "L4m3T16"  "L1s1w1"   "H5a2w1"   "C3m2Ce"   "H1s2w1"   "L5a1w1"   "C1a2w1"  
# [277] "bat6blk"  "L3a1w6"   "L6s2w7"   "L1s2w1"   "H5a1w1"   "C3m3Ce"   "H3a1w6"   "Clr23"    "H6s2w7"   "L4a1w1"   "H6m1T01"  "C2m2Ce"  
# [289] "L6m1Ce"   "H2m1T16"  "C6a2w6"   "H4m1T01"  "Clr29"    "L2a1w6"   "H6m1Ce"   "H6m1T16"  "Rem26"    "H4a1w1"   "H2m1T01"  "H4m1T16" 
# [301] "H2a1w6"   "C2m3Ce"   "C6a1w6"   "H6m3Ce"   "C4m3T16"  "H4m2T01"  "L2a2w6"   "C6m3T16"  "L4a2w1"   "H6m2T01"  "C2m1Ce"   "L6m2Ce"  
# [313] "H2m2T16"  "C2m3T01"  "H4m2T16"  "C4m3T01"  "H2a2w6"   "L6m3Ce"   "H6m2Ce"   "H6m2T16"  "C6m3T01"  "C2m3T16"  "H4a2w1"   "H2m2T01"


## Create 'otuTable'
#  otu_table - Works on any numeric matrix. 
#  You must also specify if the species are rows or columns
otu.16s <- as.matrix(raw.otu)
OTU.16s <- otu_table(otu.16s, taxa_are_rows = TRUE)

dim(OTU.16s) # 10484   324



## tidy up Taxonomy table

str(raw.tax)
# 'data.frame':	10484 obs. of  2 variables:
# $ OTU_ID           : chr  "OTU_1" "OTU_2" "OTU_3" "OTU_4" ...
# $ Consensus Lineage: chr  "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__" "k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Lactobacillaceae; g__Lactobacillus; s__" "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__; s__" "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__; g__; s__" ...

# Separate the "Consensus Lineage" column, currently separated by semicolon
# so that each taxonomic rank has its own column
# e.g. "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Hyphomicrobiaceae; g__Rhodoplanes; s__"

# note there is a space included in the separator "; "

tax <- separate(raw.tax, `Consensus Lineage`, c("Kingdom","Phylum","Class","Order", "Family", "Genus","Species"), sep= "; ", remove=TRUE)

# Warning message:
# Expected 7 pieces. Missing pieces filled with `NA` in 1244 rows [36, 71, 149, 376, 529, 552, 553, 655, 803, 841, 868, 891, 999, 1009, 1048, 1062, 1077, 1169, 1214, 1275, ...]. 

# move OTU_ID to row.name then delete that column
row.names(tax) <- tax$OTU_ID
tax <- tax[, -1]

# tidy up unknown taxa
dim(tax) # 10484     7

taxa_prefix <- c("k__", "p__", "c__", "o__", "f__", "g__", "s__")

for (i in 1:length(names(tax))) {
  sel <- which( is.na(tax[, i]) | tax[, i] == taxa_prefix[i] )
  tax[ sel, names(tax)[i] ] <- paste0(taxa_prefix[i],"unknown")
}



## Create 'taxonomyTable'
#  tax_table - Works on any character matrix. 
#  The rownames must match the OTU names (taxa_names) of the otu_table if you plan to combine it with a phyloseq-object.
tax <- as.matrix(tax)
TAX.16s <- tax_table(tax)


## Create a phyloseq object, merging OTU & TAX tables
phy0.16s = phyloseq(OTU.16s, TAX.16s)
phy0.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10484 taxa and 324 samples ]
# tax_table()   Taxonomy Table:    [ 10484 taxa by 7 taxonomic ranks ]



## now join to sample data table

samp <- data.frame(sample = sample_names(phy0.16s), samp_type = NA, mouseID = NA, Treatment = NA, Time = NA)
str(samp)
samp$sample <- as.character(samp$sample)

# fill mouseID based on sample names

samp$sample
# [1] "L3m2Ce"   "L3m3T01"  "C5a1w1"   "L1a2w1"   "H5s1w1"   "L2s1w7"   "L1m3T01"  "bat11blk" "L5m3T16"  "C3a1w6"   "H3m3Ce"   "Rem29"   
# [13] "H1a2w1"   "L5s1w1"   "Clr26"    "L3m3T16"  "H3m2Ce"   "L1m3T16"  "ABLK2w6"  "L3m3Ce"   "H2s1w7"   "L5m3T01"  "L2s2w7"   "bat8blk" 
# [25] "bat12blk" "C3a2w6"   "L3m1Ce"   "C5a2w1"   "L1a1w1"   "H5s2w1"   "ABLK1w6"  "H2s2w7"   "H1a1w1"   "L5s2w1"   "H3m1Ce"   "L6a1w6"  
# [37] "L3s2w7"   "H3m1T01"  "C2a2w6"   "bfre3T13" "H1m1T01"  "C4a2w1"   "L2m1Ce"   "C6m2Ce"   "H4s2w1"   "H5m1T16"  "H3m1T16"  "C6m3Ce"  
# [49] "H6a1w6"   "H3s2w7"   "L4s2w1"   "H1m1T16"  "H5m1T01"  "H2m1Ce"   "H1m2T01"  "C1m3T16"  "C4a1w1"   "L2m2Ce"   "C6m1Ce"   "C5m3T01" 
# [61] "H4s1w1"   "H5m2T16"  "L6a2w6"   "L3s1w7"   "H3m2T01"  "C3m3T16"  "H2m3Ce"   "C2a1w6"   "C1m3T01"  "L4s1w1"   "H1m2T16"  "bat14blk"
# [73] "H5m2T01"  "C5m3T16"  "H2m2Ce"   "C3m3T01"  "H3m2T16"  "L2m3Ce"   "Rem23"    "H6a2w6"   "H3s1w7"   "L2m2T16"  "bat4blk"  "L6m2T01" 
# [85] "Rem36"    "L1s2w7"   "L4a1w6"   "L2a1w1"   "H6s2w1"   "C4m2Ce"   "L4m2T01"  "C6a2w1"   "L2m2T01"  "H1s2w7"   "H4a1w6"   "C4m3Ce"  
# [97] "L6m2T16"  "Clr39"    "H2a1w1"   "L6s2w1"   "L4m2T16"  "L2a2w1"   "H6s1w1"   "C4m1Ce"   "Rem13"    "L4m1T01"  "C6a1w1"   "L2m1T16" 
# [109] "bat7blk"  "L1s1w7"   "L6m1T01"  "L4a2w6"   "H2a2w1"   "L6s1w1"   "L4m1T16"  "H1s1w7"   "L2m1T01"  "H4a2w6"   "L6m1T16"  "C5m1Ce"  
# [121] "L3a2w1"   "C2m1T01"  "C6m1T16"  "L1m2Ce"   "Clr16"    "H1m3Ce"   "C1a1w6"   "L5a2w6"   "C4m1T16"  "C2m1T16"  "H1m2Ce"   "Rem19"   
# [133] "H3a2w1"   "C6m1T01"  "H5a2w6"   "C4m1T01"  "L1m3Ce"   "C1a2w6"   "L5a1w6"   "C4m2T16"  "H4m3T01"  "C5m2Ce"   "H2m3T16"  "C2m2T01" 
# [145] "L3a1w1"   "C6m2T16"  "Clr33"    "H6m3T01"  "L1m1Ce"   "C5m3Ce"   "H5a1w6"   "H4m3T16"  "C4m2T01"  "C2m2T16"  "H2m3T01"  "H1m1Ce"  
# [157] "bat1blk"  "H6m3T16"  "C6m2T01"  "H3a1w1"   "C2a2w1"   "L4m1Ce"   "L3m2T01"  "H2s2w1"   "bfre2T5"  "L6a1w1"   "bat10blk" "L5m2T16" 
# [169] "L5s2w7"   "Rem16"    "C4a2w6"   "L1m2T01"  "L2s2w1"   "H6a1w1"   "Clr19"    "L3m2T16"  "H4m1Ce"   "L5m2T01"  "H5s2w7"   "L1m2T16" 
# [181] "bfre3T5"  "L5m1T16"  "bat13blk" "bat9blk"  "L5s1w7"   "H4m3Ce"   "C4a1w6"   "L1m1T01"  "C2a1w1"   "L4m2Ce"   "L3m1T01"  "Rem33"   
# [193] "H2s1w1"   "bfre1T5"  "L6a2w1"   "L4m3Ce"   "L5m1T01"  "L1m1T16"  "H5s1w7"   "L2s1w1"   "H6a2w1"   "L3m1T16"  "H4m2Ce"   "L4s1w7"  
# [205] "L1a2w6"   "C5a1w6"   "Clr36"    "H5m3Ce"   "C3m1T16"  "L5m2Ce"   "C5m1T01"  "C3a1w1"   "bfre2T13" "H3s1w1"   "C1m1T16"  "C1m1Ce"  
# [217] "L5m3Ce"   "C3m1T01"  "H4s1w7"   "H1a2w6"   "bat16blk" "L3s1w1"   "C5m1T16"  "Rem39"    "H5m2Ce"   "C1m1T01"  "ABLK2w1"  "Clr13"   
# [229] "L5m1Ce"   "C5m2T01"  "C3a2w1"   "H5m3T16"  "bfre1T13" "H1m3T01"  "C1m2T16"  "H3s2w1"   "C1m2Ce"   "L4s2w7"   "L1a1w6"   "C5a2w6"  
# [241] "H3m3T01"  "C3m2T16"  "H5m3T01"  "bat15blk" "C5m2T16"  "L3s2w1"   "H5m1Ce"   "C1m2T01"  "ABLK1w1"  "H1m3T16"  "C3m2T01"  "H3m3T16" 
# [253] "H4s2w7"   "H1a1w6"   "C1m3Ce"   "bat5blk"  "L6m3T01"  "L3a2w6"   "L2m3T16"  "L6s1w7"   "L4m3T01"  "C3m1Ce"   "H1s1w1"   "L5a2w1"  
# [265] "C1a1w1"   "H3a2w6"   "L6m3T16"  "H6s1w7"   "L2m3T01"  "L4m3T16"  "L1s1w1"   "H5a2w1"   "C3m2Ce"   "H1s2w1"   "L5a1w1"   "C1a2w1"  
# [277] "bat6blk"  "L3a1w6"   "L6s2w7"   "L1s2w1"   "H5a1w1"   "C3m3Ce"   "H3a1w6"   "Clr23"    "H6s2w7"   "L4a1w1"   "H6m1T01"  "C2m2Ce"  
# [289] "L6m1Ce"   "H2m1T16"  "C6a2w6"   "H4m1T01"  "Clr29"    "L2a1w6"   "H6m1Ce"   "H6m1T16"  "Rem26"    "H4a1w1"   "H2m1T01"  "H4m1T16" 
# [301] "H2a1w6"   "C2m3Ce"   "C6a1w6"   "H6m3Ce"   "C4m3T16"  "H4m2T01"  "L2a2w6"   "C6m3T16"  "L4a2w1"   "H6m2T01"  "C2m1Ce"   "L6m2Ce"  
# [313] "H2m2T16"  "C2m3T01"  "H4m2T16"  "C4m3T01"  "H2a2w6"   "L6m3Ce"   "H6m2Ce"   "H6m2T16"  "C6m3T01"  "C2m3T16"  "H4a2w1"   "H2m2T01"

## populate: samp$samp_type

sel <- grep(pattern = "s", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$samp_type[sel] <- "soil"

sel1 <- grep(pattern = "m", x = samp$sample)
# inspect
samp$sample[sel1]
sel2 <- grep(pattern = "Rem", x = samp$sample)
samp$sample[sel2]

samp$samp_type[sel1] <- "fecal"
# overwrite 
samp$samp_type[sel2] <- "soil"

sel1 <- grep(pattern = "a", x = samp$sample)
sel2 <- grep(pattern = "bat", x = samp$sample)
samp$sample[sel1]
samp$sample[sel2]
# set values
samp$samp_type[sel1] <- "air"
# overwrite
samp$samp_type[sel2] <- "extraction blank"


sel <- grep(pattern = "ABLK", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$samp_type[sel] <- "air sampling blank"

sel <- grep(pattern = "bfre", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$samp_type[sel] <- "fresh bedding"

sel <- grep(pattern = "Clr", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$samp_type[sel] <- "soil"

sel <- grep(pattern = "Ce", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$samp_type[sel] <- "cecal"

sel <- which(is.na(samp$samp_type))
# empty



## populate: samp$mouseID

sel <- which(samp$samp_type %in% c("fecal","cecal"))
samp$mouseID[sel] <- samp$sample[sel]

# only use first 4 characters
samp$mouseID[sel] <- substr(x = samp$mouseID[sel], start = 1, stop = 4)

temp <- samp



## populate: samp$Treatment

sel <- which( substr(samp$sample, 1, 1) == "H" | substr(samp$sample, 1, 3) == "Rem" )
# inspect
samp$sample[sel]

samp$Treatment[sel] <- "High"


sel <- which( substr(samp$sample, 1, 1) == "L" | substr(samp$sample, 1, 3) == "Clr" )
# inspect
samp$sample[sel]

samp$Treatment[sel] <- "Low"


sel <- which( substr(samp$sample, 1, 2) %in% c("C1","C2","C3","C4","C5","C6") )
# inspect
samp$sample[sel]

samp$Treatment[sel] <- "Control"

table(samp$Treatment)
# Control    High     Low 
#      78     111     111

# note extra samples for High & Low are associated with source soils


## populate: samp$Time

sel <- grep(pattern = "T01", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 0"

sel <- grep(pattern = "T16", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 7"


sel <- grep(pattern = "Rem", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Source soil"


sel <- grep(pattern = "Clr", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Source soil"


sel <- grep(pattern = "w1", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 1"

sel <- grep(pattern = "T5", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 2"

sel <- grep(pattern = "T13", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 6"

sel <- grep(pattern = "w6", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 6"

sel <- grep(pattern = "T16", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 7"

sel <- grep(pattern = "w7", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Week 7"

sel <- grep(pattern = "Ce", x = samp$sample)
# inspect
samp$sample[sel]
# set values
samp$Time[sel] <- "Post-exposure"


temp <- samp

str(samp)
# 'data.frame':	324 obs. of  5 variables:
# $ sample   : chr  "L3m2Ce" "L3m3T01" "C5a1w1" "L1a2w1" ...
# $ samp_type: chr  "cecal" "fecal" "air" "air" ...
# $ mouseID  : chr  "L3m2" "L3m3" NA NA ...
# $ Treatment: chr  "Low" "Low" "Control" "Low" ...
# $ Time     : chr  "Post-exposure" "Week 0" "Week 1" "Week 1" ...


## merge with 'mice_info'
names(samp) #  "sample"    "samp_type" "mouseID"   "Treatment" "Time"     
names(mice_info)
# [1] "mouseID"                        "Animal no"                      "Sex"                           
# [4] "Litter"                         "Date of Birth"                  "Cage no"                       
# [7] "Cage Name"                      "Treatment"                      "Rack"                          
# [10] "Ear notch ID"                   "Ear notch pos"                  "Age_received_20_Aug"           
# [13] "Age_first_soil_exposure_27_Aug" "Rack_position"              

samp.join <- merge(x = samp, y = mice_info, by = "mouseID", all.x = TRUE)

# mouseID, Sex, Litter, Cage Name, Treatment, Rack, Rack_position



### Combine SAMPDATA into phyloseq object

row.names(samp.join) <- samp.join$sample

head(samp.join)

# tidy-up dataframe
sel <- which(names(samp.join)=="Treatment.y")
samp.join <- samp.join[ ,-sel]

sel <- which(names(samp.join)=="Treatment.x")
names(samp.join)[sel] <- "Treatment"


## fill in Cage Name - for soil and air  and not Clr or Rem

sel <- which( substr(samp.join$sample, 1, 2) %in% c("L1","L2","L3","L4","L5","L6","H1","H2","H3","H4","H5","H6") )
samp.join$sample[sel]

samp.join$`Cage Name`[sel] <- substr(samp.join$sample, 1, 2)[sel]



SAMP.16s <- sample_data(samp.join)


phy0.16s <- merge_phyloseq(phy0.16s, SAMP.16s)
phy0.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10484 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 10484 taxa by 7 taxonomic ranks ]


#------------------------





#### Data cleaning and removal of contaminant taxa - using decontam R package
#------------------------

phy0.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10484 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 10484 taxa by 7 taxonomic ranks ]


## remove taxa not assigned as Bacteria

levels(factor(tax_table(phy0.16s)[, "Kingdom"])) # "k__Archaea"   "k__Bacteria"  "Unclassified"
rem_taxa <- which(tax_table(phy0.16s)[, "Kingdom"] %in%  c("k__Archaea","Unclassified") ) # qty 31
phy1.16s <- prune_taxa(phy0.16s, taxa = row.names(tax_table(phy0.16s)[-rem_taxa, ]) )
phy1.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10453 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 10453 taxa by 7 taxonomic ranks ]


## remove taxa not assigned at the phylum level

rem_taxa <- which(tax_table(phy1.16s)[, "Phylum"] == "p__unknown") # qty 134
phy1.16s <- prune_taxa(phy1.16s, taxa = row.names(tax_table(phy1.16s)[-rem_taxa, ]) )
phy1.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10319 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 10319 taxa by 7 taxonomic ranks ]


rank_names(phy1.16s) #  "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
sort( as.character( unique( tax_table(phy1.16s)[, "Phylum"] ) ))
sort( as.character( unique( tax_table(phy1.16s)[, "Class"] ) ))
sort( as.character( unique( tax_table(phy1.16s)[, "Order"] ) ))
sort( as.character( unique( tax_table(phy1.16s)[, "Family"] ) ))
sort( as.character( unique( tax_table(phy1.16s)[, "Genus"] ) ))


## remove taxa associated with chloroplast, streptophyta, and mitochondria

rem_taxa1 <- which(tax_table(phy1.16s)[, "Class"] == "c__Chloroplast") # qty 42 OTUs
rem_taxa2 <- which(tax_table(phy1.16s)[, "Order"] == "o__Streptophyta") # qty 8 OTUs
rem_taxa3 <- which(tax_table(phy1.16s)[, "Family"] == "f__mitochondria") # qty 33 OTUs
c(rem_taxa1,rem_taxa2,rem_taxa3)
# [1]   207   592  1361  2351  2629  2697  2752  3171  3225  3537  3546  3552  3563  4019  4203  4257  4364  4489
# [19]  4506  5073  5080  5401  5581  5729  5918  6055  6057  7035  7401  7422  7476  7543  7685  7711  7753  7934
# [37]  8791  8813  8831  9446  9954 10129   207   592  2351  2697  2752  4489  5581  6057   652  1071  1611  1616
# [55]  1643  2148  3132  3164  3265  3565  3940  4351  4714  4719  5190  5679  5856  6107  6895  7113  7273  7764
# [73]  8066  8171  8413  9210  9501  9560  9634  9689  9964 10253 10268
length( c(rem_taxa1,rem_taxa2,rem_taxa3) ) # 83
length( unique(c(rem_taxa1,rem_taxa2,rem_taxa3)) ) # 75
length( unique(c(rem_taxa1,rem_taxa2)) ) # 42 ... i.e. overlap between "o__Streptophyta" and "c__Chloroplast"
unique(c(rem_taxa1,rem_taxa2,rem_taxa3))
# [1]   207   592  1361  2351  2629  2697  2752  3171  3225  3537  3546  3552  3563  4019  4203  4257  4364  4489
# [19]  4506  5073  5080  5401  5581  5729  5918  6055  6057  7035  7401  7422  7476  7543  7685  7711  7753  7934
# [37]  8791  8813  8831  9446  9954 10129   652  1071  1611  1616  1643  2148  3132  3164  3265  3565  3940  4351
# [55]  4714  4719  5190  5679  5856  6107  6895  7113  7273  7764  8066  8171  8413  9210  9501  9560  9634  9689
# [73]  9964 10253 10268

# remove chloroplast and mitochondria
phy1.16s <- prune_taxa(phy1.16s, taxa = row.names(tax_table(phy1.16s)[-unique(c(rem_taxa1,rem_taxa2,rem_taxa3)), ]) )
phy1.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10244 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 10244 taxa by 7 taxonomic ranks ]


min( taxa_sums(phy1.16s) ) # 2

temp <- phy1.16s


## remove taxa that do not occur in at least two samples

trim.16s <- prune_taxa( taxa = apply( otu_table(phy1.16s), 1, function(x) {sum(x > 0) >= 2 }), x = phy1.16s)
trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 8039 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 8039 taxa by 7 taxonomic ranks ]


## remove taxa if <50 sequence reads across samples

trim.16s <- prune_taxa(taxa = taxa_sums(trim.16s) >= 50 , x = trim.16s)
trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 3474 taxa and 324 samples ]
# sample_data() Sample Data:       [ 324 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 3474 taxa by 7 taxonomic ranks ]


ntaxa(trim.16s) # 3474
nsamples(trim.16s) # 324

min(sample_sums(trim.16s)) # 4
max(sample_sums(trim.16s)) # 153088

hist( sample_sums(trim.16s) )
sort(  sample_sums(trim.16s) )[1:60]
# L1m2T16   H4s2w1   L2m3Ce bat13blk  ABLK2w6  bat8blk  ABLK1w6 bat12blk  ABLK1w1  bfre1T5  bat7blk bat14blk  ABLK2w1 
#       4       22       31      116      160      231      231      369      450      606      669      691      839 
# bat9blk bfre1T13   L3a2w6  bfre3T5   H6a2w6   C2a2w6   L4a1w6   L3a1w6   L5a1w6   L2a1w6   L2a2w6 bat15blk bfre2T13 
#    1046     1090     1307     1351     1422     1681     1773     1777     2032     2270     2289     2776     2844 
# H3a1w6   L5a2w6   H4a2w6   H6a1w6   L1a1w6   C3a1w1   L6a2w6   H4a1w6   C2a1w6 bfre3T13  bfre2T5   H3a2w6   H1a2w6 
#   3031     3034     3232     3554     3587     3645     4016     4134     4334     4524     4676     5150     5188 
# C6a1w6   C5a1w6   H1a1w6   C6a2w6 bat16blk   L4a2w6   C4a1w6   H4a2w1   L6a1w6   L1a2w6   C4a2w6   L5a2w1   H2a2w1 
#   5419     6994     7494     7761     7928     8783     8926     8983    10220    10487    10582    10684    11077 
# C3a2w6 bat11blk   H4s1w1  bat1blk   H4a1w1   H6a2w1   C1a1w6  H4m3T01 
#  12045    12564    13901    14075    14235    14818    15241    15298


# guide to use of R decontam package here:
# https://benjjneb.github.io/decontam/vignettes/decontam_intro.html


## remove non-representative/failed low biomass samples

sort(sample_sums(trim.16s))[1:10]
# L1m2T16   H4s2w1   L2m3Ce bat13blk  ABLK2w6  bat8blk  ABLK1w6 bat12blk  ABLK1w1  bfre1T5 
#       4       22       31      116      160      231      231      369      450      606

## these sample failed; they should be high biomass sequence read counts:
## L1m2T16   H4s2w1   L2m3Ce

## therefore omit these samples: 
## L1m2T16 = fecal mouse#2 enclosure L1 time16 (week 7)
## H4s2w1  = soil subsample#2 enclosure H4 week 1
## L2m3Ce  = cecal sample mouse#3 enclosure L2


length(sample_names(trim.16s)) # 324

sel <- which(sample_names(trim.16s) %in% c("L1m2T16", "H4s2w1", "L2m3Ce"))
ok_samples <- sample_names(trim.16s)[-sel]
length(ok_samples) # 321

ok.trim.16s <- prune_samples( sample_names(trim.16s) %in% ok_samples, trim.16s )
ok.trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 3474 taxa and 321 samples ]
# sample_data() Sample Data:       [ 321 samples by 17 sample variables ]
# tax_table()   Taxonomy Table:    [ 3474 taxa by 7 taxonomic ranks ]

min(sample_sums(ok.trim.16s)) # 116
min(taxa_sums(ok.trim.16s)) # 50

table( ok.trim.16s@sam_data$samp_type )
# air air sampling blank              cecal   extraction blank 
#  72                  4                 53                 14 
# fecal      fresh bedding               soil 
#   107                  6                 65 

unique( ok.trim.16s@sam_data$samp_type )
# [1] "cecal"              "fecal"              "air"                "soil"               "extraction blank"  
# [6] "air sampling blank" "fresh bedding"     


## create new variable in sample data called 'Sample_or_Control', containing:
#  - True Sample = study sample
#  - Control Sample = negative control

ok.trim.16s@sam_data$Sample_or_Control <- "True Sample" # then over-write with Controls

sel <- which(ok.trim.16s@sam_data$samp_type %in% c("extraction blank", "air sampling blank"))
ok.trim.16s@sam_data$Sample_or_Control[sel] <- "Control Sample"

# summarize Sample_or_Control as logical variable, with  TRUE for control samples
# as required by isContaminant()

sample_data(ok.trim.16s)$is.neg <- sample_data(ok.trim.16s)$Sample_or_Control == "Control Sample"


## Add concentration data
quants <- read_excel(path= paste0(datadir,"/","quant-readings.xlsx"),
                     sheet=1, range="A1:B325", col_names = TRUE)

quants <- as.data.frame(quants)
str(quants)
# 'data.frame':	324 obs. of  2 variables:
# $ Sample_Name   : chr  "Rem13" "Rem16" "Rem19" "Rem23" ...
# $ Conc_ng_per_uL: num  10 10 10 10 10 10 10 10 10 10 ...


# create variable quant_reading to hold concentration information
ok.trim.16s@sam_data$quant_reading <- NA

## join concentration info to variable $quant_reading
for (i in 1:length(ok.trim.16s@sam_data$sample)) {
  sel <- which(quants$Sample_Name == ok.trim.16s@sam_data$sample[i] )
  if (length(sel)==1) { ok.trim.16s@sam_data$quant_reading[i] <- quants$Conc_ng_per_uL[sel] }
}

# inspect first 10 rows
ok.trim.16s@sam_data[ , c("sample","quant_reading", "Sample_or_Control", "is.neg")][1:10, ]
# Sample Data:        [10 samples by 4 sample variables]:
#            sample quant_reading Sample_or_Control is.neg
# L3m2Ce     L3m2Ce       10.0000       True Sample  FALSE
# L3m3T01   L3m3T01        9.0000       True Sample  FALSE
# C5a1w1     C5a1w1        0.0141       True Sample  FALSE
# L1a2w1     L1a2w1        0.0113       True Sample  FALSE
# H5s1w1     H5s1w1       10.0000       True Sample  FALSE
# L2s1w7     L2s1w7       10.0000       True Sample  FALSE
# L1m3T01   L1m3T01       10.0000       True Sample  FALSE
# bat11blk bat11blk        0.0010    Control Sample   TRUE
# L5m3T16   L5m3T16       10.0000       True Sample  FALSE
# C3a1w6     C3a1w6        0.0199       True Sample  FALSE


# inspect data
dim(ok.trim.16s@sam_data) # 321  20
ok.trim.16s@sam_data[ 1:200 , c("sample","quant_reading", "Sample_or_Control", "is.neg")]
ok.trim.16s@sam_data[ 201:321 , c("sample","quant_reading", "Sample_or_Control", "is.neg")]


## inspect library sizes in each sample, as a function of whether that sample 
#  was a true positive sample or a negative control

df <- as.data.frame(sample_data(ok.trim.16s))
df$LibrarySize <- sample_sums(ok.trim.16s)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()


### Split contaminant analysis, treating 'low biomass' and 'high biomass' samples separately
##  Per user documentation for R decontam package:
##  a) Assess low biomass using 'isNotContaminant()' function
##  b) Assess high biomass using 'isContaminant()' function
##  c) then pool all contaminants and remove from subsequent microbiome data analysis

### a) Assess low biomass: air, air sampling blanks, fresh bedding, and corresponding extraction blanks

# Air
sel <- which(ok.trim.16s@sam_data$samp_type == "air")
lb.air <- ok.trim.16s@sam_data$sample[sel]

# Fresh bedding
sel <- which(ok.trim.16s@sam_data$samp_type == "fresh bedding")
lb.fb <- ok.trim.16s@sam_data$sample[sel]

# Air sampling blanks
sel <- which(ok.trim.16s@sam_data$samp_type == "air sampling blank")
lb.air.blk <- ok.trim.16s@sam_data$sample[sel]

# Extraction blanks
sel <- which(ok.trim.16s@sam_data$sample %in% c("bat8blk",
                                                "bat9blk",
                                                "bat13blk",
                                                "bat14blk",
                                                "bat12blk")) # extraction blank for fresh bedding samples & soil (below)
lb.ext.blk <- ok.trim.16s@sam_data$sample[sel]


lb_samples <- c(lb.air,lb.fb,lb.air.blk,lb.ext.blk)
lb_samples
# [1] "C5a1w1"   "L1a2w1"   "C3a1w6"   "H1a2w1"   "C3a2w6"   "C5a2w1"   "L1a1w1"   "H1a1w1"   "L6a1w6"   "C2a2w6"  
# [11] "C4a2w1"   "H6a1w6"   "C4a1w1"   "L6a2w6"   "C2a1w6"   "H6a2w6"   "L4a1w6"   "L2a1w1"   "C6a2w1"   "H4a1w6"  
# [21] "H2a1w1"   "L2a2w1"   "C6a1w1"   "L4a2w6"   "H2a2w1"   "H4a2w6"   "L3a2w1"   "C1a1w6"   "L5a2w6"   "H3a2w1"  
# [31] "H5a2w6"   "C1a2w6"   "L5a1w6"   "L3a1w1"   "H5a1w6"   "H3a1w1"   "C2a2w1"   "L6a1w1"   "C4a2w6"   "H6a1w1"  
# [41] "C4a1w6"   "C2a1w1"   "L6a2w1"   "H6a2w1"   "L1a2w6"   "C5a1w6"   "C3a1w1"   "H1a2w6"   "C3a2w1"   "L1a1w6"  
# [51] "C5a2w6"   "H1a1w6"   "L3a2w6"   "L5a2w1"   "C1a1w1"   "H3a2w6"   "H5a2w1"   "L5a1w1"   "C1a2w1"   "L3a1w6"  
# [61] "H5a1w1"   "H3a1w6"   "L4a1w1"   "C6a2w6"   "L2a1w6"   "H4a1w1"   "H2a1w6"   "C6a1w6"   "L2a2w6"   "L4a2w1"  
# [71] "H2a2w6"   "H4a2w1"   "bfre3T13" "bfre2T5"  "bfre3T5"  "bfre1T5"  "bfre2T13" "bfre1T13" "ABLK2w6"  "ABLK1w6" 
# [81] "ABLK2w1"  "ABLK1w1"  "bat8blk"  "bat12blk" "bat14blk" "bat13blk" "bat9blk" 

lb.ok.trim.16s <- prune_samples( sample_names(ok.trim.16s) %in% lb_samples, ok.trim.16s )
lb.ok.trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 3474 taxa and 87 samples ]
# sample_data() Sample Data:       [ 87 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 3474 taxa by 7 taxonomic ranks ]

min(sample_sums(lb.ok.trim.16s)) # 116
min(taxa_sums(lb.ok.trim.16s))   # 0

# prune taxa that have zero sequence reads
lb.ok.trim.16s <- prune_taxa(taxa = taxa_sums(lb.ok.trim.16s) > 0, x = lb.ok.trim.16s)
lb.ok.trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2811 taxa and 87 samples ]
# sample_data() Sample Data:       [ 87 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2811 taxa by 7 taxonomic ranks ]

## need inputs to function:
# isNotContaminant(seqtab = <integer matrix>, neg = <logical for negative controls>,
#                  method = "prevalence", threshold = 0.5, normalize = TRUE, detailed = FALSE)

mat <- as.matrix(as.data.frame(t(lb.ok.trim.16s@otu_table)))
# requires a feature table recording the observed abundances of each sequence (or OTU)
# in each sample. Rows should correspond to samples, and columns to sequences (or OTUs).
class(mat) # "matrix"
mat[1:5, 1:5]
#         OTU_1 OTU_2 OTU_3 OTU_4 OTU_5
# C5a1w1   1185   211  3449  2007   111
# L1a2w1    445     0   678    57   123
# C3a1w6   1769   244  5575  3822   636
# H1a2w1    153   132    64    84   352
# ABLK2w6     1     2     1     0     1

lb.NOT.contam <- isNotContaminant(seqtab = mat, neg = lb.ok.trim.16s@sam_data$is.neg,
                                  method = "prevalence", threshold = 0.5,
                                  normalize = TRUE, detailed = TRUE)
# with detailed = TRUE, returns a data.frame containing diagnostic information on
# the non-contaminant decision. 
head(lb.NOT.contam)
#              freq prev p.freq       p.prev            p not.contaminant
# OTU_1 0.031961951   78     NA 1.878745e-04 1.878745e-04            TRUE
# OTU_2 0.007202436   67     NA 2.160981e-03 2.160981e-03            TRUE
# OTU_3 0.087721566   84     NA 3.962451e-04 3.962451e-04            TRUE
# OTU_4 0.048224660   77     NA 1.566598e-05 1.566598e-05            TRUE
# OTU_5 0.012677907   77     NA 3.630779e-04 3.630779e-04            TRUE
# OTU_6 0.003664329   35     NA 1.361211e-01 1.361211e-01            TRUE

table(lb.NOT.contam$not.contaminant)
# FALSE  TRUE 
#   726  2085 

lb.taxa.NotContam <- row.names(lb.NOT.contam)[which(lb.NOT.contam$not.contaminant==TRUE)]
lb.taxa.NotContam <- sort(lb.taxa.NotContam)

lb.taxa.IsContam <- row.names(lb.NOT.contam)[which(lb.NOT.contam$not.contaminant==FALSE)]
lb.taxa.IsContam <- sort(lb.taxa.IsContam)




### b) Assess high biomass: soil, cecal, fecal, and corresponding extraction blanks

unique( ok.trim.16s@sam_data$samp_type )

# Soil
sel <- which(ok.trim.16s@sam_data$samp_type == "soil")
hb.soil <- ok.trim.16s@sam_data$sample[sel]

# Cecal
sel <- which(ok.trim.16s@sam_data$samp_type == "cecal")
hb.cecal <- ok.trim.16s@sam_data$sample[sel]

# Fecal
sel <- which(ok.trim.16s@sam_data$samp_type == "fecal")
hb.fecal <- ok.trim.16s@sam_data$sample[sel]

# Extraction blanks
sel <- which(ok.trim.16s@sam_data$sample %in% c(
  "bat1blk",
  "bat4blk",
  "bat5blk",
  "bat6blk",
  "bat7blk",
  "bat10blk",
  "bat11blk",
  "bat12blk",   # extraction blank for both fresh bedding & soil
  "bat15blk",
  "bat16blk"))
hb.ext.blk <- ok.trim.16s@sam_data$sample[sel]


hb_samples <- c(hb.soil,hb.cecal,hb.fecal,hb.ext.blk)
hb_samples
# [1] "H5s1w1"   "L2s1w7"   "Rem29"    "L5s1w1"   "Clr26"    "H2s1w7"   "L2s2w7"   "H5s2w1"   "H2s2w7"   "L5s2w1"  
# [11] "L3s2w7"   "H3s2w7"   "L4s2w1"   "H4s1w1"   "L3s1w7"   "L4s1w1"   "Rem23"    "H3s1w7"   "Rem36"    "L1s2w7"  
# [21] "H6s2w1"   "H1s2w7"   "Clr39"    "L6s2w1"   "H6s1w1"   "Rem13"    "L1s1w7"   "L6s1w1"   "H1s1w7"   "Clr16"   
# [31] "Rem19"    "Clr33"    "H2s2w1"   "L5s2w7"   "Rem16"    "L2s2w1"   "Clr19"    "H5s2w7"   "L5s1w7"   "Rem33"   
# [41] "H2s1w1"   "H5s1w7"   "L2s1w1"   "L4s1w7"   "Clr36"    "H3s1w1"   "H4s1w7"   "L3s1w1"   "Rem39"    "Clr13"   
# [51] "H3s2w1"   "L4s2w7"   "L3s2w1"   "H4s2w7"   "L6s1w7"   "H1s1w1"   "H6s1w7"   "L1s1w1"   "H1s2w1"   "L6s2w7"  
# [61] "L1s2w1"   "Clr23"    "H6s2w7"   "Clr29"    "Rem26"    "L3m2Ce"   "H3m3Ce"   "H3m2Ce"   "L3m3Ce"   "L3m1Ce"  
# [71] "H3m1Ce"   "L2m1Ce"   "C6m2Ce"   "C6m3Ce"   "H2m1Ce"   "L2m2Ce"   "C6m1Ce"   "H2m3Ce"   "H2m2Ce"   "C4m2Ce"  
# [81] "C4m3Ce"   "C4m1Ce"   "C5m1Ce"   "L1m2Ce"   "H1m3Ce"   "H1m2Ce"   "L1m3Ce"   "C5m2Ce"   "L1m1Ce"   "C5m3Ce"  
# [91] "H1m1Ce"   "L4m1Ce"   "H4m1Ce"   "H4m3Ce"   "L4m2Ce"   "L4m3Ce"   "H4m2Ce"   "H5m3Ce"   "L5m2Ce"   "C1m1Ce"  
# [101] "L5m3Ce"   "H5m2Ce"   "L5m1Ce"   "C1m2Ce"   "H5m1Ce"   "C1m3Ce"   "C3m1Ce"   "C3m2Ce"   "C3m3Ce"   "C2m2Ce"  
# [111] "L6m1Ce"   "H6m1Ce"   "C2m3Ce"   "H6m3Ce"   "C2m1Ce"   "L6m2Ce"   "L6m3Ce"   "H6m2Ce"   "L3m3T01"  "L1m3T01" 
# [121] "L5m3T16"  "L3m3T16"  "L1m3T16"  "L5m3T01"  "H3m1T01"  "H1m1T01"  "H5m1T16"  "H3m1T16"  "H1m1T16"  "H5m1T01" 
# [131] "H1m2T01"  "C1m3T16"  "C5m3T01"  "H5m2T16"  "H3m2T01"  "C3m3T16"  "C1m3T01"  "H1m2T16"  "H5m2T01"  "C5m3T16" 
# [141] "C3m3T01"  "H3m2T16"  "L2m2T16"  "L6m2T01"  "L4m2T01"  "L2m2T01"  "L6m2T16"  "L4m2T16"  "L4m1T01"  "L2m1T16" 
# [151] "L6m1T01"  "L4m1T16"  "L2m1T01"  "L6m1T16"  "C2m1T01"  "C6m1T16"  "C4m1T16"  "C2m1T16"  "C6m1T01"  "C4m1T01" 
# [161] "C4m2T16"  "H4m3T01"  "H2m3T16"  "C2m2T01"  "C6m2T16"  "H6m3T01"  "H4m3T16"  "C4m2T01"  "C2m2T16"  "H2m3T01" 
# [171] "H6m3T16"  "C6m2T01"  "L3m2T01"  "L5m2T16"  "L1m2T01"  "L3m2T16"  "L5m2T01"  "L5m1T16"  "L1m1T01"  "L3m1T01" 
# [181] "L5m1T01"  "L1m1T16"  "L3m1T16"  "C3m1T16"  "C5m1T01"  "C1m1T16"  "C3m1T01"  "C5m1T16"  "C1m1T01"  "C5m2T01" 
# [191] "H5m3T16"  "H1m3T01"  "C1m2T16"  "H3m3T01"  "C3m2T16"  "H5m3T01"  "C5m2T16"  "C1m2T01"  "H1m3T16"  "C3m2T01" 
# [201] "H3m3T16"  "L6m3T01"  "L2m3T16"  "L4m3T01"  "L6m3T16"  "L2m3T01"  "L4m3T16"  "H6m1T01"  "H2m1T16"  "H4m1T01" 
# [211] "H6m1T16"  "H2m1T01"  "H4m1T16"  "C4m3T16"  "H4m2T01"  "C6m3T16"  "H6m2T01"  "H2m2T16"  "C2m3T01"  "H4m2T16" 
# [221] "C4m3T01"  "H6m2T16"  "C6m3T01"  "C2m3T16"  "H2m2T01"  "bat11blk" "bat12blk" "bat4blk"  "bat7blk"  "bat1blk" 
# [231] "bat10blk" "bat16blk" "bat15blk" "bat5blk"  "bat6blk"

hb.ok.trim.16s <- prune_samples( sample_names(ok.trim.16s) %in% hb_samples, ok.trim.16s )
hb.ok.trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 3474 taxa and 235 samples ]
# sample_data() Sample Data:       [ 235 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 3474 taxa by 7 taxonomic ranks ]

min(sample_sums(hb.ok.trim.16s)) # 369
min(taxa_sums(hb.ok.trim.16s))   # 0

# prune taxa that have zero sequence reads
hb.ok.trim.16s <- prune_taxa(taxa = taxa_sums(hb.ok.trim.16s) > 0, x = hb.ok.trim.16s)
hb.ok.trim.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 3414 taxa and 235 samples ]
# sample_data() Sample Data:       [ 235 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 3414 taxa by 7 taxonomic ranks ]
min(taxa_sums(hb.ok.trim.16s))   # 1


head(hb.ok.trim.16s@sam_data[ , c("sample","quant_reading", "is.neg")])
# Sample Data:        [6 samples by 3 sample variables]:
#            sample quant_reading is.neg
# L3m2Ce     L3m2Ce         1e+01  FALSE
# L3m3T01   L3m3T01         9e+00  FALSE
# H5s1w1     H5s1w1         1e+01  FALSE
# L2s1w7     L2s1w7         1e+01  FALSE
# L1m3T01   L1m3T01         1e+01  FALSE
# bat11blk bat11blk         1e-03   TRUE


## run 'isContaminant()' function

contamdf.either <- isContaminant(hb.ok.trim.16s, method="either", conc="quant_reading" , neg="is.neg", threshold = 0.1)
# message: Using same threshold value for the frequency and prevalence contaminant identification.
table(contamdf.either$contaminant)
# FALSE  TRUE 
#  3393    21 

head(contamdf.either)
#             freq prev    p.freq     p.prev  p contaminant
# OTU_1 0.05286111  191 0.4949171 0.26549467 NA       FALSE
# OTU_2 0.05297538  201 0.4930715 0.69963695 NA       FALSE
# OTU_3 0.07024606  198 0.4667332 0.34384987 NA       FALSE
# OTU_4 0.02893894  179 0.4520002 0.41270659 NA       FALSE
# OTU_5 0.03054148  192 0.4658868 0.57778180 NA       FALSE
# OTU_6 0.01358069  135 0.5014358 0.07721348 NA        TRUE


hb.taxa.NotContam <- row.names(contamdf.either)[which(contamdf.either$contaminant==FALSE)]
hb.taxa.NotContam <- sort(hb.taxa.NotContam)

hb.taxa.IsContam <- row.names(contamdf.either)[which(contamdf.either$contaminant==TRUE)]
hb.taxa.IsContam <- sort(hb.taxa.IsContam)



### c) pool contaminants and remove from subsequent microbiome data analysis

# contaminants identified from low biomass samples
lb.taxa.IsContam

# contaminants identified from high biomass samples
hb.taxa.IsContam

# Do contaminant taxa overlap?
length(lb.taxa.IsContam) # 726
length(hb.taxa.IsContam) # 21
length(c(lb.taxa.IsContam, hb.taxa.IsContam)) # 747
length(unique(c(lb.taxa.IsContam, hb.taxa.IsContam))) # 744
length(which(lb.taxa.IsContam %in% hb.taxa.IsContam)) # 3 OTUs are overlapping

all.taxa.IsContam <- unique(c(lb.taxa.IsContam,hb.taxa.IsContam))
length(all.taxa.IsContam) # 744


## remove blanks from ok.trim.16s
unique(ok.trim.16s@sam_data$samp_type)
# [1] "soil"               "extraction blank"   "cecal"              "air"               
# [5] "fresh bedding"      "fecal"              "air sampling blank"

clean.16s <- prune_samples(ok.trim.16s@sam_data$samp_type %in% c("soil",
                                                                 "cecal",
                                                                 "air",
                                                                 "fresh bedding",
                                                                 "fecal"),
                           ok.trim.16s)


## then remove contaminant taxa
sel.rm <- which(row.names(clean.16s@otu_table) %in% all.taxa.IsContam) # qty 744
keep_taxa <- row.names(clean.16s@otu_table)[-sel.rm]

clean.16s <- prune_taxa(clean.16s, taxa = keep_taxa )
clean.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 303 samples ]
# sample_data() Sample Data:       [ 303 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(clean.16s)) # 345
sort(sample_sums(clean.16s))[1:10]
# bfre1T5 bfre1T13  bfre3T5   L3a2w6   H6a2w6   C2a2w6   L3a1w6   L4a1w6   L5a1w6 bfre2T13 
#     345      839      945     1307     1400     1681     1683     1719     1850     2179 

min(taxa_sums(clean.16s)) # 9

#---------------------






#### Ordination of soil only samples
#------------------------

unique(clean.16s@sam_data$samp_type)
# [1] "cecal"         "fecal"         "air"           "soil"          "fresh bedding"

soil.dc.16s <- prune_samples( clean.16s@sam_data$samp_type == "soil", clean.16s )
soil.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 65 samples ]
# sample_data() Sample Data:       [ 65 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(soil.dc.16s)) # 12524
min(taxa_sums(soil.dc.16s)) # 0

# prune taxa that have zero sequence reads
soil.dc.16s <- prune_taxa(taxa = taxa_sums(soil.dc.16s) > 0, x = soil.dc.16s)
soil.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2345 taxa and 65 samples ]
# sample_data() Sample Data:       [ 65 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2345 taxa by 7 taxonomic ranks ]


sort(  sample_sums(soil.dc.16s) )
# H4s1w1  Rem19 H5s2w1 H6s2w1 L3s2w7 H6s1w1 L4s1w1  Rem29 H3s2w7  Rem36  Clr29 L6s1w1  Rem39  Rem13  Rem26 H3s1w1 
# 12524  14024  15241  15297  15550  15768  16289  16455  17208  18208  18381  18655  18872  19844  19869  19882 
# L5s2w1  Clr26 L4s2w1 H1s2w1 L2s2w1 H2s1w1  Rem16 L1s2w1  Clr33  Rem33  Clr36 L2s1w1 L3s2w1  Clr23 L5s1w1  Clr19 
# 20051  20398  20442  20668  20868  21389  22365  22497  22824  24000  24663  24841  25196  25230  25421  25964 
# H2s2w1 H6s2w7 H3s2w1 H4s1w7  Clr39 H1s1w1 L1s1w1 L3s1w1  Clr16 L1s2w7 H1s1w7  Rem23 H1s2w7 H5s1w1 H2s2w7 L6s2w7 
# 26225  26362  28360  28400  28739  29651  33379  33578  33940  34036  35401  36772  38175  38591  41192  43747 
# H3s1w7 H5s2w7 H5s1w7 L4s1w7 H2s1w7 L6s2w1 L5s1w7 L6s1w7 L5s2w7 H4s2w7 L2s1w7 H6s1w7 L2s2w7 L4s2w7 L1s1w7  Clr13 
# 44425  44807  45190  45254  45849  51242  51383  51424  51723  51882  53256  53557  54015  54825  55598  57129 
# L3s1w7 
# 63813  

table( soil.dc.16s@sam_data$samp_type )
# soil 
# 65 

table( soil.dc.16s@sam_data$Treatment )
# High  Low 
#   32   33 

table( soil.dc.16s@sam_data$Time )
# Source soil      Week 1      Week 7 
#          18          23          24 



## Use lowest number of reads to create rarefied dataset

#set.seed(123)
seed <- 123
rare.soil.dc.16s <- rarefy_even_depth(soil.dc.16s, sample.size = min(sample_sums(soil.dc.16s)),
                                      rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.soil.dc.16s) # all 12524
min( taxa_sums(rare.soil.dc.16s) ) # 1

ntaxa(rare.soil.dc.16s) # 2298
nsamples(rare.soil.dc.16s) # 65


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.soil.dc.16s, "NMDS", "bray")

ord
# Call:
#   metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.04704312 
# Stress type 1, weak ties
# No convergent solutions - best solution after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

str(ord)

rare.soil.dc.16s@sam_data$samp_type # all "soil"
rare.soil.dc.16s@sam_data$Time      # either: "Source soil", "Week 1" , "Week 7"

length(unique(rare.soil.dc.16s@sam_data$Treatment)) # 2
length(unique(rare.soil.dc.16s@sam_data$Cage.Name)) # 13
length(unique(rare.soil.dc.16s@sam_data$sample)) # 65


p <- plot_ordination(rare.soil.dc.16s, ord, type="samples", color="Treatment", shape="Time")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Time <- factor(p$data$Time, levels = c("Source soil", "Week 1", "Week 7"), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

shapes <- c("Source soil" = 15, "Week 1" = 16, "Week 7" = 17 )

pp <- p +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes) +
  theme_bw() +
  annotate(geom="text", x= 1.25, y= -0.35, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=1, size=3.25 ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 2)
  )

pp


## Minimise plot size


## default gggplot2 plot margins
theme_bw()$plot.margin
#[1] 5.5pt 5.5pt 5.5pt 5.5pt


pp <- p + geom_point(size=0.8) +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes) +
  theme_bw() +
  #annotate(geom="text", x= 1.40, y= 0.375, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=1, size=2 ) +
  annotate(geom="text", x= 1.40, y= 0.375, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=1, size=2.5 ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 2)
  )+
  #ggtitle("A") +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(face = "bold", hjust = 0),
    plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),   # https://ggplot2.tidyverse.org/reference/element.html
    #http://r-statistics.co/Complete-Ggplot2-Tutorial-Part2-Customizing-Theme-With-R-Code.html#Legend%20Positions
    legend.justification=c(1,0), 
    legend.position=c(0.95, 0.05),
    legend.background = element_blank(),
    legend.key = element_blank(),
    #legend.key.size = unit(2, "pt"),
    legend.key.size = unit(2.5, "pt"),
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    #legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    #legend.margin=margin(t = 5.5, r = 2, b = 5.5, l = 0, unit='pt'),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    #legend.box.spacing = unit(2, "pt"),
    legend.box.spacing = unit(0, "pt"),
    legend.box.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.spacing = unit(4, "pt"),
    
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

pp


grid.text(label = "A" , x = unit(0.075, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Ordination-Soil-only-1xRarefy-A-vFINAL.tiff"), width = 7, height = 5.95, units = "cm", res=600, compression="lzw")





# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.soil.dc.16s <- phyloseq::distance(rare.soil.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.soil.dc.16s))
str(sampledf)


# Adonis test
set.seed(123)
adonis(bray.rare.soil.dc.16s ~ Treatment, data = sampledf)
# Call:
# adonis(formula = bray.rare.soil.dc.16s ~ Treatment, data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Treatment  1    4.3160  4.3160  75.445 0.54495  0.001 ***
# Residuals 63    3.6041  0.0572         0.45505           
# Total     64    7.9201                 1.00000           
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Homogeneity of dispersion test
beta <- betadisper(bray.rare.soil.dc.16s, sampledf$Treatment)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)
# Groups     1 0.00101 0.001008 0.0724    999   0.78
# Residuals 63 0.87728 0.013925 


table(sampledf$Treatment)
# High  Low 
# 32   33 

table(sampledf$Time)
# Source soil      Week 1      Week 7 
#          18          23          24

#
#------------------------






#### Alpha diversity of ALL soils - including merged sample bootstrap resampling
#------------------------

## the function below uses merged-sample bootstrap resampling to estimate the 
## overall site-level alpha diversity.
## merging by group for triplicates (source soils) and duplicates (treatment soils in trays)



## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index
# # # # # # # # # # # #
calc_AlphaDiv_in_parallel_any_merge_no <- function(phy_obj, merge_by ) {
  
  # rarefy
  seed <- 123+j
  r16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)),
                            rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
  
  # merge samples
  merged.r16s <- merge_samples(r16s, group= eval(parse(text= paste0("r16s@sam_data$",merge_by))) )
  
  
  # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied
  seed <- 234+j
  rmr16s <- rarefy_even_depth(merged.r16s, sample.size = min(sample_sums(phy_obj)),
                              rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
  
  # - - - - - - - - - -
  
  ### ALPHA DIVERSITY
  ## calculate Shannon's index
  shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon"))
  
  # export data
  out <- data.frame(sample=shan.rmr16s$data$samples,shannon=shan.rmr16s$data$value,
                    group=shan.rmr16s$data$samples)
  
  out$eff_no <- exp(out$shannon)
  out$calc_type <- "bootstrap"
  
  return(out)
  
}
# # # # # # # # # # # #


## apply this to previously decontaminated dataset: soil.dc.16s

table( soil.dc.16s@sam_data$Time )
# Source soil      Week 1      Week 7 
#          18          23          24

names( soil.dc.16s@sam_data )
# [1] "mouseID"                        "sample"                         "samp_type"                     
# [4] "Treatment"                      "Time"                           "Animal.no"                     
# [7] "Sex"                            "Litter"                         "Date.of.Birth"                 
# [10] "Cage.no"                        "Cage.Name"                      "Rack"                          
# [13] "Ear.notch.ID"                   "Ear.notch.pos"                  "Age_received_20_Aug"           
# [16] "Age_first_soil_exposure_27_Aug" "Rack_position"                  "Sample_or_Control"             
# [19] "is.neg"                         "quant_reading" 

soil.dc.16s@sam_data[ ,c("sample","samp_type","Treatment","Time")]

soil.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2345 taxa and 65 samples ]
# sample_data() Sample Data:       [ 65 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2345 taxa by 7 taxonomic ranks ]

min(sample_sums(soil.dc.16s)) # 12524
min(taxa_sums(soil.dc.16s)) # 1


phy_obj <- soil.dc.16s


phy_obj@sam_data[ ,c("sample","samp_type","Treatment","Time")]

# create grouping variable for each site

phy_obj@sam_data$group <- NA

phy_obj@sam_data$group <- paste0( substring(phy_obj@sam_data$sample,1,2), substring(phy_obj@sam_data$sample,nchar(phy_obj@sam_data$sample)-1,nchar(phy_obj@sam_data$sample)))

# then overwrite with ...

sel <- grep(pattern = "Clr1", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Clr"

sel <- grep(pattern = "Clr2", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Clr"

sel <- grep(pattern = "Clr3", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Clr"

sel <- grep(pattern = "Rem1", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Rem"

sel <- grep(pattern = "Rem2", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Rem"

sel <- grep(pattern = "Rem3", x = phy_obj@sam_data$sample)
phy_obj@sam_data$group[sel] <- "Rem"




head( phy_obj@sam_data[ ,c("sample","samp_type","Treatment","Time","group")] )
# Sample Data:        [6 samples by 5 sample variables]:
#        sample samp_type Treatment        Time group
# H5s1w1 H5s1w1      soil      High      Week 1  H5w1
# L2s1w7 L2s1w7      soil       Low      Week 7  L2w7
# Rem29   Rem29      soil      High Source soil   Rem
# L5s1w1 L5s1w1      soil       Low      Week 1  L5w1
# Clr26   Clr26      soil       Low Source soil   Clr
# H2s1w7 H2s1w7      soil      High      Week 7  H2w7

length(unique(phy_obj@sam_data$group)) # 26
sort(unique(phy_obj@sam_data$group))
# [1] "Clr"  "H1w1" "H1w7" "H2w1" "H2w7" "H3w1" "H3w7" "H4w1" "H4w7" "H5w1" "H5w7" "H6w1" "H6w7" "L1w1" "L1w7"
# [16] "L2w1" "L2w7" "L3w1" "L3w7" "L4w1" "L4w7" "L5w1" "L5w7" "L6w1" "L6w7" "Rem"


# bootstrap outputs will automatically be stored in a list: b_out

# number of bootstrap resamples
B <- 100

# iterate in parallel
cl<-makeCluster( detectCores()-1 ) # detectCores()-1
registerDoParallel(cl) # remember to stopcluster()

b_out <- foreach(j=1:B, .packages=c('phyloseq')) %dopar%
  calc_AlphaDiv_in_parallel_any_merge_no(phy_obj=phy_obj, merge_by="group" )

stopCluster(cl)


length(b_out) # 100
names(b_out[[1]]) # "sample"    "shannon"   "group"     "eff_no"    "calc_type"
dim(b_out[[1]]) # 26  5

b_out[[1]]
# sample  shannon group   eff_no calc_type
# 1     Clr 5.582610   Clr 265.7645 bootstrap
# 2    H1w1 5.888738  H1w1 360.9494 bootstrap
# 3    H1w7 5.922157  H1w7 373.2157 bootstrap
# 4    H2w1 5.852293  H2w1 348.0315 bootstrap
# 5    H2w7 5.866730  H2w7 353.0923 bootstrap
# 6    H3w1 5.773728  H3w1 321.7349 bootstrap
# 7    H3w7 5.908777  H3w7 368.2555 bootstrap
# 8    H4w1 5.835952  H4w1 342.3906 bootstrap
# 9    H4w7 5.910237  H4w7 368.7935 bootstrap
# 10   H5w1 5.905837  H5w1 367.1743 bootstrap
# 11   H5w7 5.914514  H5w7 370.3743 bootstrap
# 12   H6w1 5.809601  H6w1 333.4859 bootstrap
# 13   H6w7 5.872727  H6w7 355.2163 bootstrap
# 14   L1w1 5.543545  L1w1 255.5825 bootstrap
# 15   L1w7 5.163747  L1w7 174.8183 bootstrap
# 16   L2w1 5.496611  L2w1 243.8641 bootstrap
# 17   L2w7 5.103908  L2w7 164.6641 bootstrap
# 18   L3w1 5.475788  L3w1 238.8385 bootstrap
# 19   L3w7 5.194748  L3w7 180.3228 bootstrap
# 20   L4w1 5.450656  L4w1 232.9110 bootstrap
# 21   L4w7 5.140608  L4w7 170.8196 bootstrap
# 22   L5w1 5.565565  L5w1 261.2727 bootstrap
# 23   L5w7 5.144408  L5w7 171.4700 bootstrap
# 24   L6w1 5.338649  L6w1 208.2311 bootstrap
# 25   L6w7 5.090755  L6w7 162.5126 bootstrap
# 26    Rem 5.866909   Rem 353.1557 bootstrap


### 1st calculate diversity from one rarefying step
### then append bootstrap-derived uncertainty...(?)

# rarefy #1
seed <- 123
r1.16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)),
                            rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)
shan.r1.16s <- plot_richness(r1.16s, measures=c("Shannon"))
shan.r1.16s
out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value,
                  group=NA )  #
out$eff_no <- exp(out$shannon) # calculate effective no of species
out$calc_type <- "rarefyx1"
str(out)

## fix "group" variable based on "sample"
out$sample <- as.character(out$sample)
out$group <- paste0( substring(out$sample,1,2), substring(out$sample,nchar(out$sample)-1,nchar(out$sample)))


# then overwrite with ...
sel <- grep(pattern = "Clr1", x = out$sample)
out$group[sel] <- "Clr"

sel <- grep(pattern = "Clr2", x = out$sample)
out$group[sel] <- "Clr"

sel <- grep(pattern = "Clr3", x = out$sample)
out$group[sel] <- "Clr"

sel <- grep(pattern = "Rem1", x = out$sample)
out$group[sel] <- "Rem"

sel <- grep(pattern = "Rem2", x = out$sample)
out$group[sel] <- "Rem"

sel <- grep(pattern = "Rem3", x = out$sample)
out$group[sel] <- "Rem"

str(out)
# 'data.frame':	65 obs. of  5 variables:
# $ sample   : chr  "H5s1w1" "L2s1w7" "Rem29" "L5s1w1" ...
# $ shannon  : num  5.86 5.06 5.8 5.56 5.09 ...
# $ group    : chr  "H5w1" "L2w7" "Rem" "L5w1" ...
# $ eff_no   : num  349 157 331 261 162 ...
# $ calc_type: chr  "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ...

names(out)        # "sample"    "shannon"   "group"     "eff_no"    "calc_type"

names(b_out[[1]]) #  "sample"    "shannon"   "group"     "eff_no"    "calc_type"
dim(b_out[[1]]) # 26  5

temp <- out

for (j in 1:B) {
  temp <- rbind(temp,b_out[[j]])
}

head(temp)
temp[1:50, ]
tail(temp)

names(temp)
# "sample"    "shannon"   "group"     "eff_no"    "calc_type"


temp$Time <- substring(temp$group, nchar(temp$group)-1, nchar(temp$group))
# fix source soils: "em"  "lr"
sel <- which(temp$Time=="em")
temp$Time[sel] <- "Source\nsoil"
sel <- which(temp$Time=="lr")
temp$Time[sel] <- "Source\nsoil"

levels(factor(temp$Time)) # [1] "Source\nsoil" "w1"           "w7"      
temp$Time <- factor(temp$Time, levels = c("Source\nsoil","w1","w7"), labels=c("Source\nsoil","Week 1","Week 7"))

str(temp)
# 'data.frame':	2665 obs. of  6 variables:
# $ sample   : chr  "H5s1w1" "L2s1w7" "Rem29" "L5s1w1" ...
# $ shannon  : num  5.86 5.06 5.8 5.56 5.09 ...
# $ group    : chr  "H5w1" "L2w7" "Rem" "L5w1" ...
# $ eff_no   : num  349 157 331 261 162 ...
# $ calc_type: chr  "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ...
# $ Time     : Factor w/ 3 levels "Source\nsoil",..: 2 3 1 2 1 3 3 2 3 2 ...


levels(factor(temp$group))
# [1] "Clr"  "H1w1" "H1w7" "H2w1" "H2w7" "H3w1" "H3w7" "H4w1" "H4w7" "H5w1" "H5w7" "H6w1" "H6w7" "L1w1" "L1w7"
# [16] "L2w1" "L2w7" "L3w1" "L3w7" "L4w1" "L4w7" "L5w1" "L5w7" "L6w1" "L6w7" "Rem" 


temp$group <- factor(temp$group, levels = c("Clr","Rem",
                                            "L1w1", "L1w7", "L2w1", "L2w7", "L3w1", "L3w7", "L4w1", "L4w7", "L5w1", "L5w7", "L6w1", "L6w7",
                                            "H1w1", "H1w7", "H2w1", "H2w7", "H3w1", "H3w7", "H4w1", "H4w7", "H5w1", "H5w7", "H6w1", "H6w7"
                                            ), ordered = TRUE)

str(temp)



melt.out <- melt(temp,id.vars = c("group","calc_type", "Time"), measure.vars = "eff_no")


## plot

cols <- c("Clr"="#66c2a5",
          "Rem"="#5e4fa2",
          "L1w1"="#66c2a5", "L1w7"="#66c2a5", "L2w1"="#66c2a5", "L2w7"="#66c2a5", "L3w1"="#66c2a5", "L3w7"="#66c2a5", "L4w1"="#66c2a5", "L4w7"="#66c2a5", "L5w1"="#66c2a5", "L5w7"="#66c2a5", "L6w1"="#66c2a5", "L6w7"="#66c2a5",
          "H1w1"="#5e4fa2", "H1w7"="#5e4fa2", "H2w1"="#5e4fa2", "H2w7"="#5e4fa2", "H3w1"="#5e4fa2", "H3w7"="#5e4fa2", "H4w1"="#5e4fa2", "H4w7"="#5e4fa2", "H5w1"="#5e4fa2", "H5w7"="#5e4fa2", "H6w1"="#5e4fa2", "H6w7"="#5e4fa2",
          "Low" = "#66c2a5", "High" = "#5e4fa2")

shapes <- c("Clr"=15,
          "Rem"=15,
          "L1w1"=16, "L1w7"=17, "L2w1"=16, "L2w7"=17, "L3w1"=16, "L3w7"=17, "L4w1"=16, "L4w7"=17, "L5w1"=16, "L5w7"=17, "L6w1"=16, "L6w7"=17,
          "H1w1"=16, "H1w7"=17, "H2w1"=16, "H2w7"=17, "H3w1"=16, "H3w7"=17, "H4w1"=16, "H4w7"=17, "H5w1"=16, "H5w7"=17, "H6w1"=16, "H6w7"=17
          )

# test plot
p <- ggplot(data=melt.out, aes(x=group, value)) + 
  #ggtitle("b") +
  geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = group) ) + scale_colour_manual(values = cols) +
  #geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1  ) +
  geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], aes(shape = group) , color="grey40", size=1 ) +
  scale_shape_manual(values = shapes) +
  #theme_bw() +
  theme_classic() +
  #theme(axis.text.x  = element_text(angle=90, vjust=0.5) ) +
  labs(x = NULL, y = "Effective OTUs (count)") +
  theme(legend.position="none") +
  #facet_wrap(~Time, scales="free_x")
  facet_grid(~Time, scales="free_x", space="free_x")
p



levels(melt.out$group)
# [1] "Clr"  "Rem"  "L1w1" "L1w7" "L2w1" "L2w7" "L3w1" "L3w7" "L4w1" "L4w7" "L5w1" "L5w7" "L6w1" "L6w7" "H1w1" "H1w7" "H2w1"
# [18] "H2w7" "H3w1" "H3w7" "H4w1" "H4w7" "H5w1" "H5w7" "H6w1" "H6w7"
substring(levels(melt.out$group),1,2)
# [1] "Cl" "Re" "L1" "L1" "L2" "L2" "L3" "L3" "L4" "L4" "L5" "L5" "L6" "L6" "H1" "H1" "H2" "H2" "H3" "H3" "H4" "H4" "H5" "H5"
# [25] "H6" "H6"


## Indicate significant differences...?
sel <- which(melt.out$calc_type == "rarefyx1")
rare1 <- melt.out[sel, ]
rare1$group2 <- substring(rare1$group, 1,1 )
str(rare1)
rare1$group2 <- factor(rare1$group2, levels = c("C", "H", "L", "R"), labels = c("Clr", "High", "Low", "Rem"))

## remove outliers before statistical test
#bp <- boxplot(rare1$value ~ rare1$group2 + rare1$Time)
bp <- boxplot(value ~ group2 + Time , data = rare1)

sel <- which(rare1$value %in% bp$out)
rare1$value[sel] # 308.15276 108.65263  25.05333
rare1[sel,]
#    group calc_type         Time variable     value group2
# 33  H2w1  rarefyx1       Week 1   eff_no 308.15276   High
# 35   Rem  rarefyx1 Source\nsoil   eff_no 108.65263    Rem
# 50   Clr  rarefyx1 Source\nsoil   eff_no  25.05333    Clr

## exclude outliers from Kruskal-Wallis significance testing
rare1_exout <- rare1[-sel, ]


## Mann-Whitney-Wilcoxon Test
wilcox.test(value ~ group2, data=rare1_exout[which(rare1_exout$group2 %in% c("Clr","Rem")), ])
# Wilcoxon rank sum test
# data:  value by group2
# W = 6, p-value = 0.004662
# alternative hypothesis: true location shift is not equal to 0

table( rare1_exout[which(rare1_exout$group2 %in% c("Clr","Rem")),  ]$group2)
# Clr High  Low  Rem 
#   8    0    0    8

wilcox.test(value ~ group2, data=rare1_exout[which(rare1_exout$group2 %in% c("High","Low") & rare1_exout$Time=="Week 1"), ])
# Wilcoxon rank sum test
# data:  value by group2
# W = 120, p-value = 3.093e-06
# alternative hypothesis: true location shift is not equal to 0

table( rare1_exout[which(rare1_exout$group2 %in% c("High","Low") & rare1_exout$Time=="Week 1"), ]$group2)
# Clr High  Low  Rem 
#   0   10   12    0 

wilcox.test(value ~ group2, data=rare1_exout[which(rare1_exout$group2 %in% c("High","Low") & rare1_exout$Time=="Week 7"), ])
# Wilcoxon rank sum test
# data:  value by group2
# W = 144, p-value = 7.396e-07
# alternative hypothesis: true location shift is not equal to 0

table( rare1_exout[which(rare1_exout$group2 %in% c("High","Low") & rare1_exout$Time=="Week 7"), ]$group2)
# Clr High  Low  Rem 
# 0   12   12    0 

hist(melt.out$value)
summary(melt.out$value)

# create arrow annotation, "Low" = "#f46d43", "High" = "#5e4fa2"
anno.arrow <- data.frame(x_start = c(1.5, 1, 7 , 1   ), x_end = c(3, 6, 12 , 6  ), 
                         y_start = c(260, 350, 210 , 350   ), y_end = c(250, 350, 180, 355  ), 
                         arrow_colors = c("Low", "High", "Low", "High"),         # add these to cols vector: "Low" = "#f46d43", "High" = "#5e4fa2"
                         Time = c("Source\nsoil", "Week 1","Week 1", "Week 7"))

# also add significance annotation
annotation_df <- data.frame(
  Time = c("Source\nsoil", "Week 1", "Week 7"),
  start=c("Clr","L1w1","L1w7"),
  end = c("Rem","H6w1","H6w7"),
  y = c(408, 408, 408),
  label = c("**","***","***"), stringsAsFactors = FALSE)
str(annotation_df)
# ensure no conflict in factor levels 
annotation_df$start <- factor(annotation_df$start, levels = c("Clr","Rem",
                                                              "L1w1", "L1w7", "L2w1", "L2w7", "L3w1", "L3w7", "L4w1", "L4w7", "L5w1", "L5w7", "L6w1", "L6w7",
                                                              "H1w1", "H1w7", "H2w1", "H2w7", "H3w1", "H3w7", "H4w1", "H4w7", "H5w1", "H5w7", "H6w1", "H6w7"
                                          ), ordered = TRUE)
annotation_df$end <- factor(annotation_df$end, levels = c("Clr","Rem",
                                                              "L1w1", "L1w7", "L2w1", "L2w7", "L3w1", "L3w7", "L4w1", "L4w7", "L5w1", "L5w7", "L6w1", "L6w7",
                                                              "H1w1", "H1w7", "H2w1", "H2w7", "H3w1", "H3w7", "H4w1", "H4w7", "H5w1", "H5w7", "H6w1", "H6w7"
), ordered = TRUE)



pp <- #p + 
  
  ggplot(data=melt.out, aes(x=group, value)) + 
  #ggtitle("b") +
  geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = group) ) + scale_colour_manual(values = cols) +
  geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], aes(shape = group) , color="grey40", size=1 ) +
  scale_shape_manual(values = shapes) +
  theme_classic() +
  labs(x = NULL, y = "Effective OTUs (count)") +
  theme(legend.position="none") +
  facet_grid(~Time, scales="free_x", space="free_x") +
  
  scale_x_discrete(labels=c(
    "Clr",  "Rem",
    "L1w1"="L1", "L1w7"="L1", "L2w1"="L2", "L2w7"="L2", "L3w1"="L3", "L3w7"="L3", "L4w1"="L4", "L4w7"="L4", "L5w1"="L5", "L5w7"="L5", "L6w1"="L6", "L6w7"="L6",
    "H1w1"="H1", "H1w7"="H1", "H2w1"="H2", "H2w7"="H2", "H3w1"="H3", "H3w7"="H3", "H4w1"="H4", "H4w7"="H4", "H5w1"="H5", "H5w7"="H5", "H6w1"="H6", "H6w7"="H6"
    
  )) +
  
  ## add arrows - based on this post: https://www.r-bloggers.com/adding-different-annotation-to-each-facet-in-ggplot/
  geom_segment(data = anno.arrow, aes(x = x_start, y = y_start, xend = x_end, yend = y_end, colour = arrow_colors),   #, colour = arrow_colors
               arrow = arrow(length = unit(5, "pt"))  ) +

  geom_signif(data=annotation_df,
              aes(xmin=start, xmax=end, annotations=label, y_position=y),
              textsize = 3.5, size = 0.5, tip_length = 0, margin_top = 0, vjust = 0.4 ,               # vjust = -0.2
              manual=TRUE) + # , size=0.5
  
  ylim(0,415) +
  
  theme(
    #plot.title=element_text(face = "bold", hjust = -0.18),  # adjust hjust by trial-and-error
    #plot.margin = margin(t = 2, r = 2, b = 2, l = 15, "pt"),
    #plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    plot.margin = margin(t = 0, r = 0, b = 0, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    # axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), angle=60, vjust=0.5, size = rel(0.6)),
    # axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.5)),
    # axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), angle=60, vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    #strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"))
    #strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6))
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7))
    
  )
pp





## change panel widths
## https://stackoverflow.com/questions/31572239/set-space-in-facet-wrap-like-in-facet-grid

# Get the ggplot grob
gt <- ggplotGrob(pp)

# Check for the widths - you need to change the two that are set to 1null
gt$widths
# [1] 5pt                0cm                 1grobwidth          0.642896862460219cm 2.6null            
# [6] 5.5pt               12.2null            5.5pt               12.2null            0cm                
# [11] 0cm                 0pt                 2pt                

# Change panel widths
gt$widths[5] # 2.6null
gt$widths[7] # 12.2null
gt$widths[9] # 12.2null

# Replace the default widths with relative widths:
gt$widths[5] <- unit(4.2, "null")

# Draw the plot
grid.newpage()
grid.draw(gt)



grid.text(label = "B" , x = unit(0.035, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Alpha-diversity-Source-ALL-soils-with-merged-sample-bootstrap-with-Sig-B-vFINAL2b-Edit-Panel-Widths.tiff"), width = 8.57, height = 5.95, units = "cm", res=600, compression="lzw")


#------------------------





#### Ordination of air and fresh bedding (low biomass) samples
#------------------------

unique(clean.16s@sam_data$samp_type)
#  "cecal"         "fecal"         "air"           "soil"          "fresh bedding"    

lowb.dc.16s <- prune_samples(clean.16s@sam_data$samp_type %in% c("air","fresh bedding"), clean.16s)
lowb.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 78 samples ]
# sample_data() Sample Data:       [ 78 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(taxa_sums(lowb.dc.16s)) # 0

# prune taxa that have zero sequence reads
lowb.dc.16s <- prune_taxa(taxa = taxa_sums(lowb.dc.16s) > 0, x = lowb.dc.16s)
lowb.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2073 taxa and 78 samples ]
# sample_data() Sample Data:       [ 78 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2073 taxa by 7 taxonomic ranks ]

min(taxa_sums(lowb.dc.16s)) # 2
min(sample_sums(lowb.dc.16s)) # 345

sort(  sample_sums(lowb.dc.16s) )
# bfre1T5 bfre1T13  bfre3T5   L3a2w6   H6a2w6   C2a2w6   L3a1w6   L4a1w6   L5a1w6 bfre2T13   L2a1w6   L2a2w6 
# 345      839      945     1307     1400     1681     1683     1719     1850     2179     2254     2289 
# L5a2w6   H3a1w6   H4a2w6   H6a1w6   L1a1w6  bfre2T5   H4a1w6   C3a1w1 bfre3T13   L6a2w6   C2a1w6   H1a2w6 
# 2722     2867     3024     3370     3374     3459     3577     3584     3646     3902     3917     4950 
# H3a2w6   C6a1w6   C5a1w6   H1a1w6   C6a2w6   H4a2w1   L4a2w6   C4a1w6   L6a1w6   L1a2w6   C4a2w6   L5a2w1 
# 5110     5419     6900     6997     7738     8409     8563     8798     9648     9919    10442    10463 
# H2a2w1   C3a2w6   H4a1w1   H6a2w1   H5a1w1   C1a1w6   C1a2w6   H5a2w6   L6a2w1   H5a2w1   H2a1w1   C5a1w1 
# 10712    11767    13535    14330    14831    15195    15677    16244    16250    16953    17243    17636 
# H2a2w6   C2a1w1   L1a2w1   C6a1w1   H1a2w1   C4a2w1   L1a1w1   H1a1w1   C3a1w6   C4a1w1   C6a2w1   H3a2w1 
# 18218    19387    21463    24075    24373    26899    27130    27239    29960    30520    30695    31116 
# H5a1w6   L6a1w1   L4a2w1   L5a1w1   C1a2w1   C5a2w1   L2a2w1   L2a1w1   C2a2w1   C5a2w6   H2a1w6   H3a1w1 
# 31405    31626    33465    33470    33612    34760    34876    35718    36100    36880    40548    46030 
# L3a2w1   L3a1w1   H6a1w1   C1a1w1   L4a1w1   C3a2w1 
# 55905    56460    61540    72623    81978    86863 

## use samples above sequence read threshold of 800

lowb.dc.16s <- prune_samples( samples = sample_sums(lowb.dc.16s) >= 800 , x = lowb.dc.16s )
lowb.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2073 taxa and 77 samples ]
# sample_data() Sample Data:       [ 77 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2073 taxa by 7 taxonomic ranks ]


table( lowb.dc.16s@sam_data$samp_type )
# air  fresh bedding 
#  72              5 

table( lowb.dc.16s@sam_data$Treatment )
# Control    High     Low 
#      24      24      24 

table( lowb.dc.16s@sam_data$Time )
# Week 1 Week 2 Week 6 
#     36      2     39 


## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.lowb.dc.16s <- rarefy_even_depth(lowb.dc.16s, sample.size = min(sample_sums(lowb.dc.16s)),
                                      rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.lowb.dc.16s) # all 839
min(taxa_sums(rare.lowb.dc.16s)) # 1
ntaxa(rare.lowb.dc.16s) # 1743
nsamples(rare.lowb.dc.16s) # 77


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.lowb.dc.16s, "NMDS", "bray")

ord
# Call:
# metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.1460756 
# Stress type 1, weak ties
# No convergent solutions - best solution after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 

str(ord)

rare.lowb.dc.16s@sam_data$samp_type
rare.lowb.dc.16s@sam_data$Time     


length(unique(rare.lowb.dc.16s@sam_data$samp_type )) # 2
length(unique(rare.lowb.dc.16s@sam_data$Cage.Name )) # 13

p <- plot_ordination(rare.lowb.dc.16s, ord, type="samples", color="Treatment", shape="samp_type")
p
str(p)
#p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
#p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

head(p$data)
p$data$Treatment <- as.character(p$data$Treatment)
sel <- which(is.na( p$data$Treatment))
p$data$Treatment[sel] <- "N/A"
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High","N/A"), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2", "N/A" = "#999999"  ) # orange, aqua, dark-blue

shapes <- c(1,2)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols, name="Treatment") +
  scale_shape_manual(values = shapes, name="Sample type") +
  #scale_fill_manual(values = cols) +
  #geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  annotate(geom="text", x= -2.5, y= -1.5, label = paste0("Stress = ",round(ord$stress,5)), hjust=0, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 2)
  )

pp




## Minimise plot size & display week number

sel <- which(p$data$samp_type == "fresh bedding")
p$data$samp_type[sel] <- "fresh\nbedding"

pp <- p + theme_bw() +
  #ggtitle("C") +
  geom_point() + geom_text(label = substr(p$data$Time,start=6,stop=6), size = 2 ,hjust = 0, nudge_x = 0.15) +
  #geom_point() + geom_text(label = substr(p$data$Time,start=6,stop=6), size = 1.8 ,hjust = 0, nudge_x = 0.15) +
  scale_colour_manual(values = cols, name="Treatment") +
  scale_shape_manual(values = shapes, name="Sample\ntype") +
  #annotate(geom="text", x= 2.4, y= -2.5, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=0, size= 2.5 ) +  # size=rel(0.8)
  #annotate(geom="text", x= 2.5, y= -2.5, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=0, size= 2 ) +  # size=rel(0.8)
  annotate(geom="text", x= 2.5, y= -2.5, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=0, size= 2.5 ) +  # size=rel(0.8)
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 2)
  ) +
  labs(x = NULL, y = NULL) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(face = "bold", hjust = 0),  # adjust hjust by trial-and-error
    plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),  # l = 10

    #legend.key.size = unit(2, "pt"),
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    #legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.key.size = unit(2.5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    legend.box.spacing = unit(2, "pt"),
    legend.box.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.spacing = unit(4, "pt"),
    
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
    
  )

pp


grid.text(label = "C" , x = unit(0.075, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Ordination-Air-FreshBedding-withWeekNo-1xRarefy-by-Treatment-and-Sample-Type-C-vFINAL.tiff"), width = 7, height = 5.95, units = "cm", res=600, compression="lzw")




table(p$data$Treatment)
# Control     Low    High     N/A 
#      24      24      24       5

table(p$data$Time)
# Week 1 Week 2 Week 6 
#     36      2     39


# Test hypothesis that microbiota vary (with different centroids) by Treatment

# exclude fresh bedding from analysis

sel <- which( is.na(rare.lowb.dc.16s@sam_data$Treatment) ) # qty 5
keep_samples <- sample_names(rare.lowb.dc.16s)[-sel]

rare.lowb.dc.16s.exclbfre <- prune_samples(samples = keep_samples, x = rare.lowb.dc.16s)

min( sample_sums(rare.lowb.dc.16s.exclbfre) ) # 839
min( taxa_sums(rare.lowb.dc.16s.exclbfre) ) # 0

# prune taxa that have zero sequence reads
rare.lowb.dc.16s.exclbfre <- prune_taxa(taxa = taxa_sums(rare.lowb.dc.16s.exclbfre) > 0, x = rare.lowb.dc.16s.exclbfre)
rare.lowb.dc.16s.exclbfre
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 1735 taxa and 72 samples ]
# sample_data() Sample Data:       [ 72 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 1735 taxa by 7 taxonomic ranks ]


# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.lowb.dc.16s.exclbfre <- phyloseq::distance(rare.lowb.dc.16s.exclbfre, method = "bray")
sampledf <- data.frame(sample_data(rare.lowb.dc.16s.exclbfre))
str(sampledf)


# # Adonis test
# set.seed(123)
# adonis(bray.rare.lowb.dc.16s.exclbfre ~ Treatment, data = sampledf)
# # Call:
# # adonis(formula = bray.rare.lowb.dc.16s.exclbfre ~ Treatment,      data = sampledf) 
# # 
# # Permutation: free
# # Number of permutations: 999
# # 
# # Terms added sequentially (first to last)
# # 
# #           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# # Treatment  2    2.8957 1.44787  6.6488 0.16158  0.001 ***
# # Residuals 69   15.0258 0.21776         0.83842           
# # Total     71   17.9215                 1.00000           
# # ---
# # Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# Adonis test
set.seed(123)
adonis(bray.rare.lowb.dc.16s.exclbfre ~ Treatment + Time , data = sampledf)
# Call:
# adonis(formula = bray.rare.lowb.dc.16s.exclbfre ~ Treatment +      Time, data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Treatment  2    2.8957 1.44787  7.2118 0.16158  0.001 ***
# Time       1    1.3737 1.37374  6.8425 0.07665  0.001 ***
# Residuals 68   13.6520 0.20077         0.76177           
# Total     71   17.9215                 1.00000           
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# Homogeneity of dispersion test
beta <- betadisper(bray.rare.lowb.dc.16s.exclbfre, sampledf$Treatment)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df  Sum Sq Mean Sq      F N.Perm Pr(>F)    
# Groups     2 0.75066 0.37533 47.496    999  0.001 ***
# Residuals 69 0.54526 0.00790                         
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#
#------------------------





#### Alpha diversity of air samples- including merged sample bootstrap resampling
#------------------------

## the function below uses merged-sample bootstrap resampling to estimate the 
## overall site-level alpha diversity.
## merging by group for triplicates (source soils) and duplicates (treatment soils in trays)



## create function to rarefy for initial sample > merge samples by type > rarefy again > calculate shannon's index
# # # # # # # # # # # #
calc_AlphaDiv_in_parallel_any_merge_no <- function(phy_obj, merge_by ) {
  
  # rarefy
  seed <- 123+j
  r16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)),
                            rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
  
  # merge samples
  merged.r16s <- merge_samples(r16s, group= eval(parse(text= paste0("r16s@sam_data$",merge_by))) )
  
  
  # rarefy back to min sample size for this bootstrap sample - i.e. rarefied-merged-rarefied
  seed <- 234+j
  rmr16s <- rarefy_even_depth(merged.r16s, sample.size = min(sample_sums(phy_obj)),
                              rngseed = seed, replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
  
  # - - - - - - - - - -
  
  ### ALPHA DIVERSITY
  ## calculate Shannon's index
  shan.rmr16s <- plot_richness(rmr16s, measures=c("Shannon"))
  
  # export data
  out <- data.frame(sample=shan.rmr16s$data$samples,shannon=shan.rmr16s$data$value,
                    group=shan.rmr16s$data$samples)
  
  out$eff_no <- exp(out$shannon)
  out$calc_type <- "bootstrap"
  
  return(out)
  
}
# # # # # # # # # # # #


## apply this to previously decontaminated dataset: lowb.dc.16s

air.dc.16s <- prune_samples(lowb.dc.16s@sam_data$samp_type == "air", lowb.dc.16s)
air.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2073 taxa and 72 samples ]
# sample_data() Sample Data:       [ 72 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2073 taxa by 7 taxonomic ranks ]

min(taxa_sums(air.dc.16s)) # 1
min(sample_sums(air.dc.16s)) # 1307
sort(  sample_sums(air.dc.16s) )
# L3a2w6 H6a2w6 C2a2w6 L3a1w6 L4a1w6 L5a1w6 L2a1w6 L2a2w6 L5a2w6 H3a1w6 H4a2w6 H6a1w6 L1a1w6 H4a1w6 C3a1w1 L6a2w6 
# 1307   1400   1681   1683   1719   1850   2254   2289   2722   2867   3024   3370   3374   3577   3584   3902 
# C2a1w6 H1a2w6 H3a2w6 C6a1w6 C5a1w6 H1a1w6 C6a2w6 H4a2w1 L4a2w6 C4a1w6 L6a1w6 L1a2w6 C4a2w6 L5a2w1 H2a2w1 C3a2w6 
# 3917   4950   5110   5419   6900   6997   7738   8409   8563   8798   9648   9919  10442  10463  10712  11767 
# H4a1w1 H6a2w1 H5a1w1 C1a1w6 C1a2w6 H5a2w6 L6a2w1 H5a2w1 H2a1w1 C5a1w1 H2a2w6 C2a1w1 L1a2w1 C6a1w1 H1a2w1 C4a2w1 
# 13535  14330  14831  15195  15677  16244  16250  16953  17243  17636  18218  19387  21463  24075  24373  26899 
# L1a1w1 H1a1w1 C3a1w6 C4a1w1 C6a2w1 H3a2w1 H5a1w6 L6a1w1 L4a2w1 L5a1w1 C1a2w1 C5a2w1 L2a2w1 L2a1w1 C2a2w1 C5a2w6 
# 27130  27239  29960  30520  30695  31116  31405  31626  33465  33470  33612  34760  34876  35718  36100  36880 
# H2a1w6 H3a1w1 L3a2w1 L3a1w1 H6a1w1 C1a1w1 L4a1w1 C3a2w1 
# 40548  46030  55905  56460  61540  72623  81978  86863

table( air.dc.16s@sam_data$Time )
# Week 1 Week 6 
#     36     36

table( air.dc.16s@sam_data$Treatment )
# Control    High     Low 
#      24      24      24 

names( air.dc.16s@sam_data )
# [1] "mouseID"                        "sample"                         "samp_type"                     
# [4] "Treatment"                      "Time"                           "Animal.no"                     
# [7] "Sex"                            "Litter"                         "Date.of.Birth"                 
# [10] "Cage.no"                        "Cage.Name"                      "Rack"                          
# [13] "Ear.notch.ID"                   "Ear.notch.pos"                  "Age_received_20_Aug"           
# [16] "Age_first_soil_exposure_27_Aug" "Rack_position"                  "Sample_or_Control"             
# [19] "is.neg"                         "quant_reading" 

air.dc.16s@sam_data[ ,c("sample","samp_type","Treatment","Time")]


phy_obj <- air.dc.16s


phy_obj@sam_data[ ,c("sample","samp_type","Treatment","Time")]

# create grouping variable for each site

phy_obj@sam_data$group <- NA

phy_obj@sam_data$group <- paste0( substring(phy_obj@sam_data$sample,1,2), substring(phy_obj@sam_data$sample,nchar(phy_obj@sam_data$sample)-1,nchar(phy_obj@sam_data$sample)))



head( phy_obj@sam_data[ ,c("sample","samp_type","Treatment","Time","group")] )
# Sample Data:        [6 samples by 5 sample variables]:
#        sample samp_type Treatment   Time group
# C5a1w1 C5a1w1       air   Control Week 1  C5w1
# L1a2w1 L1a2w1       air       Low Week 1  L1w1
# C3a1w6 C3a1w6       air   Control Week 6  C3w6
# H1a2w1 H1a2w1       air      High Week 1  H1w1
# C3a2w6 C3a2w6       air   Control Week 6  C3w6
# C5a2w1 C5a2w1       air   Control Week 1  C5w1

length(unique(phy_obj@sam_data$group)) # 36
sort(unique(phy_obj@sam_data$group))
# [1] "C1w1" "C1w6" "C2w1" "C2w6" "C3w1" "C3w6" "C4w1" "C4w6" "C5w1" "C5w6" "C6w1" "C6w6"
# [13] "H1w1" "H1w6" "H2w1" "H2w6" "H3w1" "H3w6" "H4w1" "H4w6" "H5w1" "H5w6" "H6w1" "H6w6"
# [25] "L1w1" "L1w6" "L2w1" "L2w6" "L3w1" "L3w6" "L4w1" "L4w6" "L5w1" "L5w6" "L6w1" "L6w6"


# bootstrap outputs will automatically be stored in a list: b_out

# number of bootstrap resamples
B <- 100

# iterate in parallel
cl<-makeCluster( detectCores()-1 ) # detectCores()-1
registerDoParallel(cl) # remember to stopcluster()

b_out <- foreach(j=1:B, .packages=c('phyloseq')) %dopar%
  calc_AlphaDiv_in_parallel_any_merge_no(phy_obj=phy_obj, merge_by="group" )

stopCluster(cl)

getwd() # "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling"
saveRDS(b_out, file = "b_out__alpha_diversity_AIR.RDS")


length(b_out) # 100
names(b_out[[1]]) # "sample"    "shannon"   "group"     "eff_no"    "calc_type"
dim(b_out[[1]]) # 36  5

b_out[[1]]
#    sample  shannon group    eff_no calc_type
# 1    C1w1 4.246433  C1w1  69.85580 bootstrap
# 2    C1w6 3.670383  C1w6  39.26693 bootstrap
# 3    C2w1 3.746987  C2w1  42.39316 bootstrap
# 4    C2w6 3.481686  C2w6  32.51448 bootstrap
# 5    C3w1 3.458858  C3w1  31.78065 bootstrap
# 6    C3w6 3.529948  C3w6  34.12219 bootstrap
# 7    C4w1 3.761194  C4w1  42.99975 bootstrap
# 8    C4w6 3.903370  C4w6  49.56919 bootstrap
# 9    C5w1 3.690483  C5w1  40.06419 bootstrap
# 10   C5w6 3.846638  C5w6  46.83533 bootstrap
# 11   C6w1 3.672844  C6w1  39.36371 bootstrap
# 12   C6w6 3.841666  C6w6  46.60305 bootstrap
# 13   H1w1 5.494804  H1w1 243.42381 bootstrap
# 14   H1w6 3.593221  H1w6  36.35097 bootstrap
# 15   H2w1 5.402024  H2w1 221.85507 bootstrap
# 16   H2w6 3.657781  H2w6  38.77521 bootstrap
# 17   H3w1 5.377068  H3w1 216.38684 bootstrap
# 18   H3w6 4.046765  H3w6  57.21209 bootstrap
# 19   H4w1 5.435367  H4w1 229.37712 bootstrap
# 20   H4w6 3.928525  H4w6  50.83196 bootstrap
# 21   H5w1 4.995722  H5w1 147.77965 bootstrap
# 22   H5w6 3.696729  H5w6  40.31524 bootstrap
# 23   H6w1 5.069572  H6w1 159.10628 bootstrap
# 24   H6w6 4.568162  H6w6  96.36681 bootstrap
# 25   L1w1 5.304356  L1w1 201.21142 bootstrap
# 26   L1w6 3.773489  L1w6  43.53168 bootstrap
# 27   L2w1 5.325298  L2w1 205.46955 bootstrap
# 28   L2w6 3.776049  L2w6  43.64328 bootstrap
# 29   L3w1 5.481867  L3w1 240.29487 bootstrap
# 30   L3w6 3.987824  L3w6  53.93742 bootstrap
# 31   L4w1 4.647823  L4w1 104.35756 bootstrap
# 32   L4w6 3.846751  L4w6  46.84065 bootstrap
# 33   L5w1 5.195952  L5w1 180.53999 bootstrap
# 34   L5w6 3.507482  L5w6  33.36415 bootstrap
# 35   L6w1 5.241539  L6w1 188.96064 bootstrap
# 36   L6w6 4.300082  L6w6  73.70586 bootstrap


### 1st calculate diversity from one rarefying step
### then append bootstrap-derived uncertainty...

# rarefy #1
seed <- 123
r1.16s <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)),
                            rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)
shan.r1.16s <- plot_richness(r1.16s, measures=c("Shannon"))
shan.r1.16s
out <- data.frame(sample=shan.r1.16s$data$samples,shannon=shan.r1.16s$data$value,
                  group=NA )  #
out$eff_no <- exp(out$shannon) # calculate effective no of species
out$calc_type <- "rarefyx1"
str(out)

## fix "group" variable based on "sample"
out$sample <- as.character(out$sample)
out$group <- paste0( substring(out$sample,1,2), substring(out$sample,nchar(out$sample)-1,nchar(out$sample)))


str(out)
# 'data.frame':	72 obs. of  5 variables:
# $ sample   : chr  "C5a1w1" "L1a2w1" "C3a1w6" "H1a2w1" ...
# $ shannon  : num  3.62 4.98 3.44 5.22 3.52 ...
# $ group    : chr  "C5w1" "L1w1" "C3w6" "H1w1" ...
# $ eff_no   : num  37.5 145.7 31.1 185.4 33.7 ...
# $ calc_type: chr  "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ...

names(out)        # "sample"    "shannon"   "group"     "eff_no"    "calc_type"

names(b_out[[1]]) #  "sample"    "shannon"   "group"     "eff_no"    "calc_type"
dim(b_out[[1]]) # 36  5

temp <- out

for (j in 1:B) {
  temp <- rbind(temp,b_out[[j]])
}

head(temp)
temp[1:50, ]
tail(temp)

names(temp)
# "sample"    "shannon"   "group"     "eff_no"    "calc_type"

levels(factor(temp$group))
# [1] "C1w1" "C1w6" "C2w1" "C2w6" "C3w1" "C3w6" "C4w1" "C4w6" "C5w1" "C5w6" "C6w1" "C6w6"
# [13] "H1w1" "H1w6" "H2w1" "H2w6" "H3w1" "H3w6" "H4w1" "H4w6" "H5w1" "H5w6" "H6w1" "H6w6"
# [25] "L1w1" "L1w6" "L2w1" "L2w6" "L3w1" "L3w6" "L4w1" "L4w6" "L5w1" "L5w6" "L6w1" "L6w6"

temp$group <- factor(temp$group, levels = c(
  "C1w1", "C1w6", "C2w1", "C2w6", "C3w1", "C3w6", "C4w1", "C4w6", "C5w1", "C5w6", "C6w1", "C6w6",
  "L1w1", "L1w6", "L2w1", "L2w6", "L3w1", "L3w6", "L4w1", "L4w6", "L5w1", "L5w6", "L6w1", "L6w6",
  "H1w1", "H1w6", "H2w1", "H2w6", "H3w1", "H3w6", "H4w1", "H4w6", "H5w1", "H5w6", "H6w1", "H6w6"
  ), ordered = TRUE)


temp$Time <- substring(temp$group, first = nchar(as.character(temp$group))-1, last = nchar(as.character(temp$group)))
temp$Time <- factor(temp$Time, levels = c("w1", "w6"), labels = c("Week 1", "Week 6"))

str(temp)
# 'data.frame':	3672 obs. of  6 variables:
# $ sample   : chr  "C5a1w1" "L1a2w1" "C3a1w6" "H1a2w1" ...
# $ shannon  : num  3.62 4.98 3.44 5.22 3.52 ...
# $ group    : Ord.factor w/ 36 levels "C1w1"<"C1w6"<..: 9 13 6 25 6 9 13 25 24 4 ...
# $ eff_no   : num  37.5 145.7 31.1 185.4 33.7 ...
# $ calc_type: chr  "rarefyx1" "rarefyx1" "rarefyx1" "rarefyx1" ...
# $ Time     : Factor w/ 2 levels "Week 1","Week 6": 1 1 2 1 2 1 1 1 2 2 ...

temp.air.alpha <- temp

melt.out <- melt(temp,id.vars = c("group","calc_type", "Time"), measure.vars = "eff_no")


## plot

#cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

cols <- c(
  "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2",
  "C1w1"="#f46d43", "C1w6"="#f46d43", "C2w1"="#f46d43", "C2w6"="#f46d43", "C3w1"="#f46d43", "C3w6"="#f46d43", "C4w1"="#f46d43", "C4w6"="#f46d43", "C5w1"="#f46d43", "C5w6"="#f46d43", "C6w1"="#f46d43", "C6w6"="#f46d43",
  "L1w1"="#66c2a5", "L1w6"="#66c2a5", "L2w1"="#66c2a5", "L2w6"="#66c2a5", "L3w1"="#66c2a5", "L3w6"="#66c2a5", "L4w1"="#66c2a5", "L4w6"="#66c2a5", "L5w1"="#66c2a5", "L5w6"="#66c2a5", "L6w1"="#66c2a5", "L6w6"="#66c2a5",
  "H1w1"="#5e4fa2", "H1w6"="#5e4fa2", "H2w1"="#5e4fa2", "H2w6"="#5e4fa2", "H3w1"="#5e4fa2", "H3w6"="#5e4fa2", "H4w1"="#5e4fa2", "H4w6"="#5e4fa2", "H5w1"="#5e4fa2", "H5w6"="#5e4fa2", "H6w1"="#5e4fa2", "H6w6"="#5e4fa2"
  )

shapes <- c(
  "C1w1"=16, "C1w6"=17, "C2w1"=16, "C2w6"=17, "C3w1"=16, "C3w6"=17, "C4w1"=16, "C4w6"=17, "C5w1"=16, "C5w6"=17, "C6w1"=16, "C6w6"=17,
  "L1w1"=16, "L1w6"=17, "L2w1"=16, "L2w6"=17, "L3w1"=16, "L3w6"=17, "L4w1"=16, "L4w6"=17, "L5w1"=16, "L5w6"=17, "L6w1"=16, "L6w6"=17,
  "H1w1"=16, "H1w6"=17, "H2w1"=16, "H2w6"=17, "H3w1"=16, "H3w6"=17, "H4w1"=16, "H4w6"=17, "H5w1"=16, "H5w6"=17, "H6w1"=16, "H6w6"=17
  
)


p <- ggplot(data=melt.out, aes(x=group, value)) + 
  #ggtitle("d") +
  geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = group) ) + scale_colour_manual(values = cols) +
  #geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], color="gray20", shape=1  ) +
  geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], aes(shape = group) , color="grey40", size=1 ) +
  scale_shape_manual(values = shapes) +
  #theme_bw() +
  theme_classic() +
  #theme(axis.text.x  = element_text(angle=90, vjust=0.5) ) +
  labs(x = NULL, y = "Effective OTUs (count)") +
  theme(legend.position="none") + 
  #facet_grid(~Time)
  facet_grid(~Time, scales="free_x", space="free_x")
p




## Indicate significant differences...?
sel <- which(melt.out$calc_type == "rarefyx1")
rare1 <- melt.out[sel, ]
rare1$group2 <- substring(rare1$group, 1,1 )
str(rare1)
levels(factor(rare1$group2))
rare1$group2 <- factor(rare1$group2, levels = c("C", "L", "H"), labels = c("Control", "Low", "High"))
bp <- boxplot(rare1$value ~ rare1$group2 + rare1$Time )

## exclude outliers from Kruskal-Wallis significance testing
sel <- which(rare1$value %in% bp$out)
rare1$value[sel]
rare1[sel, ]
# group calc_type   Time variable    value  group2
# 12  H6w6  rarefyx1 Week 6   eff_no 66.52506    High
# 55  C1w1  rarefyx1 Week 1   eff_no 57.26172 Control
# 59  C1w1  rarefyx1 Week 1   eff_no 71.00205 Control

rare1_exout <- rare1[-sel, ]

# Kruskal-Wallis test
kt <- kruskal.test( value ~ group2, rare1_exout[which(rare1_exout$Time=="Week 1"), ])
kt
# Kruskal-Wallis rank sum test
# data:  value by group2
# Kruskal-Wallis chi-squared = 20.632, df = 2, p-value = 3.31e-05

table( rare1_exout[which(rare1_exout$Time=="Week 1"), ]$group2 )
# Control     Low    High 
#      10      12      12 

## Dunn Test uses factor vector or non-numeric vector that can be coerced to a factor vector
pt <- dunnTest( value ~ factor(as.character(group2)), data= rare1_exout[which(rare1_exout$Time=="Week 1"), ],
                method = "bonferroni" )

pt
# Dunn (1964) Kruskal-Wallis multiple comparison
# p-values adjusted with the Bonferroni method.
# 
# Comparison         Z      P.unadj        P.adj
# 1 Control - High -4.104256 4.056176e-05 0.0001216853
# 2  Control - Low -3.869727 1.089571e-04 0.0003268714
# 3     High - Low  0.245976 8.057008e-01 1.0000000000

pt$dtres
pt <- pt$res
pt

cldList(comparison = pt$Comparison,
        p.value    = pt$P.adj,
        threshold  = 0.05)
#     Group Letter MonoLetter
# 1 Control      a         a 
# 2    High      b          b
# 3     Low      b          b


# Kruskal-Wallis test
kt <- kruskal.test( value ~ group2, rare1_exout[which(rare1_exout$Time=="Week 6"), ])
kt
# Kruskal-Wallis rank sum test
# data:  value by group2
# Kruskal-Wallis chi-squared = 0.48651, df = 2, p-value = 0.7841

table( rare1_exout[which(rare1_exout$Time=="Week 6"), ]$group2 )
# Control     Low    High 
#      12      12      11


# create arrow annotation, "Low" = "#f46d43", "High" = "#5e4fa2"
anno <- data.frame(x_start = c(10, 0, 5   ), x_end = c(15, 5, 10  ), 
                   y_start = c(40, 115, 145   ), y_end = c(40, 80, 100  ), 
                   arrow_colors = c("Control", "Low", "High"),         # add these to cols vector: "Low" = "#f46d43", "High" = "#5e4fa2"
                   Time = c("Week 1","Week 6", "Week 6"))

# also add significance annotation
annotation_df <- data.frame(
  Time = c("Week 1"),
  start=c("C1w1"),
  end = c("H6w1"),
  y = c(280),
  label = c("***"), stringsAsFactors = FALSE)
str(annotation_df)
# ensure no conflict in factor levels 
annotation_df$start <- factor(annotation_df$start, levels = c(
  "C1w1", "C1w6", "C2w1", "C2w6", "C3w1", "C3w6", "C4w1", "C4w6", "C5w1", "C5w6", "C6w1", "C6w6",
  "L1w1", "L1w6", "L2w1", "L2w6", "L3w1", "L3w6", "L4w1", "L4w6", "L5w1", "L5w6", "L6w1", "L6w6",
  "H1w1", "H1w6", "H2w1", "H2w6", "H3w1", "H3w6", "H4w1", "H4w6", "H5w1", "H5w6", "H6w1", "H6w6"
), ordered = TRUE)
annotation_df$end <- factor(annotation_df$end, levels = c(
  "C1w1", "C1w6", "C2w1", "C2w6", "C3w1", "C3w6", "C4w1", "C4w6", "C5w1", "C5w6", "C6w1", "C6w6",
  "L1w1", "L1w6", "L2w1", "L2w6", "L3w1", "L3w6", "L4w1", "L4w6", "L5w1", "L5w6", "L6w1", "L6w6",
  "H1w1", "H1w6", "H2w1", "H2w6", "H3w1", "H3w6", "H4w1", "H4w6", "H5w1", "H5w6", "H6w1", "H6w6"
), ordered = TRUE)

hist(melt.out$value)


pp <- #p +
  
  ggplot(data=melt.out, aes(x=group, value)) + 
  #ggtitle("d") +
  geom_violin(data = melt.out[ which(melt.out$calc_type == "bootstrap"), ], aes(color = group) ) + scale_colour_manual(values = cols) +
  geom_point(data = melt.out[ which(melt.out$calc_type == "rarefyx1"), ], aes(shape = group) , color="grey40", size=1 ) +
  scale_shape_manual(values = shapes) +
  theme_classic() +
  labs(x = NULL, y = "Effective OTUs (count)") +
  theme(legend.position="none") + 
  facet_grid(~Time, scales="free_x", space="free_x") +
  
  scale_x_discrete(labels=c(
    "C1w1"="C1", "C1w6"="C1", "C2w1"="C2", "C2w6"="C2", "C3w1"="C3", "C3w6"="C3", "C4w1"="C4", "C4w6"="C4", "C5w1"="C5", "C5w6"="C5", "C6w1"="C6", "C6w6"="C6",
    "L1w1"="L1", "L1w6"="L1", "L2w1"="L2", "L2w6"="L2", "L3w1"="L3", "L3w6"="L3", "L4w1"="L4", "L4w6"="L4", "L5w1"="L5", "L5w6"="L5", "L6w1"="L6", "L6w6"="L6",
    "H1w1"="H1", "H1w6"="H1", "H2w1"="H2", "H2w6"="H2", "H3w1"="H3", "H3w6"="H3", "H4w1"="H4", "H4w6"="H4", "H5w1"="H5", "H5w6"="H5", "H6w1"="H6", "H6w6"="H6"
    
  )) +
  
  # add arrows - based on this post: https://www.r-bloggers.com/adding-different-annotation-to-each-facet-in-ggplot/
  geom_segment(data = anno, aes(x = x_start, y = y_start, xend = x_end, yend = y_end, colour = arrow_colors),   #, colour = arrow_colors
               arrow = arrow(length = unit(5, "pt"))  ) +
  
  geom_signif(data=annotation_df,
              aes(xmin=start, xmax=end, annotations=label, y_position=y),
              textsize = 3.5, size = 0.5, tip_length = 0, margin_top = 0, vjust = 0.4 ,               # vjust = -0.2
              manual=TRUE) +
  
  ylim(0,290) +
  
  theme(
    #plot.title=element_text(face = "bold", hjust = -0.18),  # adjust hjust by trial-and-error
    #plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    plot.margin = margin(t = 5, r = 0, b = 0, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    # axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), angle=60, vjust=0.5, size = rel(0.6)),
    # axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.5)),
    # axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), angle=60, vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    #strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6))
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7))
    
  )
pp


grid.text(label = "D" , x = unit(0.035, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Alpha-diversity-AIR-samples-with-merged-sample-bootstrap-with-Sig-D-vFINAL.tiff"), width = 8.57, height = 5.95, units = "cm", res=600, compression="lzw")


#------------------------





#### Ordination of fecal Week 0 only samples
#------------------------

unique(clean.16s@sam_data$samp_type)
# "cecal"         "fecal"         "air"           "soil"          "fresh bedding"

fecalw0.dc.16s <- prune_samples(  clean.16s@sam_data$samp_type == "fecal" &  clean.16s@sam_data$Time == "Week 0",  clean.16s )
fecalw0.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 54 samples ]
# sample_data() Sample Data:       [ 54 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(fecalw0.dc.16s)) # 11450
min(taxa_sums(fecalw0.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw0.dc.16s <- prune_taxa(taxa = taxa_sums(fecalw0.dc.16s) > 0, x = fecalw0.dc.16s)
fecalw0.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 732 taxa and 54 samples ]
# sample_data() Sample Data:       [ 54 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 732 taxa by 7 taxonomic ranks ]

min(taxa_sums(fecalw0.dc.16s)) # 1


table(fecalw0.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
#  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3



## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.fecalw0.dc.16s <- rarefy_even_depth(fecalw0.dc.16s, sample.size = min(sample_sums(fecalw0.dc.16s)),
                                              rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.fecalw0.dc.16s) # all 11450
ntaxa(rare.fecalw0.dc.16s) # 610
nsamples(rare.fecalw0.dc.16s) # 54


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.fecalw0.dc.16s, "NMDS", "bray")

ord
# Call:
# metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.1573245 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'

str(ord)

rare.fecalw0.dc.16s@sam_data$samp_type # all "fecal"
rare.fecalw0.dc.16s@sam_data$Time      # all "Week 1"

length(unique(rare.fecalw0.dc.16s@sam_data$Litter )) # 15
length(unique(rare.fecalw0.dc.16s@sam_data$Cage.Name )) # 18


## Display by Litter

p <- plot_ordination(rare.fecalw0.dc.16s, ord, type="samples", color="Litter", shape = "Litter")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)

unique( p$data$Litter )

# http://sape.inf.usi.ch/quick-reference/ggplot2/colour
cols <- c(
  "blue" ,  "dodgerblue" ,  "navy" ,  "chartreuse3" , "springgreen3" , "olivedrab4",  "seagreen4", 
  "green" , "orange", "tomato", "magenta3",   "hotpink",   "purple", "red" ,  
  "firebrick"  
)


shapes <- c(1:15)


pp <- p + theme_bw() +
  #ggtitle("A") +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes) +
  geom_polygon(aes(fill = Litter), alpha = 0) +
  annotate(geom="text", x= -3.5, y= -0.6, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  #guides(col = guide_legend(ncol = 2) ) +
  labs(x = NULL, y = NULL) +
  theme(
    plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(2, "pt"),
    
    #legend.spacing = unit(20, "pt"),
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    legend.box.spacing = unit(2, "pt")
    )


pp


grid.text(label = "A" , x = unit(0.055, "npc") , y = unit(0.94,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week0-only-1xRarefy-by-Litters-vFINAL.tiff"), width = 8.3, height = 7, units = "cm", res=600, compression="lzw")



## Display by Litter and Cage distribution

p <- plot_ordination(rare.fecalw0.dc.16s, ord, type="samples", color="Litter", shape="Cage.Name")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6) ), ordered = TRUE)

# http://sape.inf.usi.ch/quick-reference/ggplot2/colour
cols <- c(
  "blue" ,  "dodgerblue" ,  "navy" ,  "chartreuse3" , "springgreen3" , "olivedrab4",  "seagreen4", 
  "green" , "orange", "tomato", "magenta3",   "hotpink",   "purple", "red" ,  
  "firebrick"  
  
)

shapes <- c(1:18)


pp <- p + theme_bw() +
  #ggtitle("B") +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  guides(col = guide_legend(ncol = 2),
         shape = guide_legend(ncol = 3)) +
  annotate(geom="text", x= -3.5, y= -0.6, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  labs(x = NULL, y = NULL) +
  theme(
    
    plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(2, "pt"),
    
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    legend.box.spacing = unit(2, "pt")
    
  )
pp

grid.text(label = "B" , x = unit(0.055, "npc") , y = unit(0.94,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week0-only-1xRarefy-by-Litters-and-Cage-vFINAL.tiff"), width = 9.6, height = 7, units = "cm", res=600, compression="lzw")





## Display by Treatment and Cage

p <- plot_ordination(rare.fecalw0.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)


pp <- p + theme_bw() + #
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  scale_fill_manual(values = cols) +
  guides( shape = guide_legend(ncol = 2) ) +
  annotate(geom="text", x= -3, y= -0.6, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 )

pp


str(p$data)


## Minimise plot size

pp <- p + theme_bw() + #
#pp <- ggplot(data = p$data, mapping = aes(x = NMDS1, y = NMDS2) ) + 
  geom_point(aes(color=Treatment, shape= Cage.Name), size = 1) +
  theme_bw() + #
  ggtitle("A") +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  scale_fill_manual(values = cols) +
  guides( shape = guide_legend(ncol = 2) ) +
  #guides( shape = guide_legend(ncol = 3) ) +
  #guides( col = guide_legend(ncol = 3) ) +
  annotate(geom="text", x= -3.5, y= -0.6, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  labs(x = NULL, y = NULL) +
  theme(
    
    plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    #legend.key.size = unit(2, "pt"),
    legend.key.size = unit(1.35, "pt"),
    
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.5)),
    #legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.4)),
    #legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.45)),
    
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.55)),
    
    #legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    #legend.box.spacing = unit(2, "pt"),
    legend.box.spacing = unit(4, "pt"),
    legend.spacing = unit(4, "pt")
    
    )

pp

## Plot legend ONLY
library(cowplot); packageVersion("cowplot") # '0.9.2'
legend <- cowplot::get_legend(pp)
grid.newpage()
grid.draw(legend)


dev.print(tiff, file = paste0("plots/","Legend-Ordination-Fecal-w0-w7-Cecal-vFINAL.tiff"), width = 1.35, height = 6, units = "cm", res=600, compression="lzw")

detach("package:cowplot", unload=TRUE)


## No legend in this plot


str(p)

p <- plot_ordination(rare.fecalw0.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)


## Add centroids


#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot
x <- p$data$NMDS1
y <- p$data$NMDS2
Treatment <- p$data$Treatment
df <- data.frame(Treatment, x, y)

centroids <- aggregate(cbind(x,y)~Treatment,df,mean)
f         <- function(z) {sd(z)/sqrt(length(z))} # function to calculate std.err
se        <- aggregate(cbind(se.x=x,se.y=y)~Treatment,df,f)
centroids <- merge(centroids,se, by="Treatment")    # add std.err column to centroids
str(centroids)



## No legend in this plot & add Centroids

pp <- ggplot(data = p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = Treatment) ) + 
  theme_bw() + #
  #ggtitle("A") +
  geom_point(size = 1, alpha=0.5) +  # was alpha = 0.55 
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  geom_polygon(aes(fill = Treatment), alpha = 0.175, linetype="blank") +  # was alpha = 0.25
  #geom_point(data = df_centroid, aes(x=NMDS1, y=NMDS2, colour=Treatment), size=8, alpha=0.5, inherit.aes = FALSE, show.legend = FALSE) + # , color=Treatment
  
  geom_errorbar(data=centroids,aes(x=x, ymin=y-se.y,ymax=y+se.y, colour=Treatment), size=0.65, width=NA, inherit.aes = FALSE) + # ,width=0.1
  geom_errorbarh(data=centroids,aes(y=y, xmin=x-se.x,xmax=x+se.x, colour=Treatment),size=0.65, height=NA, inherit.aes = FALSE) + # ,height=0.05
  #geom_point(data=centroids, aes(x=x, y=y, colour=Treatment), shape=1, size=3.5, stroke=0.65, inherit.aes = FALSE) + # 
  geom_point(data=centroids, aes(x=x, y=y, colour=Treatment), shape=1, size=4, stroke=0.65, inherit.aes = FALSE) + # 
  
  scale_fill_manual(values = cols) +
  guides( shape = guide_legend(ncol = 2) ) +
  #annotate(geom="text", x= -3.5, y= -0.6, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  annotate(geom="text", x= -3.5, y= -0.595, label = paste0("Week 0 faeces\nStress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  labs(x = NULL, y = NULL) +
  theme(
    
    #plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    plot.margin = margin(t = 2, r = 1, b = 2, l = 1, "pt"),
    plot.title = element_text(margin=margin(b = 3, unit = "pt")),
    
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    #legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    #legend.key.size = unit(2, "pt"),
    
    #legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.5)),
    #legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    #legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    #legend.box.spacing = unit(2, "pt"),
    
    legend.position = "none"
    
  )

pp

str(pp)


grid.text(label = "A" , x = unit(0.095, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week0-only-1xRarefy-by-Treatment-and-Cage-NO-LEGEND-CENTROIDS-A-vFINAL.tiff"), width = 4.83, height = 6, units = "cm", res=600, compression="lzw")




# Test hypothesis that microbiota vary (with different centroids) by Litter

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.fecalw0.dc.16s <- phyloseq::distance(rare.fecalw0.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.fecalw0.dc.16s))
str(sampledf)


# Adonis test
set.seed(123)
adonis(bray.rare.fecalw0.dc.16s ~ Litter + Treatment, data = sampledf)
# Call:
# adonis(formula = bray.rare.fecalw0.dc.16s ~ Litter + Treatment,      data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Litter    14    5.7600 0.41143  3.3948 0.54302  0.001 ***
# Treatment  2    0.3632 0.18161  1.4986 0.03424  0.072 .  
# Residuals 37    4.4841 0.12119         0.42274           
# Total     53   10.6073                 1.00000           
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Litter is significant. Treatment is not at 0.05 significance level


# For Litter only
set.seed(123)
adonis(bray.rare.fecalw0.dc.16s ~ Litter, data = sampledf)
# Call:
# adonis(formula = bray.rare.fecalw0.dc.16s ~ Litter, data = sampledf)
#
# Permutation: free
# Number of permutations: 999
#
# Terms added sequentially (first to last)
#
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
# Litter    14    5.7600 0.41143  3.3102 0.54302  0.001 ***
# Residuals 39    4.8473 0.12429         0.45698
# Total     53   10.6073                 1.00000
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


# Are faecal week 0 microbiota associated with gender?
set.seed(123)
adonis(bray.rare.fecalw0.dc.16s ~ Litter + Sex, data = sampledf)
# Call:
# adonis(formula = bray.rare.fecalw0.dc.16s ~ Litter + Sex, data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Litter    14    5.7600 0.41143  3.3300 0.54302  0.001 ***
# Sex        1    0.1524 0.15239  1.2334 0.01437  0.258    
# Residuals 38    4.6949 0.12355         0.44261           
# Total     53   10.6073                 1.00000           
# ---
# Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1

# Sex is not significant - so this is not a valid model



# Adonis test
set.seed(123)
adonis(bray.rare.fecalw0.dc.16s ~ Treatment, data = sampledf)





# Homogeneity of dispersion test
beta <- betadisper(bray.rare.fecalw0.dc.16s, sampledf$Litter)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df  Sum Sq  Mean Sq      F N.Perm Pr(>F)
# Groups    14 0.22808 0.016291 1.0464    999  0.407
# Residuals 39 0.60721 0.015569     


table(rare.fecalw0.dc.16s@sam_data$Treatment)
# Control    High     Low 
#      18      18      18


table(rare.fecalw0.dc.16s@sam_data$Sex)
# female   male 
#     27     27 


table(rare.fecalw0.dc.16s@sam_data$Litter)
# Blue 20   Blue 24   Blue 27  Green 13  Green 14  Green 18  Green 21  Green 30 Orange 15 Orange 19   Pink 23   Pink 24 Purple 26 
# 5         2         2         4         2         4         1         2         7         5         4         6         4 
# Red 22    Red 23 
# 2         4 

summary(as.numeric(table(rare.fecalw0.dc.16s@sam_data$Litter)))
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.0     2.0     4.0     3.6     4.5     7.0 

# i.e. median 4, IQR 2-4.5

#------------------------





#### Ordination of fecal Week 7 only samples
#------------------------

unique(clean.16s@sam_data$samp_type)
#  "cecal"         "fecal"         "air"           "soil"          "fresh bedding"    


fecalw7.dc.16s <- prune_samples( clean.16s@sam_data$samp_type == "fecal" & clean.16s@sam_data$Time == "Week 7", clean.16s )
fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(fecalw7.dc.16s)) #  14195
min(taxa_sums(fecalw7.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw7.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw7.dc.16s) > 0, x = fecalw7.dc.16s )
fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 645 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 645 taxa by 7 taxonomic ranks ]

table(fecalw7.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 3  3  3  3  3  3  3  3  3  3  3  3  2  3  3  3  3  3

table( fecalw7.dc.16s@sam_data$Treatment )
# Control    High     Low 
#      18      18      17  

table( fecalw7.dc.16s@sam_data$Time )
# Week 7 
#     53 




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.fecalw7.dc.16s <- rarefy_even_depth(fecalw7.dc.16s, sample.size = min(sample_sums(fecalw7.dc.16s)),
                                         rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.fecalw7.dc.16s) # all 14195
ntaxa(rare.fecalw7.dc.16s) # 576
nsamples(rare.fecalw7.dc.16s) # 53


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.fecalw7.dc.16s, "NMDS", "bray")

ord
# Call:
# metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.1577653 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 

str(ord)

rare.fecalw7.dc.16s@sam_data$samp_type # all "fecal"
rare.fecalw7.dc.16s@sam_data$Time      # all "Week 7"

length(unique(rare.fecalw7.dc.16s@sam_data$Litter )) # 15
length(unique(rare.fecalw7.dc.16s@sam_data$Cage.Name )) # 18

p <- plot_ordination(rare.fecalw7.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
p$data$Sex


cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  annotate(geom="text", x= 0.43, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) ) # +

pp


## Minimise plot size &
## No legend in this plot



p <- plot_ordination(rare.fecalw7.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)


#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot
x <- p$data$NMDS1
y <- p$data$NMDS2
Treatment <- p$data$Treatment
df <- data.frame(Treatment, x, y)

centroids <- aggregate(cbind(x,y)~Treatment,df,mean)
f         <- function(z) {sd(z)/sqrt(length(z))} # function to calculate std.err
se        <- aggregate(cbind(se.x=x,se.y=y)~Treatment,df,f)
centroids <- merge(centroids,se, by="Treatment")    # add std.err column to centroids
str(centroids)



## No legend in this plot & add Centroids

pp <- ggplot(data=p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = Treatment)) +
  theme_bw() +
  #ggtitle("B") +
  geom_point(size = 1, alpha=0.5) +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  geom_polygon(aes(fill = Treatment), alpha = 0.175, linetype="blank") +
  
  geom_errorbar(data=centroids,aes(x=x, ymin=y-se.y,ymax=y+se.y, colour=Treatment), size=0.65, width=NA, inherit.aes = FALSE) + # ,width=0.1
  geom_errorbarh(data=centroids,aes(y=y, xmin=x-se.x,xmax=x+se.x, colour=Treatment),size=0.65, height=NA, inherit.aes = FALSE) + # ,height=0.05
  geom_point(data=centroids, aes(x=x, y=y, colour=Treatment), shape=1, size=4, stroke=0.65, inherit.aes = FALSE) + # 
  
  scale_fill_manual(values = cols) +
  guides( shape = guide_legend(ncol = 2) ) +
  #annotate(geom="text", x= -0.4, y= -0.42, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  annotate(geom="text", x= -0.42, y= -0.44, label = paste0("Week 7 faeces\nStress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +

  labs(x = NULL, y = NULL) +
  theme(
    
    #plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    plot.margin = margin(t = 2, r = 1, b = 2, l = 1, "pt"),
    plot.title = element_text(margin=margin(b = 3, unit = "pt")),
    
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    # legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    # legend.key.size = unit(2, "pt"),
    # 
    # legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.5)),
    # legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    # legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    # legend.box.spacing = unit(2, "pt"),
    
    legend.position = "none"
    
  )

pp


grid.text(label = "B" , x = unit(0.095, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week7-only-1xRarefy-by-Treatment-and-Cage-NO-LEGEND-CENTROIDS-B-vFINAL.tiff"), width = 4.83, height = 6, units = "cm", res=600, compression="lzw")






# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.fecalw7.dc.16s <- phyloseq::distance(rare.fecalw7.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.fecalw7.dc.16s))
str(sampledf)




# Adonis test
set.seed(123)
adonis(bray.rare.fecalw7.dc.16s ~ Treatment + Sex + Litter + Cage.Name, data = sampledf)
# Call:
#   adonis(formula = bray.rare.fecalw7.dc.16s ~ Treatment + Sex + Litter + Cage.Name, data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Treatment  2    0.3005 0.15027  2.4955 0.05452  0.020 *  
# Sex        1    0.3510 0.35098  5.8289 0.06367  0.002 ** 
# Litter    14    1.6480 0.11772  1.9549 0.29895  0.001 ***
# Cage.Name 14    1.9486 0.13918  2.3114 0.35347  0.001 ***
# Residuals 21    1.2645 0.06021         0.22938           
# Total     52    5.5126                 1.00000           
# ---
# Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1




# Homogeneity of dispersion test
beta <- betadisper(bray.rare.fecalw7.dc.16s, sampledf$Treatment)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     2 0.004422 0.0022111 0.4247    999   0.66
# Residuals 50 0.260341 0.0052068  

table(rare.fecalw7.dc.16s@sam_data$Treatment)
# Control    High     Low 
#      18      18      17

table(rare.fecalw7.dc.16s@sam_data$Sex)
# female   male 
#     26     27 

x<- table(rare.fecalw7.dc.16s@sam_data$Litter)
x
# Blue 20   Blue 24   Blue 27  Green 13  Green 14  Green 18  Green 21  Green 30 Orange 15 Orange 19   Pink 23 
# 5         2         1         4         2         4         1         2         7         5         4 
# Pink 24 Purple 26    Red 22    Red 23 
# 6         4         2         4 

hist( as.numeric(x) )
summary( as.numeric(x) )
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.000   2.000   4.000   3.533   4.500   7.000 


table(rare.fecalw7.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 3  3  3  3  3  3  3  3  3  3  3  3  2  3  3  3  3  3

#
#------------------------





#### Ordination of cecal only samples
#------------------------

unique(clean.16s@sam_data$samp_type)
#  "cecal"         "fecal"         "air"           "soil"          "fresh bedding"

cecal.dc.16s <- prune_samples( clean.16s@sam_data$samp_type == "cecal", clean.16s )
cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(cecal.dc.16s)) # 14632
min(taxa_sums(cecal.dc.16s)) # 0

# prune taxa that have zero sequence reads
cecal.dc.16s <- prune_taxa( taxa = taxa_sums(cecal.dc.16s) > 0, x = cecal.dc.16s )
cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 550 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 550 taxa by 7 taxonomic ranks ]

table(cecal.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 3  3  3  3  3  3  3  3  3  3  3  3  3  2  3  3  3  3 

table( cecal.dc.16s@sam_data$Treatment )
# Control    High     Low 
#      18      18      17  

table( cecal.dc.16s@sam_data$Time )
# Post-exposure 
# 53 




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.cecal.dc.16s <- rarefy_even_depth(cecal.dc.16s, sample.size = min(sample_sums(cecal.dc.16s)),
                                            rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.cecal.dc.16s) # all 14632
ntaxa(rare.cecal.dc.16s) # 496
nsamples(rare.cecal.dc.16s) # 53


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.cecal.dc.16s, "NMDS", "bray")

ord
# Call:
# metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.2012916 
# Stress type 1, weak ties
# No convergent solutions - best solution after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))' 

str(ord)

rare.cecal.dc.16s@sam_data$samp_type # all "cecal"
rare.cecal.dc.16s@sam_data$Time      # all "Post-exposure"

length(unique(rare.cecal.dc.16s@sam_data$Litter )) # 15
length(unique(rare.cecal.dc.16s@sam_data$Cage.Name )) # 18

p <- plot_ordination(rare.cecal.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
p
str(p)
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)

pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  annotate(geom="text", x= -0.45, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=0, vjust=1, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) )  #+

pp


## Minimise plot size &
## Add centroids to this plot


p <- plot_ordination(rare.cecal.dc.16s, ord, type="samples", color="Treatment", shape="Cage.Name")
p$data$Treatment <- factor(p$data$Treatment, levels = c("Control","Low","High"), ordered = TRUE)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
shapes <- c(1:18)

#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot
x <- p$data$NMDS1
y <- p$data$NMDS2
Treatment <- p$data$Treatment
df <- data.frame(Treatment, x, y)
centroids <- aggregate(cbind(x,y)~Treatment,df,mean)
f         <- function(z) {sd(z)/sqrt(length(z))} # function to calculate std.err
se        <- aggregate(cbind(se.x=x,se.y=y)~Treatment,df,f)
centroids <- merge(centroids,se, by="Treatment")    # add std.err column to centroids
str(centroids)


## No legend in this plot & add Centroids

pp <- ggplot(data=p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = Treatment)) +
  theme_bw() +
  #ggtitle("C") +
  geom_point(size = 1, alpha=0.5) +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  geom_polygon(aes(fill = Treatment), alpha = 0.175, linetype="blank") +
  
  geom_errorbar(data=centroids,aes(x=x, ymin=y-se.y,ymax=y+se.y, colour=Treatment), size=0.65, width=NA, inherit.aes = FALSE) + # ,width=0.1
  geom_errorbarh(data=centroids,aes(y=y, xmin=x-se.x,xmax=x+se.x, colour=Treatment),size=0.65, height=NA, inherit.aes = FALSE) + # ,height=0.05
  geom_point(data=centroids, aes(x=x, y=y, colour=Treatment), shape=1, size=4, stroke=0.65, inherit.aes = FALSE) + # 
  
  scale_fill_manual(values = cols) +
  guides( shape = guide_legend(ncol = 2) ) +
  #annotate(geom="text", x= -0.47, y= -0.43, label = paste0("Stress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  annotate(geom="text", x= -0.485, y= -0.425, label = paste0("Caeca\nStress = ",round(ord$stress,4)), hjust=0, vjust=0, size=2.5 ) +
  
  labs(x = NULL, y = NULL) +
  theme(
    
    #plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    plot.margin = margin(t = 2, r = 1, b = 2, l = 1, "pt"),
    plot.title = element_text(margin=margin(b = 3, unit = "pt")),
    
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    # legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    # legend.key.size = unit(2, "pt"),
    # 
    # legend.text = element_text(margin=margin(t = 0,r = 0,b = 1,l = 0,"pt"), size = rel(0.5)),
    # legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    # legend.margin = margin(t = 0,r = 0,b = 0,l = 5,"pt"),
    # legend.box.spacing = unit(2, "pt"),
    
    legend.position = "none"
    
  )

pp


grid.text(label = "C" , x = unit(0.095, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Cecal-only-1xRarefy-NO-LEGEND-CENTROIDS-C-vFINAL.tiff"), width = 4.83, height = 6, units = "cm", res=600, compression="lzw")




# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.cecal.dc.16s <- phyloseq::distance(rare.cecal.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.cecal.dc.16s))
str(sampledf)





# Adonis test
set.seed(123)
adonis(bray.rare.cecal.dc.16s ~ Treatment + Sex + Litter + Cage.Name, data = sampledf)
# Call:
# adonis(formula = bray.rare.cecal.dc.16s ~ Treatment + Sex + Litter + Cage.Name, data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
# Treatment  2    0.1912 0.09559  2.4672 0.04201  0.007 ** 
# Sex        1    0.4075 0.40752 10.5179 0.08955  0.001 ***
# Litter    14    1.7985 0.12846  3.3156 0.39520  0.001 ***
# Cage.Name 14    1.3399 0.09571  2.4703 0.29444  0.001 ***
# Residuals 21    0.8136 0.03875         0.17879           
# Total     52    4.5508                 1.00000           
# ---
# Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1





# Homogeneity of dispersion test
beta <- betadisper(bray.rare.cecal.dc.16s, sampledf$Treatment)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     2 0.019532 0.0097661 1.9675    999  0.142
# Residuals 50 0.248184 0.0049637             

table( rare.cecal.dc.16s@sam_data$Treatment)
# Control    High     Low 
#      18      18      17

table( rare.cecal.dc.16s@sam_data$Sex)
# female   male 
# 27     26 

table( rare.cecal.dc.16s@sam_data$Litter)
# Blue 20   Blue 24   Blue 27  Green 13  Green 14  Green 18  Green 21  Green 30 Orange 15 Orange 19   Pink 23 
# 5         2         2         4         2         4         1         2         6         5         4 
# Pink 24 Purple 26    Red 22    Red 23 
# 6         4         2         4 

summary( as.numeric( table( rare.cecal.dc.16s@sam_data$Litter) ))
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1.000   2.000   4.000   3.533   4.500   6.000

table( rare.cecal.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 3  3  3  3  3  3  3  3  3  3  3  3  3  2  3  3  3  3


#
#------------------------





#### Identify trending taxa within treatment groups
#    Differential abundance Wk 0 > Wk 7 separately within Treatments
#------------------------
# 1. Separately within each treatment:
#   a. determine differentially abundant taxa (using DESEq2 with alpha = 0.05) between Week 0 to Week 7.
#   b. from Week 7 samples, filter taxa to retain only significantly differentially abundant taxa that are increasing between Week 0 to Week 7.
# 2. Merge the filtered Week 7 samples
# 3. Normalise
# 4. Heatmap - to visualise significant differentially abundant increasing taxa in each treatment


## First isolate fecal samples from Week 0 and 7

fecal.w0w7.dc.16s <- prune_samples( clean.16s@sam_data$samp_type == "fecal", clean.16s )
fecal.w0w7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 107 samples ]
# sample_data() Sample Data:       [ 107 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(fecal.w0w7.dc.16s)) # 11450
min(taxa_sums(fecal.w0w7.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecal.w0w7.dc.16s <- prune_taxa(taxa = taxa_sums(fecal.w0w7.dc.16s) > 0, x = fecal.w0w7.dc.16s)
fecal.w0w7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 107 samples ]
# sample_data() Sample Data:       [ 107 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]


## focus on samples that can be traced from Week 0 to Week 7
table(fecal.w0w7.dc.16s@sam_data$Time)
# Week 0 Week 7 
#     54     53

table(fecal.w0w7.dc.16s@sam_data$mouseID)
# C1m1 C1m2 C1m3 C2m1 C2m2 C2m3 C3m1 C3m2 C3m3 C4m1 C4m2 C4m3 C5m1 C5m2 C5m3 C6m1 C6m2 C6m3 H1m1 H1m2 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
# H1m3 H2m1 H2m2 H2m3 H3m1 H3m2 H3m3 H4m1 H4m2 H4m3 H5m1 H5m2 H5m3 H6m1 H6m2 H6m3 L1m1 L1m2 L1m3 L2m1 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    1    2    2 
# L2m2 L2m3 L3m1 L3m2 L3m3 L4m1 L4m2 L4m3 L5m1 L5m2 L5m3 L6m1 L6m2 L6m3 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2

## note mouseID: L1m2 only has one entry
mice <- sort( unique(fecal.w0w7.dc.16s@sam_data$mouseID) )
mice
# [1] "C1m1" "C1m2" "C1m3" "C2m1" "C2m2" "C2m3" "C3m1" "C3m2" "C3m3" "C4m1" "C4m2" "C4m3" "C5m1" "C5m2"
# [15] "C5m3" "C6m1" "C6m2" "C6m3" "H1m1" "H1m2" "H1m3" "H2m1" "H2m2" "H2m3" "H3m1" "H3m2" "H3m3" "H4m1"
# [29] "H4m2" "H4m3" "H5m1" "H5m2" "H5m3" "H6m1" "H6m2" "H6m3" "L1m1" "L1m2" "L1m3" "L2m1" "L2m2" "L2m3"
# [43] "L3m1" "L3m2" "L3m3" "L4m1" "L4m2" "L4m3" "L5m1" "L5m2" "L5m3" "L6m1" "L6m2" "L6m3"


sel <- which(mice == "L1m2") # 38
mice2 <- mice[-sel]

phy_obj <- fecal.w0w7.dc.16s

phy_obj <- prune_samples( phy_obj@sam_data$mouseID %in% mice2, phy_obj )
phy_obj
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 106 samples ]
# sample_data() Sample Data:       [ 106 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]

min(sample_sums(phy_obj)) # 11450
min(taxa_sums(phy_obj)) # 1



## 1. Separately within each treatment:
##    a. determine differentially abundant taxa (using DESEq2 with alpha = 0.05) between Week 0 to Week 7.


### Control samples

# prune to control samples only
phy_obj.cont <- prune_samples( phy_obj@sam_data$Treatment == "Control", phy_obj )
phy_obj.cont
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 36 samples ]
# sample_data() Sample Data:       [ 36 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]
min(sample_sums(phy_obj.cont)) # 20718
min(taxa_sums(phy_obj.cont)) # 0

# prune taxa that have zero sequence reads
phy_obj.cont <- prune_taxa(taxa = taxa_sums(phy_obj.cont) > 0, x = phy_obj.cont)
phy_obj.cont
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 704 taxa and 36 samples ]
# sample_data() Sample Data:       [ 36 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 704 taxa by 7 taxonomic ranks ]
min(taxa_sums(phy_obj.cont)) # 1

table( phy_obj.cont@sam_data$Time )
# Week 0 Week 7 
#     18     18

# reformat to avoid errors
sel<- which(phy_obj.cont@sam_data$Time == "Week 0")
phy_obj.cont@sam_data$Time[sel] <- "Week_0"

sel<- which(phy_obj.cont@sam_data$Time == "Week 7")
phy_obj.cont@sam_data$Time[sel] <- "Week_7"


# convert to DESeq dataset
phy_obj.cont.deseq <- phyloseq_to_deseq2(phy_obj.cont, ~ Time)


# Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution

#http://rstudio-pubs-static.s3.amazonaws.com/218660_7cbe1343ac434dbab59924a80ac2e815.html
# Experiments without replicates do not allow for estimation of the dispersion of counts
# around the expected value for each group, which is critical for differential expression
# analysis. If an experimental design is supplied which does not contain the necessary 
# degrees of freedom for differential analysis, DESeq will provide a message to the user 
# and follow the strategy outlined in Anders and Huber (2010) under the section 'Working 
# without replicates', wherein all the samples are considered as replicates of a single 
# group for the estimation of dispersion. As noted in the reference above: "Some 
# overestimation of the variance may be expected, which will make that approach conservative." 
# Furthermore, "while one may not want to draw strong conclusions from such an analysis, 
# it may still be useful for exploration and hypothesis generation."


deseq.cont <- DESeq(phy_obj.cont.deseq, test="Wald", fitType="parametric")
# estimating size factors
# estimating dispersions
# gene-wise dispersion estimates
# mean-dispersion relationship
# final dispersion estimates
# fitting model and testing
# -- replacing outliers and refitting for 75 genes
# -- DESeq argument 'minReplicatesForReplace' = 7 
# -- original counts are preserved in counts(dds)
# estimating dispersions
# fitting model and testing

alpha <- 0.05

res <- results(deseq.cont)

sigtab.cont <- res[which(res$padj < alpha), ]
sigtab.cont <- cbind(as(sigtab.cont, "data.frame"), as(tax_table(phy_obj.cont)[rownames(sigtab.cont), ], "matrix"))

dim(sigtab.cont) #  75 13

names(sigtab.cont)
# [1] "baseMean"       "log2FoldChange" "lfcSE"          "stat"           "pvalue"        
# [6] "padj"           "Kingdom"        "Phylum"         "Class"          "Order"         
# [11] "Family"         "Genus"          "Species" 

sigtab.cont[ order(sigtab.cont$log2FoldChange) ,c( "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "Phylum", "Genus", "Species" )]
#               baseMean log2FoldChange     lfcSE      stat       pvalue         padj             Phylum             Genus           Species
# OTU_112     18.5428583     -4.2506642 1.2351867 -3.441313 5.788981e-04 8.236892e-03  p__Proteobacteria  g__Desulfovibrio        s__unknown
# OTU_45      40.3846440     -4.1297206 1.1793518 -3.501687 4.623226e-04 7.299361e-03      p__Firmicutes  g__Lactobacillus        s__reuteri
# OTU_3988     1.3771808     -3.6669988 1.2170888 -3.012926 2.587418e-03 2.431197e-02   p__Bacteroidetes    g__Bacteroides        s__unknown
# OTU_2913    23.5846873     -3.3053404 0.9544437 -3.463107 5.339762e-04 7.821180e-03   p__Bacteroidetes    g__Bacteroides        s__unknown
# OTU_3777     5.5509050     -3.0515944 0.8422895 -3.622976 2.912332e-04 4.834472e-03   p__Bacteroidetes        g__unknown        s__unknown
# OTU_3406    12.1631678     -2.1689781 0.6482876 -3.345704 8.207402e-04 1.048022e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_5415     2.4980769     -2.1436782 0.6142998 -3.489628 4.836926e-04 7.299361e-03   p__Bacteroidetes        g__unknown        s__unknown
# OTU_2     2212.9213516     -2.0302391 0.4804141 -4.226019 2.378620e-05 7.897017e-04      p__Firmicutes  g__Lactobacillus        s__unknown
# OTU_7     1161.8179196     -1.8145545 0.6380401 -2.843950 4.455799e-03 3.522203e-02   p__Bacteroidetes    g__Bacteroides        s__unknown
# OTU_142     29.7555608     -1.7881287 0.6499943 -2.750991 5.941524e-03 4.226970e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_81      79.1297351     -1.6301467 0.5070926 -3.214692 1.305844e-03 1.445134e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_7626     3.5714560     -1.4922851 0.5380635 -2.773437 5.546763e-03 4.062188e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_127    105.0690016     -1.4610754 0.5056610 -2.889437 3.859325e-03 3.150728e-02   p__Bacteroidetes    g__Bacteroides   s__acidifaciens
# OTU_2846     7.3957390     -1.3650226 0.4856221 -2.810874 4.940715e-03 3.785348e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_2084    37.3172508     -1.2778829 0.3445784 -3.708540 2.084577e-04 3.720854e-03   p__Bacteroidetes        g__unknown        s__unknown
# OTU_16     702.2457975     -1.2324181 0.4121339 -2.990335 2.786720e-03 2.569975e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_2041   110.8760392     -1.0220104 0.3225915 -3.168125 1.534255e-03 1.625657e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_2544    89.9194171     -0.9752328 0.3182380 -3.064476 2.180516e-03 2.129209e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_3242    69.9050764     -0.9723833 0.3448169 -2.819999 4.802385e-03 3.736856e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_22     391.2760216     -0.7896047 0.2751368 -2.869862 4.106508e-03 3.298453e-02   p__Bacteroidetes    g__Bacteroides   s__acidifaciens
# OTU_15     507.2138786      1.3323698 0.4779238  2.787829 5.306259e-03 3.944055e-02      p__Firmicutes g__[Ruminococcus]         s__gnavus
# OTU_19     375.9962846      1.4640192 0.4986532  2.935947 3.325314e-03 2.760011e-02   p__Bacteroidetes    g__Bacteroides        s__unknown
# OTU_101     18.0828765      1.5300412 0.5194369  2.945577 3.223532e-03 2.760011e-02      p__Firmicutes   g__Oscillospira        s__unknown
# OTU_133      9.5817369      1.6039718 0.5185185  3.093374 1.978945e-03 1.971029e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_59      53.7704254      1.6885705 0.5727061  2.948407 3.194167e-03 2.760011e-02      p__Firmicutes g__Butyricicoccus  s__pullicaecorum
# OTU_254      7.7712083      1.7280668 0.5481499  3.152544 1.618543e-03 1.679238e-02      p__Firmicutes   g__Ruminococcus        s__unknown
# OTU_46      57.3750818      1.7748147 0.5483134  3.236862 1.208519e-03 1.399634e-02      p__Firmicutes g__[Ruminococcus]         s__gnavus
# OTU_402      3.4796363      1.9149560 0.6935720  2.761005 5.762371e-03 4.158929e-02      p__Firmicutes    g__Coprococcus        s__unknown
# OTU_67      75.5550603      2.0057828 0.4965681  4.039290 5.361318e-05 1.570551e-03      p__Firmicutes   g__Oscillospira        s__unknown
# OTU_531      2.9537727      2.1527823 0.7845156  2.744091 6.067866e-03 4.256053e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_201      4.4475730      2.1689741 0.7278070  2.980150 2.881072e-03 2.608680e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_2760    26.4827857      2.3166227 0.6798295  3.407652 6.552433e-04 8.819221e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_57      23.9227294      2.4104822 0.7350809  3.279207 1.040993e-03 1.264426e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_2998   105.2135832      2.4653336 0.8322969  2.962084 3.055641e-03 2.717338e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_7119     0.7424765      2.5212914 0.9366481  2.691824 7.106252e-03 4.847827e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_616    104.8140239      2.5436940 0.9492582  2.679665 7.369584e-03 4.959286e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_9505     6.1165439      2.6810103 0.8391426  3.194940 1.398600e-03 1.514136e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_3795     2.8553525      2.6905668 0.7973955  3.374194 7.403224e-04 9.702120e-03      p__Firmicutes   g__Oscillospira        s__unknown
# OTU_202      1.9910979      2.7181841 1.0160733  2.675185 7.468804e-03 4.959286e-02  p__Actinobacteria  g__Adlercreutzia        s__unknown
# OTU_419      2.4862198      2.7388725 1.0089881  2.714474 6.638101e-03 4.591353e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_481    261.4869980      2.7709459 0.9068735  3.055493 2.246906e-03 2.151845e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_130      8.3286306      2.7914313 0.9506076  2.936470 3.319704e-03 2.760011e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_2934    21.1622562      2.8177479 0.7572937  3.720813 1.985826e-04 3.720854e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_4034     5.5412380      3.0181838 0.7191245  4.197025 2.704437e-05 8.417559e-04      p__Firmicutes        g__unknown        s__unknown
# OTU_125     10.4219583      3.1050028 0.7761124  4.000713 6.315199e-05 1.736443e-03      p__Firmicutes   g__Ruminococcus        s__unknown
# OTU_6190     3.0942726      3.1125500 0.9670290  3.218673 1.287854e-03 1.445134e-02      p__Firmicutes    g__Clostridium s__methylpentosum
# OTU_2039    17.0873664      3.1661525 0.6770033  4.676717 2.915045e-06 1.451693e-04      p__Firmicutes        g__unknown        s__unknown
# OTU_4702     1.7382488      3.2092892 0.8192252  3.917469 8.948355e-05 2.025582e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_2419   467.2176794      3.2674696 0.8215539  3.977182 6.973668e-05 1.736443e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_4387     4.1987166      3.3557159 0.9088920  3.692096 2.224139e-04 3.819384e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_10092    2.5731796      3.4105243 0.8955341  3.808369 1.398865e-04 3.028847e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_258      1.5877156      3.4238107 1.2272272  2.789875 5.272837e-03 3.944055e-02      p__Firmicutes    g__Clostridium        s__unknown
# OTU_64      53.9786644      3.5316462 0.9525336  3.707634 2.092046e-04 3.720854e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_247     16.8095021      3.6028986 1.1479381  3.138583 1.697670e-03 1.725387e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_4483     3.0041736      3.6683929 0.9203247  3.985977 6.720307e-05 1.736443e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_5971     4.6750189      3.7687244 1.0778169  3.496628 4.711792e-04 7.299361e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_2384    54.3258759      3.7942495 0.8950296  4.239245 2.242731e-05 7.897017e-04      p__Firmicutes        g__unknown        s__unknown
# OTU_9762    34.0383854      3.8087799 0.7509148  5.072186 3.932710e-07 2.448112e-05      p__Firmicutes        g__unknown        s__unknown
# OTU_23     110.3286306      3.9069377 0.6742409  5.794572 6.849552e-09 6.822154e-07  p__Proteobacteria  g__Desulfovibrio        s__C21_c20
# OTU_1     4960.9791929      3.9646628 0.8359561  4.742669 2.109211e-06 1.167097e-04      p__Firmicutes        g__unknown        s__unknown
# OTU_6689     2.8031402      4.0458498 1.0898211  3.712398 2.053047e-04 3.720854e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_3671     7.9000771      4.0476107 0.7574228  5.343925 9.095504e-08 7.549268e-06      p__Firmicutes        g__unknown        s__unknown
# OTU_273      3.1038213      4.0554291 1.0236007  3.961925 7.434788e-05 1.763107e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_169      4.4814470      4.0993668 0.9642642  4.251290 2.125426e-05 7.897017e-04             p__TM7        g__unknown        s__unknown
# OTU_4077     1.9747873      4.1112769 1.0959598  3.751302 1.759183e-04 3.650305e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_8231     2.1422748      4.2799254 1.2826220  3.336856 8.473174e-04 1.054910e-02   p__Bacteroidetes        g__unknown        s__unknown
# OTU_5280     2.5230003      4.2944680 1.2562631  3.418446 6.297971e-04 8.712194e-03      p__Firmicutes        g__unknown        s__unknown
# OTU_143     12.0926947      4.3528076 1.3387243  3.251459 1.148142e-03 1.361369e-02      p__Firmicutes        g__unknown        s__unknown
# OTU_9785     1.9357390      4.3587136 0.9590896  4.544636 5.503019e-06 2.491367e-04      p__Firmicutes        g__unknown        s__unknown
# OTU_765     29.7740427      4.4883299 0.7312315  6.138042 8.354460e-10 2.080261e-07      p__Firmicutes g__[Ruminococcus]         s__gnavus
# OTU_75      14.8211619      4.6735440 1.0604028  4.407329 1.046531e-05 4.343103e-04 p__Deferribacteres  g__Mucispirillum     s__schaedleri
# OTU_2280    25.3305409      4.8848474 0.9426373  5.182107 2.193932e-07 1.560826e-05      p__Firmicutes        g__unknown        s__unknown
# OTU_4273     6.8839093      5.0918977 0.8206810  6.204478 5.487865e-10 2.080261e-07      p__Firmicutes        g__unknown        s__unknown
# OTU_3188     5.0690465      5.4715911 0.9387125  5.828825 5.581900e-09 6.822154e-07      p__Firmicutes        g__unknown        s__unknown
# OTU_3004    13.0870968      5.9460822 1.0245582  5.803557 6.492250e-09 6.822154e-07      p__Firmicutes        g__unknown        s__unknown





### Low samples

# prune to Low samples only
phy_obj.low <- prune_samples( phy_obj@sam_data$Treatment == "Low", phy_obj )
phy_obj.low
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 34 samples ]
# sample_data() Sample Data:       [ 34 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]
min(sample_sums(phy_obj.low)) #  19385
min(taxa_sums(phy_obj.low)) # 0

# prune taxa that have zero sequence reads
phy_obj.low <- prune_taxa(taxa = taxa_sums(phy_obj.low) > 0, x = phy_obj.low)
phy_obj.low
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 608 taxa and 34 samples ]
# sample_data() Sample Data:       [ 34 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 608 taxa by 7 taxonomic ranks ]


table( phy_obj.low@sam_data$Time )
# Week 0 Week 7 
#     17     17

# adjust format to avoid errors
sel<- which(phy_obj.low@sam_data$Time == "Week 0")
phy_obj.low@sam_data$Time[sel] <- "Week_0"

sel<- which(phy_obj.low@sam_data$Time == "Week 7")
phy_obj.low@sam_data$Time[sel] <- "Week_7"


# convert to DESeq dataset
phy_obj.low.deseq <- phyloseq_to_deseq2(phy_obj.low, ~ Time)


# Deseq2 differntial abundance testing

deseq.low <- DESeq(phy_obj.low.deseq, test="Wald", fitType="parametric")
# estimating size factors
# estimating dispersions
# gene-wise dispersion estimates
# mean-dispersion relationship
# final dispersion estimates
# fitting model and testing
# -- replacing outliers and refitting for 64 genes
# -- DESeq argument 'minReplicatesForReplace' = 7 
# -- original counts are preserved in counts(dds)
# estimating dispersions
# fitting model and testing

alpha <- 0.05

res <- results(deseq.low)

sigtab.low <- res[which(res$padj < alpha), ]
sigtab.low <- cbind(as(sigtab.low, "data.frame"), as(tax_table(phy_obj.low)[rownames(sigtab.low), ], "matrix"))

dim(sigtab.low) #  34 13

names(sigtab.low)
# [1] "baseMean"       "log2FoldChange" "lfcSE"          "stat"           "pvalue"         "padj"           "Kingdom"        "Phylum"         "Class"         
# [10] "Order"          "Family"         "Genus"          "Species" 

sigtab.low[ order(sigtab.low$log2FoldChange) ,c( "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "Phylum", "Genus", "Species" )]
#            baseMean log2FoldChange     lfcSE      stat       pvalue         padj            Phylum              Genus          Species
# OTU_195    1.798550      -3.998211 1.1824695 -3.381238 7.216003e-04 1.357296e-02     p__Firmicutes   g__Streptococcus       s__unknown
# OTU_127  292.730303      -3.723245 0.6378793 -5.836913 5.317695e-09 5.251224e-07  p__Bacteroidetes     g__Bacteroides  s__acidifaciens
# OTU_93    24.291110      -3.633685 0.8401941 -4.324816 1.526594e-05 1.108976e-03  p__Bacteroidetes g__Parabacteroides       s__unknown
# OTU_719    7.689312      -3.179492 0.8619965 -3.688521 2.255612e-04 5.568543e-03  p__Bacteroidetes     g__Bacteroides       s__unknown
# OTU_2164   6.955018      -2.827016 0.7574858 -3.732105 1.898866e-04 5.357514e-03  p__Bacteroidetes         g__unknown       s__unknown
# OTU_142   28.696683      -2.533310 0.5934479 -4.268800 1.965274e-05 1.108976e-03  p__Bacteroidetes         g__unknown       s__unknown
# OTU_7626   5.442626      -2.491337 0.7886089 -3.159154 1.582280e-03 2.334734e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_3777   1.945114      -2.484125 0.7890626 -3.148198 1.642806e-03 2.334734e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_4743  22.107263      -2.447953 0.5859928 -4.177445 2.948021e-05 1.455586e-03  p__Bacteroidetes     g__Bacteroides  s__acidifaciens
# OTU_3216   5.400312      -2.444080 0.7475802 -3.269322 1.078056e-03 1.774300e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_3118   2.952988      -2.238462 0.6839070 -3.273051 1.063933e-03 1.774300e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_104   51.397059      -2.214126 0.6953415 -3.184228 1.451406e-03 2.293221e-02     p__Firmicutes         g__unknown       s__unknown
# OTU_2084  29.008145      -1.742054 0.4074178 -4.275841 1.904168e-05 1.108976e-03  p__Bacteroidetes         g__unknown       s__unknown
# OTU_7522  17.423007      -1.538813 0.4210844 -3.654405 2.577790e-04 5.980094e-03  p__Bacteroidetes         g__unknown       s__unknown
# OTU_22   523.474626      -1.389541 0.3864357 -3.595790 3.234090e-04 6.723503e-03  p__Bacteroidetes     g__Bacteroides  s__acidifaciens
# OTU_16   411.255144      -1.284971 0.3431829 -3.744274 1.809165e-04 5.357514e-03  p__Bacteroidetes         g__unknown       s__unknown
# OTU_42    62.786175      -1.230551 0.4093512 -3.006100 2.646220e-03 3.371796e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_101   14.072656       1.559269 0.5458072  2.856813 4.279183e-03 4.971403e-02     p__Firmicutes    g__Oscillospira       s__unknown
# OTU_2039  13.802541       1.630119 0.5416970  3.009283 2.618654e-03 3.371796e-02     p__Firmicutes         g__unknown       s__unknown
# OTU_59    50.128602       1.925324 0.5014552  3.839473 1.232985e-04 4.058577e-03     p__Firmicutes  g__Butyricicoccus s__pullicaecorum
# OTU_5079  17.873409       2.105226 0.6045289  3.482424 4.968964e-04 9.813703e-03     p__Firmicutes  g__[Ruminococcus]        s__gnavus
# OTU_5095  10.794074       2.345546 0.7570808  3.098145 1.947361e-03 2.652440e-02     p__Firmicutes    g__Oscillospira       s__unknown
# OTU_240   12.184679       2.404578 0.6130899  3.922063 8.779394e-05 3.365317e-03     p__Firmicutes     g__Coprococcus       s__unknown
# OTU_9639   2.130227       2.581988 0.8947042  2.885856 3.903501e-03 4.672373e-02     p__Firmicutes         g__unknown       s__unknown
# OTU_2998  97.930762       2.730555 0.6990120  3.906306 9.371770e-05 3.365317e-03     p__Firmicutes         g__unknown       s__unknown
# OTU_23    90.673046       2.838418 0.8500105  3.339275 8.399749e-04 1.508137e-02 p__Proteobacteria   g__Desulfovibrio       s__C21_c20
# OTU_5280   2.195330       2.844698 0.7814845  3.640120 2.725106e-04 5.980094e-03     p__Firmicutes         g__unknown       s__unknown
# OTU_8615   1.144007       2.912812 0.9258675  3.146035 1.655001e-03 2.334734e-02  p__Bacteroidetes         g__unknown       s__unknown
# OTU_273    4.573067       3.301106 1.1230775  2.939339 3.289130e-03 4.060020e-02     p__Firmicutes         g__unknown       s__unknown
# OTU_765   34.985087       3.533033 0.5361757  6.589319 4.418481e-11 1.524523e-08     p__Firmicutes  g__[Ruminococcus]        s__gnavus
# OTU_4273   6.201800       4.019900 0.6178781  6.505975 7.719105e-11 1.524523e-08     p__Firmicutes         g__unknown       s__unknown
# OTU_2280   4.242216       4.031989 0.9977289  4.041167 5.318584e-05 2.334267e-03     p__Firmicutes         g__unknown       s__unknown
# OTU_332    2.356973       4.722236 1.2764443  3.699523 2.160048e-04 5.568543e-03     p__Firmicutes    g__Oscillospira       s__unknown
# OTU_287    7.096759       5.298301 0.8897829  5.954599 2.607100e-09 3.432681e-07     p__Firmicutes         g__unknown       s__unknown




### High samples

# prune to High samples only
phy_obj.high <- prune_samples( phy_obj@sam_data$Treatment == "High", phy_obj )
phy_obj.high
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 36 samples ]
# sample_data() Sample Data:       [ 36 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]
min(sample_sums(phy_obj.high)) # 11450
min(taxa_sums(phy_obj.high)) # 0

# prune taxa that have zero sequence reads
phy_obj.high <- prune_taxa(taxa = taxa_sums(phy_obj.high) > 0, x = phy_obj.high)
phy_obj.high
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 622 taxa and 36 samples ]
# sample_data() Sample Data:       [ 36 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 622 taxa by 7 taxonomic ranks ]


table( phy_obj.high@sam_data$Time )
# Week 0 Week 7 
#     18     18

# adjust format to avoid errors
sel<- which(phy_obj.high@sam_data$Time == "Week 0")
phy_obj.high@sam_data$Time[sel] <- "Week_0"

sel<- which(phy_obj.high@sam_data$Time == "Week 7")
phy_obj.high@sam_data$Time[sel] <- "Week_7"


# convert to DESeq dataset
phy_obj.high.deseq <- phyloseq_to_deseq2(phy_obj.high, ~ Time)


# Deseq2 differential abundance testing
deseq.high <- DESeq(phy_obj.high.deseq, test="Wald", fitType="parametric")
# estimating size factors
# estimating dispersions
# gene-wise dispersion estimates
# mean-dispersion relationship
# final dispersion estimates
# fitting model and testing
# -- replacing outliers and refitting for 65 genes
# -- DESeq argument 'minReplicatesForReplace' = 7 
# -- original counts are preserved in counts(dds)
# estimating dispersions
# fitting model and testing

alpha <- 0.05

res <- results(deseq.high)

sigtab.high <- res[which(res$padj < alpha), ]
sigtab.high <- cbind(as(sigtab.high, "data.frame"), as(tax_table(phy_obj.high)[rownames(sigtab.high), ], "matrix"))

dim(sigtab.high) #   79 13

names(sigtab.high)
# [1] "baseMean"       "log2FoldChange" "lfcSE"          "stat"           "pvalue"         "padj"           "Kingdom"        "Phylum"        
# [9] "Class"          "Order"          "Family"         "Genus"          "Species" 

sigtab.high[ order(sigtab.high$log2FoldChange) ,c( "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj", "Phylum", "Genus", "Species" )]
#              baseMean log2FoldChange     lfcSE      stat       pvalue         padj             Phylum              Genus           Species
# OTU_4681     2.043530      -4.079136 1.3724732 -2.972106 2.957641e-03 1.982544e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_195      0.985394      -3.234906 1.0376998 -3.117381 1.824655e-03 1.349615e-02      p__Firmicutes   g__Streptococcus        s__unknown
# OTU_719     17.246642      -3.005322 0.9063864 -3.315719 9.140776e-04 7.842785e-03   p__Bacteroidetes     g__Bacteroides        s__unknown
# OTU_4086     1.080848      -2.781731 1.0266076 -2.709634 6.735742e-03 3.802149e-02   p__Bacteroidetes     g__Bacteroides        s__unknown
# OTU_3777     2.064294      -2.504568 0.6299288 -3.975954 7.009754e-05 1.073994e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_127    148.218087      -2.414155 0.5209501 -4.634139 3.584264e-06 1.098321e-04   p__Bacteroidetes     g__Bacteroides   s__acidifaciens
# OTU_93       9.876403      -2.101165 0.7811775 -2.689741 7.150755e-03 3.983992e-02   p__Bacteroidetes g__Parabacteroides        s__unknown
# OTU_7626     4.076220      -2.050871 0.5449074 -3.763705 1.674141e-04 2.052018e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_4743    20.534163      -1.986233 0.7036741 -2.822660 4.762710e-03 2.877750e-02   p__Bacteroidetes     g__Bacteroides   s__acidifaciens
# OTU_2     2851.883968      -1.915679 0.4954935 -3.866204 1.105426e-04 1.529767e-03      p__Firmicutes   g__Lactobacillus        s__unknown
# OTU_21     201.608421      -1.864232 0.6369436 -2.926840 3.424246e-03 2.192539e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_5415     4.801449      -1.722225 0.5339671 -3.225340 1.258232e-03 9.995953e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_24     116.627087      -1.616744 0.5253827 -3.077269 2.089066e-03 1.516822e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_3242    68.242178      -1.548283 0.3153554 -4.909645 9.124154e-07 3.558420e-05   p__Bacteroidetes         g__unknown        s__unknown
# OTU_2084    36.388399      -1.536603 0.4028264 -3.814555 1.364284e-04 1.721405e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_2846     7.681438      -1.499266 0.4714395 -3.180186 1.471803e-03 1.148007e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_22     572.559066      -1.218268 0.3451008 -3.530180 4.152775e-04 4.453852e-03   p__Bacteroidetes     g__Bacteroides   s__acidifaciens
# OTU_2041   106.962104      -1.130011 0.3282132 -3.442918 5.754748e-04 5.647830e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_2544    87.397303      -1.023581 0.3613387 -2.832747 4.614986e-03 2.877750e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_16     446.315924      -0.958834 0.2786383 -3.441142 5.792646e-04 5.647830e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_29     122.220729       1.038246 0.3121266  3.326361 8.798796e-04 7.703436e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_60     207.945848       1.307496 0.4943108  2.645089 8.166945e-03 4.434961e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_31     153.055939       1.729748 0.4519056  3.827676 1.293590e-04 1.681666e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_15     585.861078       1.754188 0.5079705  3.453326 5.537193e-04 5.647830e-03      p__Firmicutes  g__[Ruminococcus]         s__gnavus
# OTU_105     42.083956       1.761243 0.5893318  2.988543 2.803113e-03 1.913636e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_38     127.999115       1.777875 0.5421738  3.279161 1.041164e-03 8.589600e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_46      38.816777       1.845908 0.6227928  2.963920 3.037477e-03 2.004735e-02      p__Firmicutes  g__[Ruminococcus]         s__gnavus
# OTU_2750    11.509058       1.944012 0.7034053  2.763715 5.714737e-03 3.358386e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_91      11.563760       1.955792 0.5511539  3.548541 3.873720e-04 4.261092e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_3456     3.869395       2.047931 0.5256209  3.896212 9.770883e-05 1.397236e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_3876     3.314630       2.055860 0.6557929  3.134923 1.718994e-03 1.293769e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_556     15.699203       2.076692 0.7620504  2.725138 6.427465e-03 3.676510e-02      p__Firmicutes    g__Oscillospira        s__unknown
# OTU_101     10.908299       2.165301 0.5063512  4.276282 1.900406e-05 3.396976e-04      p__Firmicutes    g__Oscillospira        s__unknown
# OTU_5520     6.523735       2.198683 0.5704933  3.854004 1.162017e-04 1.557829e-03      p__Firmicutes           g__Dorea        s__unknown
# OTU_115     18.198058       2.302950 0.5159827  4.463230 8.073313e-06 2.164657e-04      p__Firmicutes    g__Ruminococcus        s__unknown
# OTU_6509     6.358935       2.362478 0.7086561  3.333743 8.568565e-04 7.658155e-03   p__Bacteroidetes         g__unknown        s__unknown
# OTU_5915     1.177450       2.381407 0.8584033  2.774229 5.533273e-03 3.296908e-02   p__Bacteroidetes         g__unknown        s__unknown
# OTU_2419   259.000746       2.406283 0.7030403  3.422682 6.200654e-04 5.911290e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_4273     5.971702       2.524909 0.8934620  2.825983 4.713571e-03 2.877750e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_4483     2.462432       2.596384 0.9430368  2.753216 5.901292e-03 3.421155e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_9762    30.654538       2.629714 0.8597311  3.058763 2.222526e-03 1.563055e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_64      35.953911       2.692941 0.8328833  3.233275 1.223797e-03 9.905826e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_5765     2.648711       2.697816 0.8207166  3.287146 1.012082e-03 8.513395e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_37      50.345166       2.749703 0.7729584  3.557375 3.745788e-04 4.228797e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_3669    16.180306       2.749773 0.6679435  4.116775 3.842113e-05 6.339486e-04      p__Firmicutes  g__[Ruminococcus]         s__gnavus
# OTU_2039    11.747794       2.786055 0.5206581  5.351027 8.745664e-08 4.168766e-06      p__Firmicutes         g__unknown        s__unknown
# OTU_4387     1.369493       2.846796 0.9528173  2.987767 2.810235e-03 1.913636e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_5079    15.740123       2.976611 0.6383231  4.663174 3.113699e-06 1.027521e-04      p__Firmicutes  g__[Ruminococcus]         s__gnavus
# OTU_287      4.452519       3.031707 0.8785625  3.450758 5.590145e-04 5.647830e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_109     13.644146       3.062056 0.9138082  3.350873 8.055711e-04 7.352979e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_1     2956.272933       3.082037 0.6992884  4.407391 1.046232e-05 2.406540e-04      p__Firmicutes         g__unknown        s__unknown
# OTU_5375     2.348260       3.158170 1.1166597  2.828230 4.680615e-03 2.877750e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_196      8.286051       3.207144 0.7490111  4.281838 1.853562e-05 3.396976e-04      p__Firmicutes         g__unknown        s__unknown
# OTU_765     32.960477       3.208995 0.7305768  4.392413 1.120997e-05 2.406540e-04      p__Firmicutes  g__[Ruminococcus]         s__gnavus
# OTU_10092    1.804068       3.269152 1.1068356  2.953603 3.140882e-03 2.041573e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_139      9.465414       3.292801 0.7232097  4.553037 5.287690e-06 1.512279e-04      p__Firmicutes   g__Anaerotruncus        s__unknown
# OTU_125      4.681573       3.445857 0.7845344  4.392232 1.121930e-05 2.406540e-04      p__Firmicutes    g__Ruminococcus        s__unknown
# OTU_7377    12.186049       3.469706 0.8587130  4.040588 5.331731e-05 8.471528e-04   p__Bacteroidetes         g__unknown        s__unknown
# OTU_9265    18.151516       3.621745 0.8288836  4.369426 1.245738e-05 2.544865e-04   p__Bacteroidetes         g__unknown        s__unknown
# OTU_75       6.957835       3.654925 1.1894892  3.072684 2.121430e-03 1.516822e-02 p__Deferribacteres   g__Mucispirillum     s__schaedleri
# OTU_6190     2.233411       3.659486 1.0744411  3.405944 6.593580e-04 6.149230e-03      p__Firmicutes     g__Clostridium s__methylpentosum
# OTU_2998    35.340798       3.670364 0.6399754  5.735164 9.741831e-09 5.970351e-07      p__Firmicutes         g__unknown        s__unknown
# OTU_9505     2.924012       3.906800 0.8962776  4.358917 1.307075e-05 2.548796e-04      p__Firmicutes         g__unknown        s__unknown
# OTU_9785     1.981434       4.044040 1.5080439  2.681646 7.326086e-03 4.029347e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_3408     1.646883       4.059738 1.1208914  3.621884 2.924656e-04 3.485215e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_82       8.882978       4.060109 0.8480131  4.787790 1.686281e-06 6.028454e-05  p__Actinobacteria         g__unknown        s__unknown
# OTU_23      84.607384       4.112387 0.7798565  5.273261 1.340206e-07 5.749486e-06  p__Proteobacteria   g__Desulfovibrio        s__C21_c20
# OTU_26     215.515354       4.145496 0.9836592  4.214362 2.504851e-05 4.298325e-04   p__Bacteroidetes         g__unknown        s__unknown
# OTU_5790     3.186627       4.174087 1.0696575  3.902265 9.529673e-05 1.397236e-03      p__Firmicutes         g__unknown        s__unknown
# OTU_35     146.533118       4.174864 1.1581124  3.604887 3.122893e-04 3.620868e-03   p__Bacteroidetes      g__Prevotella        s__unknown
# OTU_2760    15.676237       4.256497 0.7123156  5.975577 2.292769e-09 1.937759e-07      p__Firmicutes         g__unknown        s__unknown
# OTU_173      5.366274       4.295784 0.8016890  5.358416 8.395459e-08 4.168766e-06      p__Firmicutes         g__unknown        s__unknown
# OTU_1737    11.412714       4.310080 0.5945543  7.249263 4.190462e-13 8.988542e-11      p__Firmicutes         g__unknown        s__unknown
# OTU_616     59.092899       4.311497 0.6840614  6.302792 2.923319e-10 4.180346e-08      p__Firmicutes         g__unknown        s__unknown
# OTU_481    137.340586       4.322291 0.7266485  5.948256 2.710153e-09 1.937759e-07      p__Firmicutes         g__unknown        s__unknown
# OTU_3188     2.279943       4.371347 1.3849341  3.156358 1.597528e-03 1.223821e-02      p__Firmicutes         g__unknown        s__unknown
# OTU_3004     7.934426       4.524324 1.0239869  4.418342 9.946115e-06 2.406540e-04      p__Firmicutes         g__unknown        s__unknown
# OTU_2280     9.365913       5.649581 0.9478189  5.960612 2.512947e-09 1.937759e-07      p__Firmicutes         g__unknown        s__unknown
# OTU_79      13.129839      23.166880 2.9259099  7.917838 2.416762e-15 1.036791e-12   p__Bacteroidetes      g__Prevotella        s__unknown




### compare control / low / high scenarios

## Control

taxa.sig.diffabun.cont <- row.names(sigtab.cont)
length(taxa.sig.diffabun.cont) # 75

# Increasing
sel <- which(sigtab.cont$log2FoldChange > 0) # qty 55
taxa.sig.diffabun.Inc.cont <- row.names(sigtab.cont)[sel]
taxa.sig.diffabun.Inc.cont
# [1] "OTU_1"     "OTU_15"    "OTU_19"    "OTU_23"    "OTU_46"    "OTU_57"    "OTU_59"    "OTU_64"    "OTU_67"    "OTU_75"    "OTU_101"  
# [12] "OTU_125"   "OTU_130"   "OTU_133"   "OTU_143"   "OTU_169"   "OTU_201"   "OTU_202"   "OTU_247"   "OTU_254"   "OTU_258"   "OTU_273"  
# [23] "OTU_402"   "OTU_419"   "OTU_481"   "OTU_531"   "OTU_616"   "OTU_765"   "OTU_2039"  "OTU_2280"  "OTU_2384"  "OTU_2419"  "OTU_2760" 
# [34] "OTU_2934"  "OTU_2998"  "OTU_3004"  "OTU_3188"  "OTU_3671"  "OTU_3795"  "OTU_4034"  "OTU_4077"  "OTU_4273"  "OTU_4387"  "OTU_4483" 
# [45] "OTU_4702"  "OTU_5280"  "OTU_5971"  "OTU_6190"  "OTU_6689"  "OTU_7119"  "OTU_8231"  "OTU_9505"  "OTU_9762"  "OTU_9785"  "OTU_10092"

# Decreasing
sel <- which(sigtab.cont$log2FoldChange < 0) # qty 20
taxa.sig.diffabun.Dec.cont <- row.names(sigtab.cont)[sel]
taxa.sig.diffabun.Dec.cont
# [1] "OTU_2"    "OTU_7"    "OTU_16"   "OTU_22"   "OTU_45"   "OTU_81"   "OTU_112"  "OTU_127"  "OTU_142"  "OTU_2041" "OTU_2084" "OTU_2544"
# [13] "OTU_2846" "OTU_2913" "OTU_3242" "OTU_3406" "OTU_3777" "OTU_3988" "OTU_5415" "OTU_7626"


# how many Phyla in increasing?
length(unique((sigtab.cont$Phylum[ which(sigtab.cont$log2FoldChange > 0) ]))) # 6



## Low

taxa.sig.diffabun.low <- row.names(sigtab.low)
length(taxa.sig.diffabun.low) # 34

# Increasing
sel <- which(sigtab.low$log2FoldChange > 0) # qty 17
taxa.sig.diffabun.Inc.low <- row.names(sigtab.low)[sel]
taxa.sig.diffabun.Inc.low
# [1] "OTU_23"   "OTU_59"   "OTU_101"  "OTU_240"  "OTU_273"  "OTU_287"  "OTU_332"  "OTU_765"  "OTU_2039" "OTU_2280" "OTU_2998" "OTU_4273"
# [13] "OTU_5079" "OTU_5095" "OTU_5280" "OTU_8615" "OTU_9639"

# Decreasing
sel <- which(sigtab.low$log2FoldChange < 0) # qty 17
taxa.sig.diffabun.Dec.low <- row.names(sigtab.low)[sel]
taxa.sig.diffabun.Dec.low
# [1] "OTU_16"   "OTU_22"   "OTU_42"   "OTU_93"   "OTU_104"  "OTU_127"  "OTU_142"  "OTU_195"  "OTU_719"  "OTU_2084" "OTU_2164" "OTU_3118"
# [13] "OTU_3216" "OTU_3777" "OTU_4743" "OTU_7522" "OTU_7626"

# how many Phyla in increasing?
length(unique((sigtab.low$Phylum[ which(sigtab.low$log2FoldChange > 0) ]))) # 3



## High

taxa.sig.diffabun.high <- row.names(sigtab.high)
length(taxa.sig.diffabun.high) # 79

# Increasing
sel <- which(sigtab.high$log2FoldChange > 0) # qty 59
taxa.sig.diffabun.Inc.high <- row.names(sigtab.high)[sel]
taxa.sig.diffabun.Inc.high
# [1] "OTU_1"     "OTU_15"    "OTU_23"    "OTU_26"    "OTU_29"    "OTU_31"    "OTU_35"    "OTU_37"    "OTU_38"    "OTU_46"    "OTU_60"   
# [12] "OTU_64"    "OTU_75"    "OTU_79"    "OTU_82"    "OTU_91"    "OTU_101"   "OTU_105"   "OTU_109"   "OTU_115"   "OTU_125"   "OTU_139"  
# [23] "OTU_173"   "OTU_196"   "OTU_287"   "OTU_481"   "OTU_556"   "OTU_616"   "OTU_765"   "OTU_1737"  "OTU_2039"  "OTU_2280"  "OTU_2419" 
# [34] "OTU_2750"  "OTU_2760"  "OTU_2998"  "OTU_3004"  "OTU_3188"  "OTU_3408"  "OTU_3456"  "OTU_3669"  "OTU_3876"  "OTU_4273"  "OTU_4387" 
# [45] "OTU_4483"  "OTU_5079"  "OTU_5375"  "OTU_5520"  "OTU_5765"  "OTU_5790"  "OTU_5915"  "OTU_6190"  "OTU_6509"  "OTU_7377"  "OTU_9265" 
# [56] "OTU_9505"  "OTU_9762"  "OTU_9785"  "OTU_10092"

# Decreasing
sel <- which(sigtab.high$log2FoldChange < 0) # qty 20
taxa.sig.diffabun.Dec.high <- row.names(sigtab.high)[sel]
taxa.sig.diffabun.Dec.high
# [1] "OTU_2"    "OTU_16"   "OTU_21"   "OTU_22"   "OTU_24"   "OTU_93"   "OTU_127"  "OTU_195"  "OTU_719"  "OTU_2041" "OTU_2084" "OTU_2544"
# [13] "OTU_2846" "OTU_3242" "OTU_3777" "OTU_4086" "OTU_4681" "OTU_4743" "OTU_5415" "OTU_7626"


# how many Phyla in increasing?
length(unique((sigtab.high$Phylum[ which(sigtab.high$log2FoldChange > 0) ]))) # 5





### Euler diagrams to show overlaps for Increasing / Decreasing taxa among different Treatments

### Increasing taxa

# taxa.sig.diffabun.Inc.cont
# taxa.sig.diffabun.Inc.low
# taxa.sig.diffabun.Inc.high


x = list( "Control" = taxa.sig.diffabun.Inc.cont, "Low" = taxa.sig.diffabun.Inc.low, "High" = taxa.sig.diffabun.Inc.high)
overlap.inc <- calculate.overlap(x)

as.numeric(lapply(x, FUN = length)) # 55 17 59 - i.e. Control 55, Low 17, High 59
as.numeric(lapply(overlap.inc, FUN = length)) #   7  3 19  2 26  5 31

str(overlap.inc)
# List of 7
# $ a5: chr [1:7] "OTU_23" "OTU_101" "OTU_765" "OTU_2039" ...
# $ a2: chr [1:3] "OTU_59" "OTU_273" "OTU_5280"
# $ a4: chr [1:19] "OTU_1" "OTU_15" "OTU_46" "OTU_64" ...
# $ a6: chr [1:2] "OTU_287" "OTU_5079"
# $ a1: chr [1:26] "OTU_19" "OTU_57" "OTU_67" "OTU_130" ...
# $ a3: chr [1:5] "OTU_240" "OTU_332" "OTU_5095" "OTU_8615" ...
# $ a7: chr [1:31] "OTU_26" "OTU_29" "OTU_31" "OTU_35" ...


# Control 55, Low 17, High 59

# Control&Low
length(which(x[[1]] %in% x[[2]])) # 10
x[[1]][which(x[[1]] %in% x[[2]])]
#  "OTU_23"   "OTU_59"   "OTU_101"  "OTU_273"  "OTU_765"  "OTU_2039" "OTU_2280" "OTU_2998" "OTU_4273" "OTU_5280"

# Control&High
length(which(x[[1]] %in% x[[3]])) # 26

# Low&High
length(which(x[[2]] %in% x[[3]])) # 9

# Control&Low&High
sel <- which(x[[1]] %in% x[[2]])
length(which(x[[1]][sel] %in% x[[3]])) # 7


## use 'eulerr' package

combo <- c("Control" = 55-(10-7)-(26-7)-7, "Low" = 17-(10-7)-(9-7)-7, "High" = 59-(26-7)-(9-7)-7,
           "Control&Low" = 10-7, "Control&High" = 26-7, "Low&High" = 9-7,  "Control&Low&High" = 7)
fit.circ.inc <- euler(combo)
fit.circ.inc
plot(fit.circ.inc)
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue


plot(fit.circ.inc, quantities = TRUE, fills = list(fill = cols, alpha = 0.3) )
# https://stat.ethz.ch/R-manual/R-devel/library/grid/html/unit.html

grid.text(label = "A", x = unit(0.05, "npc") , y = unit(0.95,"npc"), gp=gpar(fontsize=16, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Euler-diagram-Increasing-diff-abun-taxa-by-Treatments-vFINAL-A.tif"), width = 7, height = 10, units = "cm", res=600, compression="lzw")



## taxa that are only increasing in High:
overlap.inc$a7
# [1] "OTU_26"   "OTU_29"   "OTU_31"   "OTU_35"   "OTU_37"   "OTU_38"   "OTU_60"   "OTU_79"   "OTU_82"   "OTU_91"   "OTU_105" 
# [12] "OTU_109"  "OTU_115"  "OTU_139"  "OTU_173"  "OTU_196"  "OTU_556"  "OTU_1737" "OTU_2750" "OTU_3408" "OTU_3456" "OTU_3669"
# [23] "OTU_3876" "OTU_5375" "OTU_5520" "OTU_5765" "OTU_5790" "OTU_5915" "OTU_6509" "OTU_7377" "OTU_9265"
write.csv(overlap.inc$a7,file="SigDiffAbun-High-only-vFINAL1.csv", row.names = FALSE)

sel <- which( !(x[[3]] %in% x[[1]]) & !(x[[3]] %in% x[[2]]))
x[[3]][sel]
# [1] "OTU_26"   "OTU_29"   "OTU_31"   "OTU_35"   "OTU_37"   "OTU_38"   "OTU_60"   "OTU_79"   "OTU_82"   "OTU_91"   "OTU_105" 
# [12] "OTU_109"  "OTU_115"  "OTU_139"  "OTU_173"  "OTU_196"  "OTU_556"  "OTU_1737" "OTU_2750" "OTU_3408" "OTU_3456" "OTU_3669"
# [23] "OTU_3876" "OTU_5375" "OTU_5520" "OTU_5765" "OTU_5790" "OTU_5915" "OTU_6509" "OTU_7377" "OTU_9265"


## taxa that are only increasing in Low:
sel <- which( !(x[[2]] %in% x[[1]]) & !(x[[2]] %in% x[[3]]))
x[[2]][sel]
# [1] "OTU_240"  "OTU_332"  "OTU_5095" "OTU_8615" "OTU_9639"
overlap.inc$a3
# [1] "OTU_240"  "OTU_332"  "OTU_5095" "OTU_8615" "OTU_9639"



## taxa that are only increasing in Control:
sel <- which( !(x[[1]] %in% x[[2]]) & !(x[[1]] %in% x[[3]]))
x[[1]][sel]
# [1] "OTU_19"   "OTU_57"   "OTU_67"   "OTU_130"  "OTU_133"  "OTU_143"  "OTU_169"  "OTU_201"  "OTU_202"  "OTU_247"  "OTU_254" 
# [12] "OTU_258"  "OTU_402"  "OTU_419"  "OTU_531"  "OTU_2384" "OTU_2934" "OTU_3671" "OTU_3795" "OTU_4034" "OTU_4077" "OTU_4702"
# [23] "OTU_5971" "OTU_6689" "OTU_7119" "OTU_8231"
overlap.inc$a1
# [1] "OTU_19"   "OTU_57"   "OTU_67"   "OTU_130"  "OTU_133"  "OTU_143"  "OTU_169"  "OTU_201"  "OTU_202"  "OTU_247"  "OTU_254" 
# [12] "OTU_258"  "OTU_402"  "OTU_419"  "OTU_531"  "OTU_2384" "OTU_2934" "OTU_3671" "OTU_3795" "OTU_4034" "OTU_4077" "OTU_4702"
# [23] "OTU_5971" "OTU_6689" "OTU_7119" "OTU_8231"







## Decreasing taxa

# taxa.sig.diffabun.Dec.cont
# taxa.sig.diffabun.Dec.low
# taxa.sig.diffabun.Dec.high


x = list( "Control" = taxa.sig.diffabun.Dec.cont, "Low" = taxa.sig.diffabun.Dec.low, "High" = taxa.sig.diffabun.Dec.high)
overlap.dec <- calculate.overlap(x)

as.numeric(lapply(x, FUN = length)) #  20 17 20
as.numeric(lapply(overlap.dec, FUN = length)) # 6 1 6 4 7 6 4

str(overlap.dec)
# List of 7
# $ a5: chr [1:6] "OTU_16" "OTU_22" "OTU_127" "OTU_2084" ...
# $ a2: chr "OTU_142"
# $ a4: chr [1:6] "OTU_2" "OTU_2041" "OTU_2544" "OTU_2846" ...
# $ a6: chr [1:4] "OTU_93" "OTU_195" "OTU_719" "OTU_4743"
# $ a1: chr [1:7] "OTU_7" "OTU_45" "OTU_81" "OTU_112" ...
# $ a3: chr [1:6] "OTU_42" "OTU_104" "OTU_2164" "OTU_3118" ...
# $ a7: chr [1:4] "OTU_21" "OTU_24" "OTU_4086" "OTU_4681"

# i.e. Control 20, Low 17, High 20

# Control&Low
length(which(x[[1]] %in% x[[2]])) # 7

# Control&High
length(which(x[[1]] %in% x[[3]])) # 12

# Low&High
length(which(x[[2]] %in% x[[3]])) # 10

# Control&Low&High
sel <- which(x[[1]] %in% x[[2]])
length(which(x[[1]][sel] %in% x[[3]])) # 6


## use 'eulerr' package

combo <- c("Control" = 20-(7-6)-(12-6)-6, "Low" = 21-(7-6)-(10-6)-6, "High" = 20-(12-6)-(10-6)-6,
           "Control&Low" = 7-6, "Control&High" = 12-6, "Low&High" = 10-6,  "Control&Low&High" = 6)
fit.circ.dec <- euler(combo)
fit.circ.dec
plot(fit.circ.dec)
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue


plot(fit.circ.dec, quantities = TRUE, fills = list(fill = cols, alpha = 0.3) )
# https://stat.ethz.ch/R-manual/R-devel/library/grid/html/unit.html

grid.text(label = "B", x = unit(0.05, "npc") , y = unit(0.95,"npc"), gp=gpar(fontsize=16, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Euler-diagram-Decreasing-diff-abun-taxa-by-Treatments-decontam-vFINAL-B.tif"), width = 6, height = 10, units = "cm", res=600, compression="lzw")









### Heatmap visualisation of the significantly increasing taxa in Week 7 fecal samples

## use previously cleaned phyloseq object for Week 7 fecal samples

fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 645 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 645 taxa by 7 taxonomic ranks ]


## isolate only significant differentially abundant increasing taxa from each Treatment

phy_obj <- fecalw7.dc.16s


## Control
# subselect Week 7
phy_obj.cont.w7 <- prune_samples( phy_obj@sam_data$Treatment == "Control", phy_obj )
# subselect taxa that are significantly differentially abundant and increasing
phy_obj.cont.w7.Inc <- prune_taxa( phy_obj.cont.w7, taxa = taxa.sig.diffabun.Inc.cont )

## Low
# subselect Week 7
phy_obj.low.w7 <- prune_samples( phy_obj@sam_data$Treatment == "Low", phy_obj )
# subselect taxa that are significantly differentially abundant and increasing
phy_obj.low.w7.Inc <- prune_taxa( phy_obj.low.w7, taxa = taxa.sig.diffabun.Inc.low )

## High
# subselect Week 7
phy_obj.high.w7 <- prune_samples( phy_obj@sam_data$Treatment == "High", phy_obj )
# subselect taxa that are significantly differentially abundant and increasing
phy_obj.high.w7.Inc <- prune_taxa( phy_obj.high.w7, taxa = taxa.sig.diffabun.Inc.high )


## 2. Merge the filtered Week 7 samples
merge.phy_obj.w7.Inc <- merge_phyloseq(phy_obj.cont.w7.Inc, phy_obj.low.w7.Inc)   # step 1
merge.phy_obj.w7.Inc <- merge_phyloseq(merge.phy_obj.w7.Inc, phy_obj.high.w7.Inc) # step 2

min(sample_sums(merge.phy_obj.w7.Inc)) # 181
min(taxa_sums(merge.phy_obj.w7.Inc)) # 31

sort(sample_sums(merge.phy_obj.w7.Inc))
# L5m3T16 L2m3T16 L5m1T16 L4m2T16 L5m2T16 L2m1T16 L4m1T16 L1m3T16 L6m2T16 L2m2T16 L3m1T16 L1m1T16 C2m3T16 L4m3T16 L3m3T16 L6m1T16 
# 181     223     283     311     348     407     453     571     728     744     747     767     771     909     959    1191 
# L3m2T16 H3m2T16 L6m3T16 C1m1T16 H4m2T16 H6m1T16 C1m3T16 H5m1T16 H3m3T16 C2m1T16 H6m2T16 C1m2T16 H1m1T16 C3m3T16 H6m3T16 H2m1T16 
# 1372    1438    1622    1891    3556    3563    3567    4103    4180    4229    5089    5120    5548    5750    6059    6497 
# C2m2T16 H3m1T16 C4m2T16 H1m2T16 C3m2T16 C4m1T16 C3m1T16 C6m2T16 C5m3T16 H4m3T16 C5m1T16 C5m2T16 C4m3T16 H1m3T16 H4m1T16 H2m3T16 
# 7616    7684    8028    8272    9219    9286    9553   11153   11520   14358   14952   15158   15381   15565   15814   16732 
# H5m2T16 H2m2T16 H5m3T16 C6m1T16 C6m3T16 
# 17812   18516   20252   23901   23937 



## Normalise by converting to relative abundance ...

relabun.merge.phy_obj.w7.Inc <- transform_sample_counts(merge.phy_obj.w7.Inc, function(x) x / sum(x) )

relabun.merge.phy_obj.w7.Inc
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 93 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 93 taxa by 7 taxonomic ranks ]

dim(relabun.merge.phy_obj.w7.Inc@otu_table) # 93 taxa x  53 samples


mat0 <- as.matrix(as.data.frame(relabun.merge.phy_obj.w7.Inc@otu_table))
mat0 <- t(mat0) # now samples are rows, taxa are columns
dim(mat0) # 53  93

ok <- complete.cases(mat0)
sel <- which(ok==FALSE) # empty, i.e. all rows are complete, no NAs

row.names(mat0)
# [1] "C1m3T16" "C3m3T16" "C5m3T16" "C6m1T16" "C4m1T16" "C2m1T16" "C4m2T16" "C6m2T16" "C2m2T16" "C3m1T16" "C1m1T16" "C5m1T16"
# [13] "C1m2T16" "C3m2T16" "C5m2T16" "C4m3T16" "C6m3T16" "C2m3T16" "L5m3T16" "L3m3T16" "L1m3T16" "L2m2T16" "L6m2T16" "L4m2T16"
# [25] "L2m1T16" "L4m1T16" "L6m1T16" "L5m2T16" "L3m2T16" "L5m1T16" "L1m1T16" "L3m1T16" "L2m3T16" "L6m3T16" "L4m3T16" "H5m1T16"
# [37] "H3m1T16" "H1m1T16" "H5m2T16" "H1m2T16" "H3m2T16" "H2m3T16" "H4m3T16" "H6m3T16" "H5m3T16" "H1m3T16" "H3m3T16" "H2m1T16"
# [49] "H6m1T16" "H4m1T16" "H2m2T16" "H4m2T16" "H6m2T16"

temp <- mat0

# convert for display purposes. Now all are week 7 (T16) so we can delete this
row.names(mat0) <- gsub(pattern = "T16", replacement = "", x = row.names(mat0))

row.names(mat0)
# [1] "C1m3" "C3m3" "C5m3" "C6m1" "C4m1" "C2m1" "C4m2" "C6m2" "C2m2" "C3m1" "C1m1" "C5m1" "C1m2" "C3m2" "C5m2" "C4m3" "C6m3" "C2m3"
# [19] "L5m3" "L3m3" "L1m3" "L2m2" "L6m2" "L4m2" "L2m1" "L4m1" "L6m1" "L5m2" "L3m2" "L5m1" "L1m1" "L3m1" "L2m3" "L6m3" "L4m3" "H5m1"
# [37] "H3m1" "H1m1" "H5m2" "H1m2" "H3m2" "H2m3" "H4m3" "H6m3" "H5m3" "H1m3" "H3m3" "H2m1" "H6m1" "H4m1" "H2m2" "H4m2" "H6m2"

## convert to z-scores
mat0.zcol <- base::scale(mat0) # default scaling method scales the columns of a numeric matrix
mat0[1:5,1:5]
mat0.zcol[1:5,1:5] # scales within taxa (by column)

mat0.zcol.zrow <- t( base::scale(t(mat0.zcol)) )
mat0.zcol.zrow[1:5, 1:5] # then scale within samples (by row)

class(mat0.zcol.zrow) # matrix

mat <- mat0.zcol.zrow


heatmap.2(mat, trace="none",col=colorRampPalette(c("blue","white","red"))(20), margins = c(5,5))


getwd() # "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling"


heatmap.2(mat, trace = "none", dendrogram = "both",
          col=colorRampPalette(c("red","black","green"))(20),
          rowsep=c(17,35),
          margins = c(2,5), # cexRow = 1, 
          key.title = "Z-score", keysize = 1.5, 
          xlab = "OTU relative abundance (z-score)",
          labCol = "",
          RowSideColors=c(
            # Control  ="#f46d43",  # orange
            # Low      ="#66c2a5", # green-blue
            # High     ="#5e4fa2", dark-blue
            
            "C1m3"="#f46d43", "C3m3"="#f46d43", "C5m3"="#f46d43", "C6m1"="#f46d43", "C4m1"="#f46d43", "C2m1"="#f46d43", "C4m2"="#f46d43", "C6m2"="#f46d43", "C2m2"="#f46d43",
            "C3m1"="#f46d43", "C1m1"="#f46d43", "C5m1"="#f46d43", "C1m2"="#f46d43", "C3m2"="#f46d43", "C5m2"="#f46d43", "C4m3"="#f46d43", "C6m3"="#f46d43", "C2m3"="#f46d43",
            "L5m3"="#66c2a5", "L3m3"="#66c2a5", "L1m3"="#66c2a5", "L2m2"="#66c2a5", "L6m2"="#66c2a5", "L4m2"="#66c2a5", "L2m1"="#66c2a5", "L4m1"="#66c2a5", "L6m1"="#66c2a5",
            "L5m2"="#66c2a5", "L3m2"="#66c2a5", "L5m1"="#66c2a5", "L1m1"="#66c2a5", "L3m1"="#66c2a5", "L2m3"="#66c2a5", "L6m3"="#66c2a5", "L4m3"="#66c2a5",
            "H5m1"="#5e4fa2", "H3m1"="#5e4fa2", "H1m1"="#5e4fa2", "H5m2"="#5e4fa2", "H1m2"="#5e4fa2", "H3m2"="#5e4fa2", "H2m3"="#5e4fa2", "H4m3"="#5e4fa2", "H6m3"="#5e4fa2",
            "H5m3"="#5e4fa2", "H1m3"="#5e4fa2", "H3m3"="#5e4fa2", "H2m1"="#5e4fa2", "H6m1"="#5e4fa2", "H4m1"="#5e4fa2", "H2m2"="#5e4fa2", "H4m2"="#5e4fa2", "H6m2"="#5e4fa2"
  
          )
)


legend(x = 0.2, y = 1.07, xpd = TRUE,     # top left
       legend = c(
         "Control", "Low", "High"
       ),
       col = c(
         "Control"  ="#f46d43",  # orange
         "Low"      = "#66c2a5", # green-blue
         "High"     ="#5e4fa2"   # dark-blue
       ), 
       lty= 1,             
       lwd = 5,           
       cex=.6 ,
       xjust = 0,  # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified
       yjust = 0  # how the legend is to be justified relative to the legend x location. A value of 0 means left justified, 0.5 means centered and 1 means right justified
)


dev.print(tiff, file=paste0("plots/","Heatmap-Sig-DiffAbun-Increasing-taxa-zcol-zrow-both-dendograms-vFINAL.tif"), width = 18.3, height = 16, units = "cm", res = 600, compression = "lzw")

#------------------------






#### Get closest NCBI match / putative species
#------------------------

# https://rdrr.io/github/mhahsler/rBLAST/man/BLAST.html
# Accessed 2019-01-25
# ftp://ftp.ncbi.nlm.nih.gov/blast/db
# ftp://ftp.ncbi.nlm.nih.gov/blast/db/README
# DATABASE COPY
# ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz  (Dated 20/01/2019)
# EXECUTABLE COPY
# Install software from:
# ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.8.1+-win64.exe  (Dated 27/11/2018)
# call using    -db 16SMicrobial

# Record Expectation (E) value - use threshold of 10e-10 to denote significant matches (Pearson 2013)


## fasta sequences are available from 

seqs_df <- read_excel(path= paste0(datadir,"/","OTU-rep-sequences.xlsx"),
                      sheet=1, range="B1:C10485", col_names = TRUE)

seqs_df <- as.data.frame(seqs_df)
str(seqs_df)

## Find interpretable putative species assignment from NCBI bacterial 16S database

names(seqs_df)
#  "subject" "OTU-seq"

head(seqs_df[ ,c("subject","OTU-seq")])
# subject
# 1   OTU_1
# 2   OTU_2
# 3   OTU_3
# 4   OTU_4
# 5   OTU_5
# 6   OTU_6
# OTU-seq
# 1                          TGGGGGATATTGCACAATGGGGGGAACCCTGATGCAGCGACGCCGCGTGGGCGATGGAGTGCTTCGGCATGTAAAGCCCTATCAGCAGGGAAGAAAATGACGGTACCTGACTAAGAAGCCCCGGCTAAATACGTGCCAGCAGCCGCGGTAATACGTATGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGCCTGGCAAGCCTGATGTGAAATACCGGGGCCCAACCCCGGGGCTGCATTGGGAACTGCCAGGCTGGAGTGCCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGGTGACTGACGCTGAGGCTCGAAAGCGTGGGGAGCAAACAGG
# 2 TAGGGAATCTTCCACAATGGGCGAAAGCCTGATGGAGCAACGCCGCGTGGGTGAAGAAGGTCTTCGGATCGTAAAACCCTGTTGTTAGAGAAGAAAGTGCGTGAGAGTAACTGTTCACGTTTCGACGGTATCTAACCAGAAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGATTTATTGGGCGTAAAGGGAACGCAGGCGGTCTTTTAAGTCTGATGTGAAAGCCTTCGGCTTAACCGGAGTAGTGCATTGGAAACTGGGAGACTTGAGTGCAGAAGAGGAGAGTGGAACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGTTCGAAAGCGTGGGTAGCAAACAGG
# 3                          TGGGGAATATTGCACAATGGGGGAAACCCTGATGCAGCGACGCCGCGTGAGTGAAGAAGTATTTCGGTATGTAAAGCTCTATCAGCAGGGAAGAAAATGACGGTACCTGATTAAGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAGCGCAAGTCTGAAGTGAAATGCCGGGGCTTAACCCCGGAACTGCTTTGGAAACTGTGCAGCTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTGGGGAGCAAACAGG
# 4        TGGGGAATATTGCACAATGGGGGAAACCCTGATGCAGCGACGCCGCGTGAGTGAAGAAGTATTTCGGTATGTAAAGCTCTATCAGCAGGGAAGAAATAGCGACCTTTGGGTCGCCAGACGGTACCTGATTAAGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAGCGCAAGTCTGAAGTGAAATGCCGGGGCTTAACCCCGGAACTGCTTTGGAAACTGTGCAGCTAGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACGTTGAGGCTCGAAAGCGTGGGGAGCAAACAGG
# 5                         TGGGGAATATTGCACAATGGGGGAAACCCTGATGCAGCGACGCCGCGTGAGTGAAGAAGTATTTCGGTATGTAAAGCTCTATCAGCAGGGAAGAAAAAAGACGGTACCTGACTAAGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGCGGCAAGTCTGAAGTGAAAGGCAGGGGCTTAACCCCTGAACTGCTTTGGAAACTGCCATGCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTGGGGAGCAAACAGG
# 6 TAGGGAATCTTCCACAATGGACGCAAGTCTGATGGAGCAACGCCGCGTGAGTGAAGAAGGGTTTCGGCTCGTAAAGCTCTGTTGGTAGTGAAGAAAGATAGAGGTAGTAACTGGCCTTTATTTGACGGTAATTACCTAGAAAGTCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGTGCAGGCGGTTCAATAAGTCTGATGTGAAAGCCTTCGGCTCAACCGGAGAATTGCATCAGAAACTGTTGAACTTGAGTGCAGAAGAGGAGAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTCTCTGGTCTGCAACTGACGCTGAGGCTCGAAAGCATGGGTAGCGAACAGG


## copied from above
x = list( "Control" = taxa.sig.diffabun.Inc.cont, "Low" = taxa.sig.diffabun.Inc.low, "High" = taxa.sig.diffabun.Inc.high)
overlap.inc <- calculate.overlap(x)

## taxa that are only increasing in High:
overlap.inc$a7
# [1] "OTU_26"   "OTU_29"   "OTU_31"   "OTU_35"   "OTU_37"   "OTU_38"   "OTU_60"   "OTU_79"   "OTU_82"  
# [10] "OTU_91"   "OTU_105"  "OTU_109"  "OTU_115"  "OTU_139"  "OTU_173"  "OTU_196"  "OTU_556"  "OTU_1737"
# [19] "OTU_2750" "OTU_3408" "OTU_3456" "OTU_3669" "OTU_3876" "OTU_5375" "OTU_5520" "OTU_5765" "OTU_5790"
# [28] "OTU_5915" "OTU_6509" "OTU_7377" "OTU_9265"

sel <- which( !(x[[3]] %in% x[[1]]) & !(x[[3]] %in% x[[2]]))
x[[3]][sel]
# [1] "OTU_26"   "OTU_29"   "OTU_31"   "OTU_35"   "OTU_37"   "OTU_38"   "OTU_60"   "OTU_79"   "OTU_82"  
# [10] "OTU_91"   "OTU_105"  "OTU_109"  "OTU_115"  "OTU_139"  "OTU_173"  "OTU_196"  "OTU_556"  "OTU_1737"
# [19] "OTU_2750" "OTU_3408" "OTU_3456" "OTU_3669" "OTU_3876" "OTU_5375" "OTU_5520" "OTU_5765" "OTU_5790"
# [28] "OTU_5915" "OTU_6509" "OTU_7377" "OTU_9265"

taxa.sig.ONLY.High <- overlap.inc$a7



## taxa that are only increasing in Low:
sel <- which( !(x[[2]] %in% x[[1]]) & !(x[[2]] %in% x[[3]]))
x[[2]][sel]
# "OTU_240"  "OTU_332"  "OTU_5095" "OTU_8615" "OTU_9639"
overlap.inc$a3
#  "OTU_240"  "OTU_332"  "OTU_5095" "OTU_8615" "OTU_9639"

taxa.sig.ONLY.Low <- overlap.inc$a3



## taxa that are only increasing in Control:
sel <- which( !(x[[1]] %in% x[[2]]) & !(x[[1]] %in% x[[3]]))
x[[1]][sel]
# [1] "OTU_19"   "OTU_57"   "OTU_67"   "OTU_130"  "OTU_133"  "OTU_143"  "OTU_169"  "OTU_201"  "OTU_202" 
# [10] "OTU_247"  "OTU_254"  "OTU_258"  "OTU_402"  "OTU_419"  "OTU_531"  "OTU_2384" "OTU_2934" "OTU_3671"
# [19] "OTU_3795" "OTU_4034" "OTU_4077" "OTU_4702" "OTU_5971" "OTU_6689" "OTU_7119" "OTU_8231"
overlap.inc$a1
# [1] "OTU_19"   "OTU_57"   "OTU_67"   "OTU_130"  "OTU_133"  "OTU_143"  "OTU_169"  "OTU_201"  "OTU_202" 
# [10] "OTU_247"  "OTU_254"  "OTU_258"  "OTU_402"  "OTU_419"  "OTU_531"  "OTU_2384" "OTU_2934" "OTU_3671"
# [19] "OTU_3795" "OTU_4034" "OTU_4077" "OTU_4702" "OTU_5971" "OTU_6689" "OTU_7119" "OTU_8231"

taxa.sig.ONLY.Control <- overlap.inc$a1




## load a BLAST database (replace db with the location + name of the BLAST DB)
# downloaded db from: ftp://ftp.ncbi.nlm.nih.gov/blast/db/16SMicrobial.tar.gz  (Dated 20/01/2019)
untar("C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobial.tar.gz", exdir="C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB")

# blastn.exe is here: "C:/Program Files/NCBI/blast-2.8.1+/bin"

Sys.getenv("PATH") # includes: C:\\Program Files\\NCBI\\blast-2.8.1+\\bin
# [1] "C:\\R\\R-3.5.1\\bin\\x64;C:\\Program Files (x86)\\Common Files\\Oracle\\Java\\javapath;C:\\Program Files\\ImageMagick-7.0.6-Q16;C:\\ProgramData\\Oracle\\Java\\javapath;C:\\WINDOWS\\system32;C:\\WINDOWS;C:\\WINDOWS\\System32\\Wbem;C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\;C:\\Program Files\\PuTTY\\;C:\\ArcGIS\\EsriProductionMapping\\Desktop10.3\\Bin;C:\\ArcGIS\\Bin;C:\\Program Files (x86)\\Sennheiser\\SoftphoneSDK\\;C:\\Program Files\\NCBI\\blast-2.8.1+\\bin;C:\\Users\\a1026871\\AppData\\Local\\Microsoft\\WindowsApps;C:\\Users\\a1026871\\AppData\\Local\\Pandoc\\;C:\\Program Files\\snap\\bin;C:\\Python27;C:\\Sen2Cor-02.05.05-win64"

Sys.which("blastn")
# blastn 
# "C:\\Program Files\\NCBI\\BLAST-~1.1_\\bin\\blastn.exe"



system(command = "blastdbcmd -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -info")
# Database: 16S Microbial Sequences
# 20,792 sequences; 30,234,980 total bases
# Date: Jan 20, 2019  12:00 AM	Longest sequence: 3,600 bases
# BLASTDB Version: 4
# Volumes:
#   C:\Workspace\DATA\NCBI_BLAST_16S\16SMicrobialDB\16SMicrobial



##  set up text_query.txt and adapt this!
# system(command = "blastn -query text_query.txt -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -out output.txt")



## assign closest NCBI match to significantly increasing taxa

# store temporary blast results here
blast_dir <- "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling/blast_outputs_temp"



### HIGH


# clear previous results
#unlink(file.path(blast_dir) , recursive=TRUE)

# store results in dataframe. Some fields as placeholders to be filled later
#df_blast <- data.frame(otu=taxa.sig.ONLY.High, score_bits=NA, expect=NA, pident=NA, closest_ncbi_match=NA )

df_blast <- data.frame(otu=taxa.sig.ONLY.High, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.ONLY.High) = 31
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  # blast output formats here: https://www.ncbi.nlm.nih.gov/books/NBK279684/

  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
  
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}


df_blast.High <- df_blast


df_blast.High[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#         otu                                            closest_ncbi_match score_bits expect pident
# 1    OTU_26 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        466     NA     87
# 2    OTU_29 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        562     NA     91
# 3    OTU_31 NR_134772.1 Murimonas intestini strain SRB-530-5-H 16S riboso        647     NA     96
# 4    OTU_35 NR_041364.1 Prevotella stercorea strain CB35 16S ribosomal RN        584     NA     92
# 5    OTU_37 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        521     NA     97  #
# 6    OTU_38 NR_117905.1 Acetatifactor muris strain CT-m2 16S ribosomal RN        630     NA     95
# 7    OTU_60 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        658     NA     96  #
# 8    OTU_79 NR_113117.1 Prevotella oralis strain JCM 12251 16S ribosomal         545     NA     90
# 9    OTU_82 NR_116938.1 Olsenella profusa strain DSM 13989 16S ribosomal         675     NA     96
# 10   OTU_91 NR_113271.1 Alistipes indistinctus strain JCM 16068 16S ribos        662     NA     95
# 11  OTU_105 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        529     NA     89
# 12  OTU_109 NR_026493.1 [Clostridium] indolis strain 7 16S ribosomal RNA         558     NA     91
# 13  OTU_115 NR_144748.1 Phocea massiliensis strain Marseille-P2769 16S ri        573     NA     92
# 14  OTU_139 NR_027558.1 Anaerotruncus colihominis strain WAL 14565 16S ri        651     NA     96
# 15  OTU_173 NR_118554.1 Intestinimonas butyriciproducens strain SRB-521-5        592     NA     93
# 16  OTU_196 NR_115340.1 Saccharofermentans acetigenes strain P6 16S ribos        414     NA     85
# 17  OTU_556 NR_118156.1 Oscillibacter ruminantium strain GH1 16S ribosoma        612     NA     94
# 18 OTU_1737 NR_117905.1 Acetatifactor muris strain CT-m2 16S ribosomal RN        614     NA     94
# 19 OTU_2750 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        556     NA     90
# 20 OTU_3408 NR_115711.1 Clostridium cavendishii strain BL-28 16S ribosoma        411     NA     85
# 21 OTU_3456 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        540     NA     90
# 22 OTU_3669 NR_144731.1 Eisenbergiella massiliensis strain AT11 16S ribos        586     NA     92
# 23 OTU_3876 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        562     NA     91
# 24 OTU_5375 NR_036800.1 [Ruminococcus] gnavus strain ATCC 29149 16S ribos        564     NA     92
# 25 OTU_5520 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        658     NA     96  #
# 26 OTU_5765 NR_104899.1 Bacteroides xylanolyticus strain X5-1 16S ribosom        569     NA     92
# 27 OTU_5790 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        483     NA     95  #
# 28 OTU_5915 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        584     NA     92
# 29 OTU_6509 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        505     NA     88
# 30 OTU_7377 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        505     NA     88
# 31 OTU_9265 NR_144616.1 Muribaculum intestinale strain YL27 16S ribosomal        488     NA     88

write.csv(df_blast.High, file = "blast-results-High-vFINAL1.csv", row.names = FALSE)






### LOW

# clear previous results - OR move to storage folder
#unlink(file.path(blast_dir) , recursive=TRUE)

# store results in dataframe. Some fields as placeholders to be filled later
df_blast <- data.frame(otu=taxa.sig.ONLY.Low, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.ONLY.Low) = 5
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
    
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}


df_blast.Low <- df_blast


df_blast.Low[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#        otu                                            closest_ncbi_match score_bits expect pident
# 1  OTU_240 NR_144608.1 Cuneatibacter caecimuris strain BARN-424-CC-10 16        654     NA     96
# 2  OTU_332 NR_144611.1 Flintibacter butyricus strain BLS21 16S ribosomal        573     NA     92
# 3 OTU_5095 NR_147370.1 Pseudoflavonifractor phocaeensis strain Marseille        680     NA     97
# 4 OTU_8615 NR_144750.1 Prevotellamassilia timonensis strain Marseille-P2        457     NA     86
# 5 OTU_9639 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        603     NA     94


write.csv(df_blast.Low, file = "blast-results-Low-vFINAL1.csv", row.names = FALSE)




### CONTROL

# clear previous results - OR move to storage folder
#unlink(file.path(blast_dir) , recursive=TRUE)

# store results in dataframe. Some fields as placeholders to be filled later
df_blast <- data.frame(otu=taxa.sig.ONLY.Control, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.ONLY.Control) = 26
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  # blast output formats here: https://www.ncbi.nlm.nih.gov/books/NBK279684/

  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
    
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}


df_blast.Cont <- df_blast


df_blast.Cont[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#         otu                                            closest_ncbi_match score_bits expect pident
# 1    OTU_19 NR_074515.1 Bacteroides vulgatus strain ATCC 8482 16S ribosom        784     NA      0
# 2    OTU_57 NR_044648.2 [Eubacterium] tortuosum strain ATCC 25548 16S rib        586     NA     91
# 3    OTU_67 NR_074793.2 Oscillibacter valericigenes strain Sjm18-20 16S r        645     NA     95
# 4   OTU_130 NR_144749.1 Ihubacter massiliensis strain Marseille-P2843 16S        693     NA     98
# 5   OTU_133 NR_119085.1 [Clostridium] polysaccharolyticum strain DSM 1801        634     NA     95
# 6   OTU_143 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        630     NA     95
# 7   OTU_169 NR_025162.1 Tindallia californiensis strain APO 16S ribosomal        259     NA     79
# 8   OTU_201 NR_028911.1 Anaerovorax odorimutans strain NorPut 16S ribosom        568     NA     92
# 9   OTU_202 NR_042825.1 Paraeggerthella hongkongensis strain HKU10 16S ri        580     NA     93
# 10  OTU_247 NR_042007.1 Roseburia inulinivorans strain A2-194 16S ribosom        593     NA     93
# 11  OTU_254 NR_114789.1 [Clostridium] leptum strain DSM 753 16S ribosomal        619     NA     94
# 12  OTU_258 NR_074629.1 Ruminiclostridium thermocellum strain ATCC 27405         520     NA     90
# 13  OTU_402 NR_148620.1 Lachnotalea glycerini strain : DLD10 16S ribosoma        658     NA     96
# 14  OTU_419 NR_117905.1 Acetatifactor muris strain CT-m2 16S ribosomal RN        597     NA     93
# 15  OTU_531 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        686     NA     97
# 16 OTU_2384 NR_026493.1 [Clostridium] indolis strain 7 16S ribosomal RNA         614     NA     94
# 17 OTU_2934 NR_036800.1 [Ruminococcus] gnavus strain ATCC 29149 16S ribos        608     NA     94
# 18 OTU_3671 NR_116747.1 Ruminococcus faecis strain Eg2 16S ribosomal RNA         542     NA     91
# 19 OTU_3795 NR_074793.2 Oscillibacter valericigenes strain Sjm18-20 16S r        590     NA     93
# 20 OTU_4034 NR_156907.1 Faecalimonas umbilicata strain EGH7 16S ribosomal        603     NA     94
# 21 OTU_4077 NR_026493.1 [Clostridium] indolis strain 7 16S ribosomal RNA         592     NA     93
# 22 OTU_4702 NR_152688.1 Falcatimonas natans strain WN011 16S ribosomal RN        597     NA     93
# 23 OTU_5971 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        569     NA     92
# 24 OTU_6689 NR_036800.1 [Ruminococcus] gnavus strain ATCC 29149 16S ribos        619     NA     94
# 25 OTU_7119 NR_042152.1 Marvinbryantia formatexigens strain I-52 16S ribo        401     NA     85
# 26 OTU_8231 NR_028785.1 [Clostridium] scindens strain ATCC 35704 16S ribo        390     NA     91

write.csv(df_blast.Cont, file = "blast-results-Cont-vFINAL1.csv", row.names = FALSE)

#------------------------





#### Add stats on % OTU Rel Abun Air, Soil, Fecal, Cecal
#------------------------

## add values for max % OTU Rel Abun in soil, Week 7 fecal, cecal samples
## add to
temp.df_blast <- list()
temp.df_blast[["Cont"]] <- df_blast.Cont; temp.df_blast[["Low"]] <- df_blast.Low; temp.df_blast[["High"]] <- df_blast.High



# # # # # # # # # # # # # # 
# Control
# # # # # # # # # # # # # # 

## Air - Control
air.w1w6.Cont.dc.16s <- prune_samples( air.dc.16s@sam_data$Treatment == "Control", air.dc.16s )
min(sample_sums(air.w1w6.Cont.dc.16s)) #  1681
min(taxa_sums(air.w1w6.Cont.dc.16s)) #  0

# prune taxa that have zero sequence reads
air.w1w6.Cont.dc.16s <- prune_taxa(taxa = taxa_sums(air.w1w6.Cont.dc.16s) > 0, x = air.w1w6.Cont.dc.16s)
air.w1w6.Cont.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 504 taxa and 24 samples ]
# sample_data() Sample Data:       [ 24 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 504 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.air.w1w6.Cont.dc.16s <- transform_sample_counts(air.w1w6.Cont.dc.16s, function(x) 100*x / sum(x) )
relabun.air.w1w6.Cont.dc.16s



## No Control soils



## Fecal Week 0 - Control
fecalw0.Cont.dc.16s <- prune_samples( fecalw0.dc.16s@sam_data$Treatment == "Control", fecalw0.dc.16s )
min(sample_sums(fecalw0.Cont.dc.16s)) #  20718
min(taxa_sums(fecalw0.Cont.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw0.Cont.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw0.Cont.dc.16s) > 0, x = fecalw0.Cont.dc.16s )
fecalw0.Cont.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 665 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 665 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw0.Cont.dc.16s <- transform_sample_counts(fecalw0.Cont.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw0.Cont.dc.16s



## Fecal Week 7 - Control
fecalw7.Cont.dc.16s <- prune_samples( fecalw7.dc.16s@sam_data$Treatment == "Control", fecalw7.dc.16s )
min(sample_sums(fecalw7.Cont.dc.16s)) #  22492
min(taxa_sums(fecalw7.Cont.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw7.Cont.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw7.Cont.dc.16s) > 0, x = fecalw7.Cont.dc.16s )
fecalw7.Cont.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 566 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 566 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw7.Cont.dc.16s <- transform_sample_counts(fecalw7.Cont.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw7.Cont.dc.16s



## Cecal - Control
cecal.Cont.dc.16s <- prune_samples( cecal.dc.16s@sam_data$Treatment == "Control", cecal.dc.16s )
min(sample_sums(cecal.Cont.dc.16s)) # 36868
min(taxa_sums(cecal.Cont.dc.16s)) # 0

# prune taxa that have zero sequence reads
cecal.Cont.dc.16s <- prune_taxa( taxa = taxa_sums(cecal.Cont.dc.16s) > 0, x = cecal.Cont.dc.16s )
cecal.Cont.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 512 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 512 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.cecal.Cont.dc.16s <- transform_sample_counts(cecal.Cont.dc.16s, function(x) 100*x / sum(x) )
relabun.cecal.Cont.dc.16s




# # # # # # # # # # # # # # 
# Low
# # # # # # # # # # # # # # 

## Air - Low
air.w1w6.Low.dc.16s <- prune_samples( air.dc.16s@sam_data$Treatment == "Low", air.dc.16s )
min(sample_sums(air.w1w6.Low.dc.16s)) #  1307
min(taxa_sums(air.w1w6.Low.dc.16s)) #  0

# prune taxa that have zero sequence reads
air.w1w6.Low.dc.16s <- prune_taxa(taxa = taxa_sums(air.w1w6.Low.dc.16s) > 0, x = air.w1w6.Low.dc.16s)
air.w1w6.Low.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 1675 taxa and 24 samples ]
# sample_data() Sample Data:       [ 24 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 1675 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.air.w1w6.Low.dc.16s <- transform_sample_counts(air.w1w6.Low.dc.16s, function(x) 100*x / sum(x) )
relabun.air.w1w6.Low.dc.16s





## Soils - Low
soil.Low.dc.16s <- prune_samples( soil.dc.16s@sam_data$Treatment == "Low", soil.dc.16s )
min(sample_sums(soil.Low.dc.16s)) #  15550
min(taxa_sums(soil.Low.dc.16s)) #  0

# prune taxa that have zero sequence reads
soil.Low.dc.16s <- prune_taxa(taxa = taxa_sums(soil.Low.dc.16s) > 0, x = soil.Low.dc.16s)
soil.Low.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2145 taxa and 33 samples ]
# sample_data() Sample Data:       [ 33 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2145 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.soil.Low.dc.16s <- transform_sample_counts(soil.Low.dc.16s, function(x) 100*x / sum(x) )
relabun.soil.Low.dc.16s


taxa_in_soil.Low <- taxa_names(soil.Low.dc.16s)



## Fecal Week 0 - Low
fecalw0.Low.dc.16s <- prune_samples( fecalw0.dc.16s@sam_data$Treatment == "Low", fecalw0.dc.16s )
min(sample_sums(fecalw0.Low.dc.16s)) #  23952
min(taxa_sums(fecalw0.Low.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw0.Low.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw0.Low.dc.16s) > 0, x = fecalw0.Low.dc.16s )
fecalw0.Low.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 571 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 571 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw0.Low.dc.16s <- transform_sample_counts(fecalw0.Low.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw0.Low.dc.16s




## Fecal Week 7 - Low
fecalw7.Low.dc.16s <- prune_samples( fecalw7.dc.16s@sam_data$Treatment == "Low", fecalw7.dc.16s )
min(sample_sums(fecalw7.Low.dc.16s)) # 19385
min(taxa_sums(fecalw7.Low.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw7.Low.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw7.Low.dc.16s) > 0, x = fecalw7.Low.dc.16s )
fecalw7.Low.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 560 taxa and 17 samples ]
# sample_data() Sample Data:       [ 17 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 560 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw7.Low.dc.16s <- transform_sample_counts(fecalw7.Low.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw7.Low.dc.16s



## Cecal - Low
cecal.Low.dc.16s <- prune_samples( cecal.dc.16s@sam_data$Treatment == "Low", cecal.dc.16s )
min(sample_sums(cecal.Low.dc.16s)) # 14632
min(taxa_sums(cecal.Low.dc.16s)) # 0

# prune taxa that have zero sequence reads
cecal.Low.dc.16s <- prune_taxa( taxa = taxa_sums(cecal.Low.dc.16s) > 0, x = cecal.Low.dc.16s )
cecal.Low.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 489 taxa and 17 samples ]
# sample_data() Sample Data:       [ 17 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 489 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.cecal.Low.dc.16s <- transform_sample_counts(cecal.Low.dc.16s, function(x) 100*x / sum(x) )
relabun.cecal.Low.dc.16s




# # # # # # # # # # # # # # 
# High
# # # # # # # # # # # # # # 

## Air - High
air.w1w6.High.dc.16s <- prune_samples( air.dc.16s@sam_data$Treatment == "High", air.dc.16s )
min(sample_sums(air.w1w6.High.dc.16s)) #  1400
min(taxa_sums(air.w1w6.High.dc.16s)) #  0

# prune taxa that have zero sequence reads
air.w1w6.High.dc.16s <- prune_taxa(taxa = taxa_sums(air.w1w6.High.dc.16s) > 0, x = air.w1w6.High.dc.16s)
air.w1w6.High.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 1562 taxa and 24 samples ]
# sample_data() Sample Data:       [ 24 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 1562 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.air.w1w6.High.dc.16s <- transform_sample_counts(air.w1w6.High.dc.16s, function(x) 100*x / sum(x) )
relabun.air.w1w6.High.dc.16s





## Soils - High
soil.High.dc.16s <- prune_samples( soil.dc.16s@sam_data$Treatment == "High", soil.dc.16s )
min(sample_sums(soil.High.dc.16s)) #  12524
min(taxa_sums(soil.High.dc.16s)) #  0

# prune taxa that have zero sequence reads
soil.High.dc.16s <- prune_taxa(taxa = taxa_sums(soil.High.dc.16s) > 0, x = soil.High.dc.16s)
soil.High.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2125 taxa and 32 samples ]
# sample_data() Sample Data:       [ 32 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2125 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.soil.High.dc.16s <- transform_sample_counts(soil.High.dc.16s, function(x) 100*x / sum(x) )
relabun.soil.High.dc.16s


taxa_in_soil.High <- taxa_names(soil.High.dc.16s)




## Fecal Week 0 - High
fecalw0.High.dc.16s <- prune_samples( fecalw0.dc.16s@sam_data$Treatment == "High", fecalw0.dc.16s )
min(sample_sums(fecalw0.High.dc.16s)) #  11450
min(taxa_sums(fecalw0.High.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw0.High.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw0.High.dc.16s) > 0, x = fecalw0.High.dc.16s )
fecalw0.High.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 562 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 562 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw0.High.dc.16s <- transform_sample_counts(fecalw0.High.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw0.High.dc.16s




## Fecal Week 7 - High
fecalw7.High.dc.16s <- prune_samples( fecalw7.dc.16s@sam_data$Treatment == "High", fecalw7.dc.16s )
min(sample_sums(fecalw7.High.dc.16s)) #  14195
min(taxa_sums(fecalw7.High.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw7.High.dc.16s <- prune_taxa( taxa = taxa_sums(fecalw7.High.dc.16s) > 0, x = fecalw7.High.dc.16s )
fecalw7.High.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 573 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 573 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.fecalw7.High.dc.16s <- transform_sample_counts(fecalw7.High.dc.16s, function(x) 100*x / sum(x) )
relabun.fecalw7.High.dc.16s



## Cecal - High
cecal.High.dc.16s <- prune_samples( cecal.dc.16s@sam_data$Treatment == "High", cecal.dc.16s )
min(sample_sums(cecal.High.dc.16s)) # 40098
min(taxa_sums(cecal.High.dc.16s)) # 0

# prune taxa that have zero sequence reads
cecal.High.dc.16s <- prune_taxa( taxa = taxa_sums(cecal.High.dc.16s) > 0, x = cecal.High.dc.16s )
cecal.High.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 501 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 501 taxa by 7 taxonomic ranks ]

# % Relative abundance
relabun.cecal.High.dc.16s <- transform_sample_counts(cecal.High.dc.16s, function(x) 100*x / sum(x) )
relabun.cecal.High.dc.16s




## loop through OTUs in each dataframe to add % OTU Rel Abun for: soils, air, fecal Week 7, cecal
## capture % OTU Rel Abun as: median (range)

## Control
#df_blast.Cont <- temp.df_blast[["Cont"]]

df_blast.Cont$rel_abun_airw1w6_mean <-NA
df_blast.Cont$rel_abun_airw1w6_min <-NA
df_blast.Cont$rel_abun_airw1w6_max <-NA
df_blast.Cont$rel_abun_airw1w6_n_nonzero <-NA
df_blast.Cont$rel_abun_airw1w6_n <-NA

df_blast.Cont$rel_abun_fecalw0_mean <-NA
df_blast.Cont$rel_abun_fecalw0_min <-NA
df_blast.Cont$rel_abun_fecalw0_max <-NA
df_blast.Cont$rel_abun_fecalw0_n_nonzero <-NA
df_blast.Cont$rel_abun_fecalw0_n <-NA

df_blast.Cont$rel_abun_fecalw7_mean <-NA
df_blast.Cont$rel_abun_fecalw7_min <-NA
df_blast.Cont$rel_abun_fecalw7_max <-NA
df_blast.Cont$rel_abun_fecalw7_n_nonzero <-NA
df_blast.Cont$rel_abun_fecalw7_n <-NA

df_blast.Cont$rel_abun_cecal_mean <-NA
df_blast.Cont$rel_abun_cecal_min <-NA
df_blast.Cont$rel_abun_cecal_max <-NA
df_blast.Cont$rel_abun_cecal_n_nonzero <-NA
df_blast.Cont$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.Cont$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.Cont$otu[i] )
  
  sel <- which( row.names(clean.16s@tax_table) == this_otu)
  if (length(sel)==1){
    df_blast.Cont$Phylum[i] <- sub(pattern="p__", replacement="", x=as.character(clean.16s@tax_table[sel, "Phylum"]) )
  }
  
  sel <- which( row.names(relabun.air.w1w6.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.air.w1w6.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont$rel_abun_airw1w6_mean[i] <- mean( x ) # [subsel]
    df_blast.Cont$rel_abun_airw1w6_min[i] <- min( x )
    df_blast.Cont$rel_abun_airw1w6_max[i] <- max( x )
    df_blast.Cont$rel_abun_airw1w6_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont$rel_abun_airw1w6_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw0.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont$rel_abun_fecalw0_mean[i] <- mean( x ) # [subsel]
    df_blast.Cont$rel_abun_fecalw0_min[i] <- min( x )
    df_blast.Cont$rel_abun_fecalw0_max[i] <- max( x )
    df_blast.Cont$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont$rel_abun_fecalw0_n[i] <- length( x )
  }

  sel <- which( row.names(relabun.fecalw7.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont$rel_abun_fecalw7_mean[i] <- mean( x )  # [subsel]
    df_blast.Cont$rel_abun_fecalw7_min[i] <- min( x )
    df_blast.Cont$rel_abun_fecalw7_max[i] <- max( x )
    df_blast.Cont$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont$rel_abun_cecal_mean[i] <- mean( x )  # [subsel]
    df_blast.Cont$rel_abun_cecal_min[i] <- min( x )
    df_blast.Cont$rel_abun_cecal_max[i] <- max( x )
    df_blast.Cont$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}






## Low
#df_blast.Low <- temp.df_blast[["Low"]]


df_blast.Low$rel_abun_soil_mean <-NA
df_blast.Low$rel_abun_soil_min <-NA
df_blast.Low$rel_abun_soil_max <-NA
df_blast.Low$rel_abun_soil_n_nonzero <-NA
df_blast.Low$rel_abun_soil_n <-NA

df_blast.Low$rel_abun_airw1w6_mean <-NA
df_blast.Low$rel_abun_airw1w6_min <-NA
df_blast.Low$rel_abun_airw1w6_max <-NA
df_blast.Low$rel_abun_airw1w6_n_nonzero <-NA
df_blast.Low$rel_abun_airw1w6_n <-NA

df_blast.Low$rel_abun_fecalw0_mean <-NA
df_blast.Low$rel_abun_fecalw0_min <-NA
df_blast.Low$rel_abun_fecalw0_max <-NA
df_blast.Low$rel_abun_fecalw0_n_nonzero <-NA
df_blast.Low$rel_abun_fecalw0_n <-NA

df_blast.Low$rel_abun_fecalw7_mean <-NA
df_blast.Low$rel_abun_fecalw7_min <-NA
df_blast.Low$rel_abun_fecalw7_max <-NA
df_blast.Low$rel_abun_fecalw7_n_nonzero <-NA
df_blast.Low$rel_abun_fecalw7_n <-NA

df_blast.Low$rel_abun_cecal_mean <-NA
df_blast.Low$rel_abun_cecal_min <-NA
df_blast.Low$rel_abun_cecal_max <-NA
df_blast.Low$rel_abun_cecal_n_nonzero <-NA
df_blast.Low$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.Low$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.Low$otu[i] )
  
  sel <- which( row.names(clean.16s@tax_table) == this_otu)
  if (length(sel)==1){
    df_blast.Low$Phylum[i] <- sub(pattern="p__", replacement="", x=as.character(clean.16s@tax_table[sel, "Phylum"]) )
  }
  
  sel <- which( row.names(relabun.soil.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.soil.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low$rel_abun_soil_mean[i] <- mean( x )  # [subsel]
    df_blast.Low$rel_abun_soil_min[i] <- min( x )
    df_blast.Low$rel_abun_soil_max[i] <- max( x )
    df_blast.Low$rel_abun_soil_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low$rel_abun_soil_n[i] <- length( x )
  }
  
  
  sel <- which( row.names(relabun.air.w1w6.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.air.w1w6.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low$rel_abun_airw1w6_mean[i] <- mean( x )  # [subsel]
    df_blast.Low$rel_abun_airw1w6_min[i] <- min( x )
    df_blast.Low$rel_abun_airw1w6_max[i] <- max( x )
    df_blast.Low$rel_abun_airw1w6_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low$rel_abun_airw1w6_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw0.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low$rel_abun_fecalw0_mean[i] <- mean( x )  # [subsel]
    df_blast.Low$rel_abun_fecalw0_min[i] <- min( x )
    df_blast.Low$rel_abun_fecalw0_max[i] <- max( x )
    df_blast.Low$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low$rel_abun_fecalw0_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw7.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low$rel_abun_fecalw7_mean[i] <- mean( x )  # [subsel]
    df_blast.Low$rel_abun_fecalw7_min[i] <- min( x )
    df_blast.Low$rel_abun_fecalw7_max[i] <- max( x )
    df_blast.Low$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low$rel_abun_cecal_mean[i] <- mean( x )  # [subsel]
    df_blast.Low$rel_abun_cecal_min[i] <- min( x )
    df_blast.Low$rel_abun_cecal_max[i] <- max( x )
    df_blast.Low$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}






## High
#df_blast.High <- temp.df_blast[["High"]]


df_blast.High$rel_abun_soil_mean <-NA
df_blast.High$rel_abun_soil_min <-NA
df_blast.High$rel_abun_soil_max <-NA
df_blast.High$rel_abun_soil_n_nonzero <-NA
df_blast.High$rel_abun_soil_n <-NA

df_blast.High$rel_abun_airw1w6_mean <-NA
df_blast.High$rel_abun_airw1w6_min <-NA
df_blast.High$rel_abun_airw1w6_max <-NA
df_blast.High$rel_abun_airw1w6_n_nonzero <-NA
df_blast.High$rel_abun_airw1w6_n <-NA

df_blast.High$rel_abun_fecalw0_mean <-NA
df_blast.High$rel_abun_fecalw0_min <-NA
df_blast.High$rel_abun_fecalw0_max <-NA
df_blast.High$rel_abun_fecalw0_n_nonzero <-NA
df_blast.High$rel_abun_fecalw0_n <-NA

df_blast.High$rel_abun_fecalw7_mean <-NA
df_blast.High$rel_abun_fecalw7_min <-NA
df_blast.High$rel_abun_fecalw7_max <-NA
df_blast.High$rel_abun_fecalw7_n_nonzero <-NA
df_blast.High$rel_abun_fecalw7_n <-NA

df_blast.High$rel_abun_cecal_mean <-NA
df_blast.High$rel_abun_cecal_min <-NA
df_blast.High$rel_abun_cecal_max <-NA
df_blast.High$rel_abun_cecal_n_nonzero <-NA
df_blast.High$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.High$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.High$otu[i] )
  
  sel <- which( row.names(clean.16s@tax_table) == this_otu)
  if (length(sel)==1){
    df_blast.High$Phylum[i] <- sub(pattern="p__", replacement="", x=as.character(clean.16s@tax_table[sel, "Phylum"]) )
  }
  
  sel <- which( row.names(relabun.soil.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.soil.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High$rel_abun_soil_mean[i] <- mean( x )  # [subsel]
    df_blast.High$rel_abun_soil_min[i] <- min( x )
    df_blast.High$rel_abun_soil_max[i] <- max( x )
    df_blast.High$rel_abun_soil_n_nonzero[i] <- length( x[subsel] )
    df_blast.High$rel_abun_soil_n[i] <- length( x )
  }
  
  
  sel <- which( row.names(relabun.air.w1w6.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.air.w1w6.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High$rel_abun_airw1w6_mean[i] <- mean( x )  # [subsel]
    df_blast.High$rel_abun_airw1w6_min[i] <- min( x )
    df_blast.High$rel_abun_airw1w6_max[i] <- max( x )
    df_blast.High$rel_abun_airw1w6_n_nonzero[i] <- length( x[subsel] )
    df_blast.High$rel_abun_airw1w6_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw0.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High$rel_abun_fecalw0_mean[i] <- mean( x )  # [subsel]
    df_blast.High$rel_abun_fecalw0_min[i] <- min( x )
    df_blast.High$rel_abun_fecalw0_max[i] <- max( x )
    df_blast.High$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.High$rel_abun_fecalw0_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw7.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High$rel_abun_fecalw7_mean[i] <- mean( x )  # [subsel]
    df_blast.High$rel_abun_fecalw7_min[i] <- min( x )
    df_blast.High$rel_abun_fecalw7_max[i] <- max( x )
    df_blast.High$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.High$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High$rel_abun_cecal_mean[i] <- mean( x )  # [subsel]
    df_blast.High$rel_abun_cecal_min[i] <- min( x )
    df_blast.High$rel_abun_cecal_max[i] <- max( x )
    df_blast.High$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.High$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}




## Export tables of results

## Control

names(df_blast.Cont)
# [1] "otu"                        "AccessionNo"                "closest_ncbi_match"         "score_bits"                
# [5] "expect"                     "pident"                     "Phylum"                     "Description"               
# [9] "Ref"                        "rel_abun_airw1w6_mean"      "rel_abun_airw1w6_min"       "rel_abun_airw1w6_max"      
# [13] "rel_abun_airw1w6_n_nonzero" "rel_abun_airw1w6_n"         "rel_abun_fecalw0_mean"      "rel_abun_fecalw0_min"      
# [17] "rel_abun_fecalw0_max"       "rel_abun_fecalw0_n_nonzero" "rel_abun_fecalw0_n"         "rel_abun_fecalw7_mean"     
# [21] "rel_abun_fecalw7_min"       "rel_abun_fecalw7_max"       "rel_abun_fecalw7_n_nonzero" "rel_abun_fecalw7_n"        
# [25] "rel_abun_cecal_mean"        "rel_abun_cecal_min"         "rel_abun_cecal_max"         "rel_abun_cecal_n_nonzero"  
# [29] "rel_abun_cecal_n" 

tab <- df_blast.Cont

tab$air_mean_range <- paste0(round(tab$rel_abun_airw1w6_mean,4),"(",round(tab$rel_abun_airw1w6_min,4),"-",round(tab$rel_abun_airw1w6_max,4),")" )
tab$air_n_nz <- paste0(tab$rel_abun_airw1w6_n_nonzero," (",tab$rel_abun_airw1w6_n,")" )

tab$fecalw0_mean_range <- paste0(round(tab$rel_abun_fecalw0_mean,4),"(",round(tab$rel_abun_fecalw0_min,4),"-",round(tab$rel_abun_fecalw0_max,4),")" )
tab$fecalw0_n_nz <- paste0(tab$rel_abun_fecalw0_n_nonzero," (",tab$rel_abun_fecalw0_n,")" )

tab$fecalw7_mean_range <- paste0(round(tab$rel_abun_fecalw7_mean,4),"(",round(tab$rel_abun_fecalw7_min,4),"-",round(tab$rel_abun_fecalw7_max,4),")" )
tab$fecalw7_n_nz <- paste0(tab$rel_abun_fecalw7_n_nonzero," (",tab$rel_abun_fecalw7_n,")" )

tab$cecal_mean_range <- paste0(round(tab$rel_abun_cecal_mean,4),"(",round(tab$rel_abun_cecal_min,4),"-",round(tab$rel_abun_cecal_max,4),")" )
tab$cecal_n_nz <- paste0(tab$rel_abun_cecal_n_nonzero," (",tab$rel_abun_cecal_n,")" )
  
names(tab)

keep <- c(
  "otu", "AccessionNo" , "closest_ncbi_match", "score_bits",                
  "expect"  , "pident"  , "Phylum" , "Description" ,
  "Ref" ,
  "air_mean_range" , "air_n_nz"  ,
  "fecalw0_mean_range" , "fecalw0_n_nz" , 
  "fecalw7_mean_range" , "fecalw7_n_nz" ,
  "cecal_mean_range"   , "cecal_n_nz"
)

write.csv(tab[ ,keep], file="Table-results-SigDiffAbun-with-blast-Control-only-vFINAL1.csv", row.names = FALSE )
# note some columns will be blank and need to be filled by inspection of the blast output file, and citation for closest NCBI match, etc




## Low

names(df_blast.Low)
# [1] "otu"                        "AccessionNo"                "closest_ncbi_match"         "score_bits"                
# [5] "expect"                     "pident"                     "Phylum"                     "Description"               
# [9] "Ref"                        "rel_abun_soil_mean"         "rel_abun_soil_min"          "rel_abun_soil_max"         
# [13] "rel_abun_soil_n_nonzero"    "rel_abun_soil_n"            "rel_abun_airw1w6_mean"      "rel_abun_airw1w6_min"      
# [17] "rel_abun_airw1w6_max"       "rel_abun_airw1w6_n_nonzero" "rel_abun_airw1w6_n"         "rel_abun_fecalw0_mean"     
# [21] "rel_abun_fecalw0_min"       "rel_abun_fecalw0_max"       "rel_abun_fecalw0_n_nonzero" "rel_abun_fecalw0_n"        
# [25] "rel_abun_fecalw7_mean"      "rel_abun_fecalw7_min"       "rel_abun_fecalw7_max"       "rel_abun_fecalw7_n_nonzero"
# [29] "rel_abun_fecalw7_n"         "rel_abun_cecal_mean"        "rel_abun_cecal_min"         "rel_abun_cecal_max"        
# [33] "rel_abun_cecal_n_nonzero"   "rel_abun_cecal_n"

tab <- df_blast.Low

tab$soil_mean_range <- paste0(round(tab$rel_abun_soil_mean,4),"(",round(tab$rel_abun_soil_min,4),"-",round(tab$rel_abun_soil_max,4),")" )
tab$soil_n_nz <- paste0(tab$rel_abun_soil_n_nonzero," (",tab$rel_abun_soil_n,")" )

tab$air_mean_range <- paste0(round(tab$rel_abun_airw1w6_mean,4),"(",round(tab$rel_abun_airw1w6_min,4),"-",round(tab$rel_abun_airw1w6_max,4),")" )
tab$air_n_nz <- paste0(tab$rel_abun_airw1w6_n_nonzero," (",tab$rel_abun_airw1w6_n,")" )

tab$fecalw0_mean_range <- paste0(round(tab$rel_abun_fecalw0_mean,4),"(",round(tab$rel_abun_fecalw0_min,4),"-",round(tab$rel_abun_fecalw0_max,4),")" )
tab$fecalw0_n_nz <- paste0(tab$rel_abun_fecalw0_n_nonzero," (",tab$rel_abun_fecalw0_n,")" )

tab$fecalw7_mean_range <- paste0(round(tab$rel_abun_fecalw7_mean,4),"(",round(tab$rel_abun_fecalw7_min,4),"-",round(tab$rel_abun_fecalw7_max,4),")" )
tab$fecalw7_n_nz <- paste0(tab$rel_abun_fecalw7_n_nonzero," (",tab$rel_abun_fecalw7_n,")" )

tab$cecal_mean_range <- paste0(round(tab$rel_abun_cecal_mean,4),"(",round(tab$rel_abun_cecal_min,4),"-",round(tab$rel_abun_cecal_max,4),")" )
tab$cecal_n_nz <- paste0(tab$rel_abun_cecal_n_nonzero," (",tab$rel_abun_cecal_n,")" )


names(tab)

keep <- c(
  "otu", "AccessionNo" , "closest_ncbi_match", "score_bits",                
  "expect"  , "pident"  , "Phylum" , "Description" ,
  "Ref" ,
  "soil_mean_range"  ,     "soil_n_nz" ,
  "air_mean_range" , "air_n_nz"  ,
  "fecalw0_mean_range" , "fecalw0_n_nz" , 
  "fecalw7_mean_range" , "fecalw7_n_nz" ,
  "cecal_mean_range"   , "cecal_n_nz"
)

write.csv(tab[ ,keep], file="Table-results-SigDiffAbun-with-blast-Low-only-vFINAL1.csv", row.names = FALSE )
# note some columns will be blank and need to be filled by inspection of the blast output file, and citation for closest NCBI match, etc



## High


names(df_blast.High)
# [1] "otu"                        "AccessionNo"                "closest_ncbi_match"         "score_bits"                
# [5] "expect"                     "pident"                     "Phylum"                     "Description"               
# [9] "Ref"                        "rel_abun_soil_mean"         "rel_abun_soil_min"          "rel_abun_soil_max"         
# [13] "rel_abun_soil_n_nonzero"    "rel_abun_soil_n"            "rel_abun_airw1w6_mean"      "rel_abun_airw1w6_min"      
# [17] "rel_abun_airw1w6_max"       "rel_abun_airw1w6_n_nonzero" "rel_abun_airw1w6_n"         "rel_abun_fecalw0_mean"     
# [21] "rel_abun_fecalw0_min"       "rel_abun_fecalw0_max"       "rel_abun_fecalw0_n_nonzero" "rel_abun_fecalw0_n"        
# [25] "rel_abun_fecalw7_mean"      "rel_abun_fecalw7_min"       "rel_abun_fecalw7_max"       "rel_abun_fecalw7_n_nonzero"
# [29] "rel_abun_fecalw7_n"         "rel_abun_cecal_mean"        "rel_abun_cecal_min"         "rel_abun_cecal_max"        
# [33] "rel_abun_cecal_n_nonzero"   "rel_abun_cecal_n"

tab <- df_blast.High

tab$soil_mean_range <- paste0(round(tab$rel_abun_soil_mean,4),"(",round(tab$rel_abun_soil_min,4),"-",round(tab$rel_abun_soil_max,4),")" )
tab$soil_n_nz <- paste0(tab$rel_abun_soil_n_nonzero," (",tab$rel_abun_soil_n,")" )

tab$air_mean_range <- paste0(round(tab$rel_abun_airw1w6_mean,4),"(",round(tab$rel_abun_airw1w6_min,4),"-",round(tab$rel_abun_airw1w6_max,4),")" )
tab$air_n_nz <- paste0(tab$rel_abun_airw1w6_n_nonzero," (",tab$rel_abun_airw1w6_n,")" )

tab$fecalw0_mean_range <- paste0(round(tab$rel_abun_fecalw0_mean,4),"(",round(tab$rel_abun_fecalw0_min,4),"-",round(tab$rel_abun_fecalw0_max,4),")" )
tab$fecalw0_n_nz <- paste0(tab$rel_abun_fecalw0_n_nonzero," (",tab$rel_abun_fecalw0_n,")" )

tab$fecalw7_mean_range <- paste0(round(tab$rel_abun_fecalw7_mean,4),"(",round(tab$rel_abun_fecalw7_min,4),"-",round(tab$rel_abun_fecalw7_max,4),")" )
tab$fecalw7_n_nz <- paste0(tab$rel_abun_fecalw7_n_nonzero," (",tab$rel_abun_fecalw7_n,")" )

tab$cecal_mean_range <- paste0(round(tab$rel_abun_cecal_mean,4),"(",round(tab$rel_abun_cecal_min,4),"-",round(tab$rel_abun_cecal_max,4),")" )
tab$cecal_n_nz <- paste0(tab$rel_abun_cecal_n_nonzero," (",tab$rel_abun_cecal_n,")" )


names(tab)

keep <- c(
  "otu", "AccessionNo" , "closest_ncbi_match", "score_bits",                
  "expect"  , "pident"  , "Phylum" , "Description" ,
  "Ref" ,
  "soil_mean_range"  ,     "soil_n_nz" ,
  "air_mean_range" , "air_n_nz"  ,
  "fecalw0_mean_range" , "fecalw0_n_nz" , 
  "fecalw7_mean_range" , "fecalw7_n_nz" ,
  "cecal_mean_range"   , "cecal_n_nz"
)

write.csv(tab[ ,keep], file="Table-results-SigDiffAbun-with-blast-High-only-vFINAL1.csv", row.names = FALSE )
# note some columns will be blank and need to be filled by inspection of the blast output file, and citation for closest NCBI match, etc


#------------------------





#### Comparison of putative butyrate-producer K. alysoides between treatments - incl faecal w0, w7, caecal
#------------------------

### Analyze all K.alysoides-like bacteria that increase across treatments

### collate all OTUs that increase from week 0 to week 7 fecal samples
taxa.sig.diffabun.Inc.cont
# [1] "OTU_1"     "OTU_15"    "OTU_19"    "OTU_23"    "OTU_46"    "OTU_57"    "OTU_59"    "OTU_64"   
# [9] "OTU_67"    "OTU_75"    "OTU_101"   "OTU_125"   "OTU_130"   "OTU_133"   "OTU_143"   "OTU_169"  
# [17] "OTU_201"   "OTU_202"   "OTU_247"   "OTU_254"   "OTU_258"   "OTU_273"   "OTU_402"   "OTU_419"  
# [25] "OTU_481"   "OTU_531"   "OTU_616"   "OTU_765"   "OTU_2039"  "OTU_2280"  "OTU_2384"  "OTU_2419" 
# [33] "OTU_2760"  "OTU_2934"  "OTU_2998"  "OTU_3004"  "OTU_3188"  "OTU_3671"  "OTU_3795"  "OTU_4034" 
# [41] "OTU_4077"  "OTU_4273"  "OTU_4387"  "OTU_4483"  "OTU_4702"  "OTU_5280"  "OTU_5971"  "OTU_6190" 
# [49] "OTU_6689"  "OTU_7119"  "OTU_8231"  "OTU_9505"  "OTU_9762"  "OTU_9785"  "OTU_10092"
taxa.sig.diffabun.Inc.low
# [1] "OTU_23"   "OTU_59"   "OTU_101"  "OTU_240"  "OTU_273"  "OTU_287"  "OTU_332"  "OTU_765" 
# [9] "OTU_2039" "OTU_2280" "OTU_2998" "OTU_4273" "OTU_5079" "OTU_5095" "OTU_5280" "OTU_8615"
# [17] "OTU_9639"
taxa.sig.diffabun.Inc.high
# [1] "OTU_1"     "OTU_15"    "OTU_23"    "OTU_26"    "OTU_29"    "OTU_31"    "OTU_35"    "OTU_37"   
# [9] "OTU_38"    "OTU_46"    "OTU_60"    "OTU_64"    "OTU_75"    "OTU_79"    "OTU_82"    "OTU_91"   
# [17] "OTU_101"   "OTU_105"   "OTU_109"   "OTU_115"   "OTU_125"   "OTU_139"   "OTU_173"   "OTU_196"  
# [25] "OTU_287"   "OTU_481"   "OTU_556"   "OTU_616"   "OTU_765"   "OTU_1737"  "OTU_2039"  "OTU_2280" 
# [33] "OTU_2419"  "OTU_2750"  "OTU_2760"  "OTU_2998"  "OTU_3004"  "OTU_3188"  "OTU_3408"  "OTU_3456" 
# [41] "OTU_3669"  "OTU_3876"  "OTU_4273"  "OTU_4387"  "OTU_4483"  "OTU_5079"  "OTU_5375"  "OTU_5520" 
# [49] "OTU_5765"  "OTU_5790"  "OTU_5915"  "OTU_6190"  "OTU_6509"  "OTU_7377"  "OTU_9265"  "OTU_9505" 
# [57] "OTU_9762"  "OTU_9785"  "OTU_10092"

length(taxa.sig.diffabun.Inc.cont);length(taxa.sig.diffabun.Inc.low);length(taxa.sig.diffabun.Inc.high)
# [1] 55
# [1] 17
# [1] 59


### assign closest NCBI match to significantly increasing taxa

## fasta sequences are available from 
seqs_df <- read_excel(path= paste0(datadir,"/","OTU-rep-sequences.xlsx"),
                      sheet=1, range="B1:C10485", col_names = TRUE)
seqs_df <- as.data.frame(seqs_df)
str(seqs_df)
# 'data.frame':	10484 obs. of  2 variables:
# $ subject: chr  "OTU_1" "OTU_2" "OTU_3" "OTU_4" ...
# $ OTU-seq: chr  "TGGGGGATATTGCACAATGGGG

## assign closest NCBI match to significantly increasing taxa

# store temporary blast results here
blast_dir <- "C:/Workspace/PROJ/PAPER-MICRO-MICE/modelling/blast_outputs_inc_all"


### HIGH

df_blast <- data.frame(otu=taxa.sig.diffabun.Inc.high, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.diffabun.Inc.high) = 59
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  # blast output formats here: https://www.ncbi.nlm.nih.gov/books/NBK279684/
  
  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
    
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}


df_blast.High.all_KA <- df_blast

sel <- grep(pattern = "Kineothrix alysoides", x = df_blast.High.all_KA$closest_ncbi_match)
df_blast.High.all_KA <- df_blast.High.all_KA[sel, ]


df_blast.High.all_KA[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#         otu                                            closest_ncbi_match score_bits expect pident
# 8    OTU_37 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        521     NA     97
# 11   OTU_60 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        658     NA     96
# 26  OTU_481 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        614     NA     94
# 48 OTU_5520 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        658     NA     96
# 50 OTU_5790 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        483     NA     95
# 56 OTU_9505 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        652     NA     96



### LOW

# clear previous results - OR move to storage folder
#unlink(file.path(blast_dir) , recursive=TRUE)

# store results in dataframe. Some fields as placeholders to be filled later
df_blast <- data.frame(otu=taxa.sig.diffabun.Inc.low, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.diffabun.Inc.low) = 17
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
    
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}



df_blast.Low.all_KA <- df_blast

sel <- grep(pattern = "Kineothrix alysoides", x = df_blast.Low.all_KA$closest_ncbi_match)
df_blast.Low.all_KA <- df_blast.Low.all_KA[sel, ]



df_blast.Low.all_KA[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#         otu                                            closest_ncbi_match score_bits expect pident
# 17 OTU_9639 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        603     NA     94





### CONTROL

# clear previous results - OR move to storage folder
#unlink(file.path(blast_dir) , recursive=TRUE)

# store results in dataframe. Some fields as placeholders to be filled later
df_blast <- data.frame(otu=taxa.sig.diffabun.Inc.cont, AccessionNo=NA, closest_ncbi_match=NA, score_bits=NA, expect=NA, pident=NA, Phylum=NA, Description=NA, Ref=NA )

for (i in 1:length(df_blast$otu)) {   # length(taxa.sig.diffabun.Inc.cont) = 55
  #i<-1
  sel <- which(seqs_df$subject == df_blast$otu[i])
  rep_seq <- seqs_df$`OTU-seq`[sel]
  
  fileConn <- file("query_seq.txt")
  writeLines(text = rep_seq, fileConn)
  close(fileConn)
  
  # blast output formats here: https://www.ncbi.nlm.nih.gov/books/NBK279684/
  
  this_command <- paste0('blastn -db C:/Workspace/DATA/NCBI_BLAST_16S/16SMicrobialDB/16SMicrobial -query query_seq.txt -max_target_seqs 1 -out ',blast_dir,'/output_',i,'.txt')
  
  system(command = this_command)
  
  if (file.exists(paste0(blast_dir,'/output_',i,'.txt'))) {
    
    read_out <- read_file(file = paste0(blast_dir,'/output_',i,'.txt'))
    # Score
    idx.before <- str_locate(string = read_out, pattern = "Score = ")
    idx.after <-  str_locate(string = read_out, pattern = " bits ")
    df_blast$score_bits[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.after[1,"start"])-1) )
    
    # Expect - this is unreliable to extract by code
    # idx.before <- str_locate(string = read_out, pattern = ",  Expect = ")
    # df_blast$expect[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.before[1,"end"])+1, last = as.numeric(idx.before[1,"end"])+3) )
    
    # Pident (percent identity) ... this fails if 100%, only captures 00 = 0, so check 0's manually
    idx.after <-  str_locate(string = read_out, pattern = ", Gaps = ")
    df_blast$pident[i] <- as.numeric( substring(text = read_out, first = as.numeric(idx.after[1,"start"])-4, last = as.numeric(idx.after[1,"start"])-3) )
    
    # Putative species / closest NCBI match?
    idx.before <- str_locate(string = read_out, pattern = "\r\n\r\n\r\n>N")
    df_blast$closest_ncbi_match[i] <- substring(text = read_out, first = as.numeric(idx.before[1,"end"]), last = as.numeric(idx.before[1,"end"])+60)
    
  } else {
    df_blast$score_bits[i] <- NA
    #df_blast$expect[i] <- NA
    df_blast$pident[i] <- NA
    df_blast$closest_ncbi_match[i] <- NA
  }
  
  print(paste0("Calculated OTU: ", df_blast$otu[i]," no ",i," of ",length(df_blast$otu)))
  
}


df_blast.Cont.all_KA <- df_blast

sel <- grep(pattern = "Kineothrix alysoides", x = df_blast.Cont.all_KA$closest_ncbi_match)
df_blast.Cont.all_KA <- df_blast.Cont.all_KA[sel, ]


df_blast.Cont.all_KA[ ,c("otu", "closest_ncbi_match", "score_bits", "expect", "pident")]
#         otu                                            closest_ncbi_match score_bits expect pident
# 15  OTU_143 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        630     NA     95
# 25  OTU_481 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        614     NA     94
# 26  OTU_531 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        686     NA     97
# 47 OTU_5971 NR_156080.1 Kineothrix alysoides strain KNHs209 16S ribosomal        569     NA     92
# 52 OTU_9505 NR_156081.1 Kineothrix alysoides strain KNHs209 16S ribosomal        652     NA     96



temp1 <- df_blast.Cont.all_KA
temp2 <- df_blast.Low.all_KA
temp3 <- df_blast.High.all_KA


### calculate mean OTU rel abun, percent prevalence

### from earlier use % OTU relative abundance phyloseq objects

relabun.fecalw0.Cont.dc.16s
relabun.fecalw7.Cont.dc.16s
relabun.cecal.Cont.dc.16s

relabun.fecalw0.Low.dc.16s
relabun.fecalw7.Low.dc.16s
relabun.cecal.Low.dc.16s

relabun.fecalw0.High.dc.16s
relabun.fecalw7.High.dc.16s
relabun.cecal.High.dc.16s


df_blast.Cont.all_KA$rel_abun_fecalw0_mean <-NA
df_blast.Cont.all_KA$rel_abun_fecalw0_n_nonzero <-NA
df_blast.Cont.all_KA$rel_abun_fecalw0_n <-NA

df_blast.Cont.all_KA$rel_abun_fecalw7_mean <-NA
df_blast.Cont.all_KA$rel_abun_fecalw7_n_nonzero <-NA
df_blast.Cont.all_KA$rel_abun_fecalw7_n <-NA

df_blast.Cont.all_KA$rel_abun_cecal_mean <-NA
df_blast.Cont.all_KA$rel_abun_cecal_n_nonzero <-NA
df_blast.Cont.all_KA$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.Cont.all_KA$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.Cont.all_KA$otu[i] )
  
  sel <- which( row.names(relabun.fecalw0.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont.all_KA$rel_abun_fecalw0_mean[i] <- mean( x )
    df_blast.Cont.all_KA$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont.all_KA$rel_abun_fecalw0_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw7.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont.all_KA$rel_abun_fecalw7_mean[i] <- mean( x )
    df_blast.Cont.all_KA$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont.all_KA$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.Cont.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.Cont.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Cont.all_KA$rel_abun_cecal_mean[i] <- mean( x )
    df_blast.Cont.all_KA$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.Cont.all_KA$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}




## Low

df_blast.Low.all_KA$rel_abun_fecalw0_mean <-NA
df_blast.Low.all_KA$rel_abun_fecalw0_n_nonzero <-NA
df_blast.Low.all_KA$rel_abun_fecalw0_n <-NA

df_blast.Low.all_KA$rel_abun_fecalw7_mean <-NA
df_blast.Low.all_KA$rel_abun_fecalw7_n_nonzero <-NA
df_blast.Low.all_KA$rel_abun_fecalw7_n <-NA

df_blast.Low.all_KA$rel_abun_cecal_mean <-NA
df_blast.Low.all_KA$rel_abun_cecal_n_nonzero <-NA
df_blast.Low.all_KA$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.Low.all_KA$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.Low.all_KA$otu[i] )
  
  sel <- which( row.names(relabun.fecalw0.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low.all_KA$rel_abun_fecalw0_mean[i] <- mean( x )
    df_blast.Low.all_KA$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low.all_KA$rel_abun_fecalw0_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw7.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low.all_KA$rel_abun_fecalw7_mean[i] <- mean( x )
    df_blast.Low.all_KA$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low.all_KA$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.Low.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.Low.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.Low.all_KA$rel_abun_cecal_mean[i] <- mean( x )
    df_blast.Low.all_KA$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.Low.all_KA$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}




## High

df_blast.High.all_KA$rel_abun_fecalw0_mean <-NA
df_blast.High.all_KA$rel_abun_fecalw0_n_nonzero <-NA
df_blast.High.all_KA$rel_abun_fecalw0_n <-NA

df_blast.High.all_KA$rel_abun_fecalw7_mean <-NA
df_blast.High.all_KA$rel_abun_fecalw7_n_nonzero <-NA
df_blast.High.all_KA$rel_abun_fecalw7_n <-NA

df_blast.High.all_KA$rel_abun_cecal_mean <-NA
df_blast.High.all_KA$rel_abun_cecal_n_nonzero <-NA
df_blast.High.all_KA$rel_abun_cecal_n <-NA


for (i in 1:length(df_blast.High.all_KA$otu)) {
  #i<-1
  
  this_otu <- as.character( df_blast.High.all_KA$otu[i] )
  
  
  sel <- which( row.names(relabun.fecalw0.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw0.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High.all_KA$rel_abun_fecalw0_mean[i] <- mean( x )
    df_blast.High.all_KA$rel_abun_fecalw0_n_nonzero[i] <- length( x[subsel] )
    df_blast.High.all_KA$rel_abun_fecalw0_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.fecalw7.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.fecalw7.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High.all_KA$rel_abun_fecalw7_mean[i] <- mean( x )
    df_blast.High.all_KA$rel_abun_fecalw7_n_nonzero[i] <- length( x[subsel] )
    df_blast.High.all_KA$rel_abun_fecalw7_n[i] <- length( x )
  }
  
  sel <- which( row.names(relabun.cecal.High.dc.16s@otu_table) == this_otu)
  if (length(sel)==1){
    x <- as.numeric(relabun.cecal.High.dc.16s@otu_table[sel, ])
    subsel <- which(!x==0)
    
    df_blast.High.all_KA$rel_abun_cecal_mean[i] <- mean( x )
    df_blast.High.all_KA$rel_abun_cecal_n_nonzero[i] <- length( x[subsel] )
    df_blast.High.all_KA$rel_abun_cecal_n[i] <- length( x )
  }
  
  print(paste0("completed OTU # ",i))
}



df_blast.Cont.all_KA
df_blast.Low.all_KA
df_blast.High.all_KA



dat <- data.frame(otu = rep( c(as.character(df_blast.Cont.all_KA$otu),as.character(df_blast.Low.all_KA$otu),as.character(df_blast.High.all_KA$otu)), times =3),
                  Treatment = rep( c(rep("Control",times=dim(df_blast.Cont.all_KA)[1]),rep("Low",times=dim(df_blast.Low.all_KA)[1]),rep("High",times=dim(df_blast.High.all_KA)[1])), times = 3),
                  pident = rep( c(df_blast.Cont.all_KA$pident,df_blast.Low.all_KA$pident,df_blast.High.all_KA$pident), times = 3),
                  
                  rel_abun_mean = c(df_blast.Cont.all_KA$rel_abun_fecalw0_mean,df_blast.Low.all_KA$rel_abun_fecalw0_mean,df_blast.High.all_KA$rel_abun_fecalw0_mean,
                                    df_blast.Cont.all_KA$rel_abun_fecalw7_mean,df_blast.Low.all_KA$rel_abun_fecalw7_mean,df_blast.High.all_KA$rel_abun_fecalw7_mean,
                                    df_blast.Cont.all_KA$rel_abun_cecal_mean,df_blast.Low.all_KA$rel_abun_cecal_mean,df_blast.High.all_KA$rel_abun_cecal_mean),
                  
                  n_nonzero = c(df_blast.Cont.all_KA$rel_abun_fecalw0_n_nonzero,df_blast.Low.all_KA$rel_abun_fecalw0_n_nonzero,df_blast.High.all_KA$rel_abun_fecalw0_n_nonzero,
                                df_blast.Cont.all_KA$rel_abun_fecalw7_n_nonzero,df_blast.Low.all_KA$rel_abun_fecalw7_n_nonzero,df_blast.High.all_KA$rel_abun_fecalw7_n_nonzero,
                                df_blast.Cont.all_KA$rel_abun_cecal_n_nonzero,df_blast.Low.all_KA$rel_abun_cecal_n_nonzero,df_blast.High.all_KA$rel_abun_cecal_n_nonzero),
                  
                  n = c(df_blast.Cont.all_KA$rel_abun_fecalw0_n,df_blast.Low.all_KA$rel_abun_fecalw0_n,df_blast.High.all_KA$rel_abun_fecalw0_n,
                        df_blast.Cont.all_KA$rel_abun_fecalw7_n,df_blast.Low.all_KA$rel_abun_fecalw7_n,df_blast.High.all_KA$rel_abun_fecalw7_n,
                        df_blast.Cont.all_KA$rel_abun_cecal_n,df_blast.Low.all_KA$rel_abun_cecal_n,df_blast.High.all_KA$rel_abun_cecal_n)
)
dat$Sample_type <- c( rep("Faecal w0",times = length( c(as.character(df_blast.Cont.all_KA$otu),as.character(df_blast.Low.all_KA$otu),as.character(df_blast.High.all_KA$otu)) ) ),
                      rep("Faecal w7",times = length( c(as.character(df_blast.Cont.all_KA$otu),as.character(df_blast.Low.all_KA$otu),as.character(df_blast.High.all_KA$otu)) ) ),
                      rep(   "Caecal",times = length( c(as.character(df_blast.Cont.all_KA$otu),as.character(df_blast.Low.all_KA$otu),as.character(df_blast.High.all_KA$otu)) ) )
)

dat$percent_prev <- 100*dat$n_nonzero/dat$n

dat$log10_mean_rel_abun <- log10(dat$rel_abun_mean)

str(dat)
names(dat)
# [1] "otu"                 "Treatment"           "pident"              "rel_abun_mean"       "n_nonzero"           "n"                  
# [7] "Sample_type"         "percent_prev"        "log10_mean_rel_abun"

dat$Treatment <- factor(dat$Treatment, levels = c("Control","Low","High"),
                        ordered = TRUE)
dat$Sample_type <- factor(dat$Sample_type, levels = c("Faecal w0","Faecal w7","Caecal"), ordered = TRUE)
dat$pident <- factor(dat$pident)

levels(dat$pident) # "92" "94" "95" "96" "97"

str(dat)
# 'data.frame':	36 obs. of  9 variables:
# $ otu                : Factor w/ 10 levels "OTU_143","OTU_37",..: 1 3 4 7 9 10 2 8 3 5 ...
# $ Treatment          : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 2 3 3 3 3 ...
# $ pident             : Factor w/ 5 levels "92","94","95",..: 3 2 5 1 4 2 5 4 2 4 ...
# $ rel_abun_mean      : num  0.00334 0.19496 0.00291 0.00188 0.00448 ...
# $ n_nonzero          : int  3 10 7 3 7 5 9 17 12 11 ...
# $ n                  : int  18 18 18 18 18 18 18 18 18 18 ...
# $ Sample_type        : Ord.factor w/ 3 levels "Faecal w0"<"Faecal w7"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ percent_prev       : num  16.7 55.6 38.9 16.7 38.9 ...
# $ log10_mean_rel_abun: num  -2.48 -0.71 -2.54 -2.72 -2.35 ...

dat$group <- NA
sel <- grep(pattern = "Faecal", x = dat$Sample_type)
dat$group[sel] <- "Faecal"
sel <- grep(pattern = "Caecal", x = dat$Sample_type)
dat$group[sel] <- "Caecal"

dat$group <- factor(dat$group, levels = c("Faecal","Caecal"), ordered = TRUE)
dat$link <- paste0(dat$otu,"_",dat$Treatment,"_",dat$group)


cols <- c( "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2" )
shapes <- c( "Faecal w0"=1, "Faecal w7"=16, "Caecal"=17 )

pid_sizes <- c("92"= 1.5, "94"= 2, "95"= 2.5, "96"= 3, "97"= 3.5)


p <- ggplot(data=dat, aes(x=log10_mean_rel_abun, y = percent_prev, color = Treatment, shape = Sample_type)) +
  geom_point(aes(size = pident)) +
  geom_line(aes(group=link)) +
  scale_shape_manual(values = shapes, name = "Sample\ntype") +
  scale_size_manual(values = pid_sizes, name = "Percent\nidentity") +
  scale_color_manual(values = cols) +
  theme_classic() +
  labs(x = "log10(mean % OTU rel. abun.)", y = "Prevalence (%)" ) + 
  facet_wrap(~group) + 
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.7)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.7)),
    axis.title.x = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 2,l = 0,"pt"), size = rel(0.8)),
    
    legend.key.size = unit(2, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    legend.box.spacing = unit(2, "pt"),
    legend.box.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.spacing = unit(4, "pt")
    
  )

p

ggsave(plot=p, filename = paste0("plots/","All_KA_pc_prevalence-vs-log10-mean-pc-OTU-rel-abun-Faecal-vs-Caecal-vFINAL.tiff"), width = 10, height = 7, units = "cm", dpi = 600, compression = "lzw")

# this plot presents all mice. See plot later for male vs female breakdown






### Rel abun K. alysoides vs OF centre times - most anxious Females
### Analyse total % of K. alysoides-like OTUs in female mice with the lowest Open Field centre times

## use % OTU relative abundance phyloseq objects
dim( relabun.fecalw7.Cont.dc.16s@sam_data ) # 18 20
dim( relabun.cecal.Cont.dc.16s@sam_data ) # 18 20

dim( relabun.fecalw7.Low.dc.16s@sam_data ) # 17 20
dim( relabun.cecal.Low.dc.16s@sam_data ) # 17 20

dim( relabun.fecalw7.High.dc.16s@sam_data ) # 18 20
dim( relabun.cecal.High.dc.16s@sam_data ) # 18 20


## identify subjects
str(of.post.COPY)
sel <- which(of.post.COPY$Sex=="Female")
beh_fem <- of.post.COPY[sel, ]
str(beh_fem)
#'data.frame':	27 obs. of  38 variables:


dat <- data.frame(mouseID = c(as.character(relabun.fecalw7.Cont.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.Cont.dc.16s@sam_data$mouseID),
                              as.character(relabun.fecalw7.Low.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.Low.dc.16s@sam_data$mouseID),
                              as.character(relabun.fecalw7.High.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.High.dc.16s@sam_data$mouseID)),
                  Treatment = c(rep("Control",times=18*2),
                                rep("Low",times=17*2),
                                rep("High",times=18*2)),
                  Sample_type = c(rep("Faecal w7",times=18),rep("Caecal",times=18),
                                  rep("Faecal w7",times=17),rep("Caecal",times=17),
                                  rep("Faecal w7",times=18),rep("Caecal",times=18))
)

dat$percent_all_KA <- NA
dat$Centre_time <- NA

## list all KA-like OTUs

all_KA_OTUs <- unique( c( as.character( df_blast.Cont.all_KA$otu ), as.character( df_blast.Low.all_KA$otu ), as.character( df_blast.High.all_KA$otu ) ) )
all_KA_OTUs
# [1] "OTU_143"  "OTU_481"  "OTU_531"  "OTU_5971" "OTU_9505" "OTU_9639" "OTU_37"   "OTU_60"   "OTU_5520"
# [10] "OTU_5790"

names(dat) # "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"

for (i in 1:dim(dat)[1]) {
  # i<-1
  
  if (dat$Treatment[i]=="Control" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.Cont.dc.16s
  } else if (dat$Treatment[i]=="Control" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Cont.dc.16s
  } else if (dat$Treatment[i]=="Low" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.Low.dc.16s
  } else if (dat$Treatment[i]=="Low" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Low.dc.16s
  } else if (dat$Treatment[i]=="High" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.High.dc.16s
    # else: #(dat$Treatment[i]=="High" & dat$Sample_type[i]=="Caecal")
  } else  {
    phy_obj <- relabun.cecal.High.dc.16s
  }
  
  
  this_subject <- as.character( dat$mouseID[i] )  # this_subject: "C1m3"
  
  sel_phy_obj <- prune_samples( phy_obj@sam_data$mouseID == this_subject, phy_obj )
  # sample_names(sel_phy_obj): "C1m3T16"
  
  sel_KA <- prune_taxa( taxa_names(sel_phy_obj) %in% all_KA_OTUs, sel_phy_obj )
  
  #sel_KA@otu_table
  dat$percent_all_KA[i] <- sum( taxa_sums(sel_KA@otu_table) )
  
  sel <- which(as.character(beh_fem$ID) == as.character(dat$mouseID[i]))
  
  if (length(sel)==1) {
    dat$Centre_time[i] <- beh_fem$Centre_time[sel]
  }
  
  print(paste0("mouse: ", dat$mouseID[i]," ",beh_fem$ID[sel] ))
  
}

## male are not present in 'dat'

ok <- complete.cases(dat)
sel <- which(ok==TRUE)
dat <- dat[sel, ]


dat$lowest_2 <- NA
dat$lowest_3 <- NA
dat$lowest_4 <- NA
dat$lowest_5 <- NA

x <- 2

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_2=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_2=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.165264

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.8997448



# # #

x <- 3

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_3=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_3=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.2536777

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.7306088



# # #

x <- 4

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_4=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_4=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.08290818

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.7595732



# # #

x <- 5

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_5=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_5=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.2923509

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.3019307


## No relationship found for faecal w7.
## Strip back to Caecal samples


dat_c <- dat[ which(dat$Sample_type=="Caecal"), ]

names(dat_c)
# [1] "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"    "lowest_2"      
# [7] "lowest_3"       "lowest_4"       "lowest_5" 

sel <- which(dat_c$lowest_2=="Caecal")
dat_c$lowest_2[sel] <- 2

sel <- which(dat_c$lowest_3=="Caecal")
dat_c$lowest_3[sel] <- 3

sel <- which(dat_c$lowest_4=="Caecal")
dat_c$lowest_4[sel] <- 4

sel <- which(dat_c$lowest_5=="Caecal")
dat_c$lowest_5[sel] <- 5



## reformat for plotting with ggplot facet_grid

names(dat_c)
# [1] "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"    "lowest_2"      
# [7] "lowest_3"       "lowest_4"       "lowest_5"   

new_names <- c("mouseID", "Treatment", "Sample_type", "percent_all_KA", "Centre_time", "lowest_X")

get_cols <- c("lowest_2", "lowest_3", "lowest_4", "lowest_5")
most_anxious_x <- c(2:5)
length(most_anxious_x) # 4

dat_c.plot <- data.frame(matrix(nrow = 1, ncol = length( new_names )))
names(dat_c.plot) <- new_names

for (i in 1:length(most_anxious_x)) {
  #i<-1
  # getsubset of dataframe which examines most anxious X mice in each treatment
  dat_sel <- dat_c[  ,c( names(dat_c)[1:5],get_cols[i] ) ]
  
  subsel <- which(dat_sel[ , get_cols[i] ] == most_anxious_x[i])
  
  names(dat_sel) <- names(dat_c.plot)
  
  dat_c.plot <- rbind(dat_c.plot, dat_sel[subsel, ])
  
}

# remove NA first row
dat_c.plot <- dat_c.plot[-1, ]

str(dat_c.plot)
# 'data.frame':	42 obs. of  6 variables:
# $ mouseID       : chr  "C4m2" "C2m1" "L3m1" "L5m2" ...
# $ Treatment     : chr  "Control" "Control" "Low" "Low" ...
# $ Sample_type   : chr  "Caecal" "Caecal" "Caecal" "Caecal" ...
# $ percent_all_KA: num  0.564 0.87 0.927 1.288 1.222 ...
# $ Centre_time   : num  3.3 10.4 13 18.5 28.5 22.6 3.3 10.6 10.4 20.4 ...
# $ lowest_X      : chr  "2" "2" "2" "2" ...

3*2 + 3*3 + 3*4 + 3*5 # 42

temp <- dat_c.plot

cols <- c( "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2" )

levels(factor(dat_c.plot$lowest_X)) # [1] "2" "3" "4" "5"
#dat_c.plot$lowest_X <- factor(dat_c.plot$lowest_X, levels = c("2", "3", "4", "5"), labels = c("2 most\nanxious", "3 most\nanxious", "4 most\nanxious", "5 most\nanxious") )
dat_c.plot$lowest_X <- factor(dat_c.plot$lowest_X, levels = c("2", "3", "4", "5"), labels = c("Female\n2 most anxious", "Female\n3 most anxious", "Female\n4 most anxious", "Female\n5 most anxious") )

# annotation for r and n
anno <- data.frame(x = c(1.5, 1.5, 1.5, 1.5),
                   y = c(0, 0, 0, 0),
                   #lowest_X = c("2 most\nanxious", "3 most\nanxious", "4 most\nanxious", "5 most\nanxious"),
                   lowest_X = c("Female\n2 most anxious", "Female\n3 most anxious", "Female\n4 most anxious", "Female\n5 most anxious"),
                   r = paste0("r = ", round( c(0.8997448, 0.7306088, 0.7595732, 0.3019307),digits = 2 )),
                   n = paste0("n = ", c(2*3, 3*3, 4*3, 5*3))
)


p <- ggplot(data=dat_c.plot, aes(x=percent_all_KA, y= Centre_time, color = Treatment)) + 
  geom_point(size=1) + scale_colour_manual(values = cols) +
  #stat_smooth(aes(x=percent_all_KA, y= Centre_time), inherit.aes = FALSE, show.legend = FALSE) +
  geom_smooth(aes(x=percent_all_KA, y= Centre_time), method='lm', inherit.aes = FALSE, show.legend = FALSE, fill = "#bdbdbd", size = 0.5) +
  
  #geom_text( data = anno, aes(x=x, y=y, label = r), size = 2.5, inherit.aes = FALSE) +
  geom_text( data = anno, aes(x=x, y=y, label = paste0(r,"\n",n)), size = 2.5, inherit.aes = FALSE) +
  
  theme_classic() +
  #labs(x = "Total % OTU rel. abun. K. alysoides", y = "Time in centre (s)") +
  labs(x = parse(text='"Total % OTU rel. abun. "*italic("K. alysoides")'), y = "Time in centre (s)") +
  
  theme(legend.position="bottom") + 
  facet_grid(~lowest_X) + #, scales="free_x", space="free_x") +
  
  theme(
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    axis.title.x = element_text(margin=margin(t = 2,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 5,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
    
  )
p


dev.print(tiff, file = paste0("plots/","Total-OTU-rel-abun-KA-vs-Base-anxiety-Most-anxious-in-Treatments-vFINAL.tiff"), width = 12, height = 6, units = "cm", res=600, compression="lzw")



# NO LEGEND or X-AXIS TITLE

p <- ggplot(data=dat_c.plot, aes(x=percent_all_KA, y= Centre_time, color = Treatment)) + 
  geom_point(size=1) + scale_colour_manual(values = cols) +
  #stat_smooth(aes(x=percent_all_KA, y= Centre_time), inherit.aes = FALSE, show.legend = FALSE) +
  geom_smooth(aes(x=percent_all_KA, y= Centre_time), method='lm', inherit.aes = FALSE, show.legend = FALSE, fill = "#bdbdbd", size = 0.5) +
  
  #geom_text( data = anno, aes(x=x, y=y, label = r), size = 2.5, inherit.aes = FALSE) +
  geom_text( data = anno, aes(x=x, y=y, label = paste0(r,"\n",n)), size = 2.5, inherit.aes = FALSE, lineheight=0.9) +
  
  theme_classic() +
  #labs(x = "Total % OTU rel. abun. K. alysoides", y = "Time in centre (s)") +
  #labs(x = parse(text='"Total % OTU rel. abun. "*italic("K. alysoides")'), y = "Time in centre (s)") +
  labs(x = NULL , y = "Time in centre (s)") +
  
  theme(legend.position="none") + 
  facet_grid(~lowest_X) + #, scales="free_x", space="free_x") +
  
  theme(
    plot.margin = margin(t = 5, r = 5, b = 5, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    axis.title.x = element_text(margin=margin(t = 2,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7))
    
    # legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    # legend.key.size = unit(5, "pt"),
    # legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    # legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    # legend.margin = margin(t = 0,r = 0,b = 5,l = 0,"pt"),
    # legend.box.spacing = unit(5, "pt")
    
  )
p


grid.text(label = "A" , x = unit(0.03, "npc") , y = unit(0.95,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Total-OTU-rel-abun-KA-vs-Base-anxiety-Most-anxious-in-Treatments-NO-TITLE-NO-LEGEND-A-vFINAL.tiff"), width = 12, height = 5, units = "cm", res=600, compression="lzw")






### Also compare '1/3 most anxious' vs. '1/3 least anxious' based on top third and bottom third of Open Field centre times

## identify subjects
str(of.post.COPY)
sel <- which(of.post.COPY$Sex=="Female")
beh_fem <- of.post.COPY[sel, ]
str(beh_fem)
#'data.frame':	27 obs. of  38 variables:


dat <- data.frame(mouseID = c(as.character(relabun.cecal.Cont.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.Low.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.High.dc.16s@sam_data$mouseID)),
                  Treatment = c(rep("Control",times=18),
                                rep("Low",times=17),
                                rep("High",times=18)),
                  Sample_type = c(rep("Caecal",times=18),
                                  rep("Caecal",times=17),
                                  rep("Caecal",times=18))
)

dat$percent_all_KA <- NA
dat$Centre_time <- NA

## list all KA-like OTUs

all_KA_OTUs <- unique( c( as.character( df_blast.Cont.all_KA$otu ), as.character( df_blast.Low.all_KA$otu ), as.character( df_blast.High.all_KA$otu ) ) )
all_KA_OTUs
# [1] "OTU_143"  "OTU_481"  "OTU_531"  "OTU_5971" "OTU_9505" "OTU_9639" "OTU_37"   "OTU_60"   "OTU_5520"
# [10] "OTU_5790"

names(dat) # "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"



for (i in 1:dim(dat)[1]) {
  # i<-1
  
  if (dat$Treatment[i]=="Control" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Cont.dc.16s
  } else if (dat$Treatment[i]=="Low" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Low.dc.16s
  } else {
    phy_obj <- relabun.cecal.High.dc.16s
  }
  
  
  this_subject <- as.character( dat$mouseID[i] )  # this_subject: "C1m3"
  
  sel_phy_obj <- prune_samples( phy_obj@sam_data$mouseID == this_subject, phy_obj )
  # sample_names(sel_phy_obj): "C6m2Ce"
  
  sel_KA <- prune_taxa( taxa_names(sel_phy_obj) %in% all_KA_OTUs, sel_phy_obj )
  
  #sel_KA@otu_table
  dat$percent_all_KA[i] <- sum( taxa_sums(sel_KA@otu_table) )
  
  sel <- which(as.character(beh_fem$ID) == as.character(dat$mouseID[i]))
  
  if (length(sel)==1) {
    dat$Centre_time[i] <- beh_fem$Centre_time[sel]
  }
  
  print(paste0("mouse: ", dat$mouseID[i]," ",beh_fem$ID[sel] ))
  
}

## note: males are not present in 'dat'

ok <- complete.cases(dat)
sel <- which(ok==TRUE)
dat <- dat[sel, ]


dat$group <- NA


## define 'anxious' and 'less anxious' groups based on
## lower and upper thirds of Open Field centre times

quantile(beh_fem$Centre_time, probs = c(1/3, 2/3) )
# 33.33333% 66.66667% 
#   26.56667  51.76667 

beh_fem$group <- NA 
sel <- which(beh_fem$Centre_time <= quantile(beh_fem$Centre_time, probs = c(1/3) ))
beh_fem$group[sel] <- "anxious"

sel <- which(beh_fem$Centre_time >= quantile(beh_fem$Centre_time, probs = c(2/3) ))
beh_fem$group[sel] <- "less anxious"

# remove middle third
ok <- complete.cases(beh_fem)
sel <- which(ok==TRUE)
beh_fem_groups <- beh_fem[sel, ]

dim(beh_fem_groups) # 18  39

beh_fem_groups[ ,c("ID","Treatment","Centre_time","group")]
#      ID Treatment Centre_time        group
# 4  C2m1   Control        10.4      anxious
# 5  C2m2   Control       162.5 less anxious
# 6  C2m3   Control        88.7 less anxious
# 10 C4m1   Control        12.1      anxious
# 11 C4m2   Control         3.3      anxious
# 13 C5m1   Control        10.6      anxious
# 15 C5m3   Control        25.5      anxious
# 27 H3m3      High        61.4 less anxious
# 28 H4m1      High        22.6      anxious
# 29 H4m2      High        90.0 less anxious
# 34 H6m1      High        97.6 less anxious
# 36 H6m3      High        57.7 less anxious
# 38 L1m2       Low        82.5 less anxious
# 43 L3m1       Low        13.0      anxious
# 44 L3m2       Low       126.8 less anxious
# 45 L3m3       Low        20.4      anxious
# 50 L5m2       Low        18.5      anxious
# 51 L5m3       Low       113.0 less anxious

anxious_mice <- as.character( beh_fem_groups$ID[ which(beh_fem_groups$group=="anxious") ] )
anxious_mice
# "C2m1" "C4m1" "C4m2" "C5m1" "C5m3" "H4m1" "L3m1" "L3m3" "L5m2"

less_anxious_mice <- as.character( beh_fem_groups$ID[ which(beh_fem_groups$group=="less anxious") ] )
less_anxious_mice
# "C2m2" "C2m3" "H3m3" "H4m2" "H6m1" "H6m3" "L1m2" "L3m2" "L5m3"


sel <- which(dat$mouseID %in% anxious_mice)
sel #  1  3  4  6  9 11 12 16 22
dat$mouseID[sel] # C4m2 C4m1 C5m1 C5m3 C2m1 L3m3 L3m1 L5m2 H4m1
dat$group[sel] <- "anxious"


sel <- which(dat$mouseID %in% less_anxious_mice)
sel #   7  8 10 13 17 19 24 25 26
dat$mouseID[sel] # C2m2 C2m3 L3m2 L1m2 L5m3 H3m3 H4m2 H6m1 H6m3
dat$group[sel] <- "less anxious"

# remove missing data
ok <- complete.cases(dat)
sel <- which(ok==TRUE)
dat_groups <- dat[sel, ]

choose <- which(dat_groups$group=="anxious")
plot(dat_groups$percent_all_KA[choose], dat_groups$Centre_time[choose])

cor(dat_groups$percent_all_KA[choose], dat_groups$Centre_time[choose]) # 0.5847493



choose <- which(dat_groups$group=="less anxious")
plot(dat_groups$percent_all_KA[choose], dat_groups$Centre_time[choose])

cor(dat_groups$percent_all_KA[choose], dat_groups$Centre_time[choose]) # -0.389352



str(dat_groups)
# 'data.frame':	18 obs. of  6 variables:
# $ mouseID       : Factor w/ 53 levels "C1m1","C1m2",..: 11 10 13 15 5 6 4 43 44 42 ...
# $ Treatment     : Factor w/ 3 levels "Control","High",..: 1 1 1 1 1 1 1 3 3 3 ...
# $ Sample_type   : Factor w/ 1 level "Caecal": 1 1 1 1 1 1 1 1 1 1 ...
# $ percent_all_KA: num  0.564 1.05 1.14 1.447 0.486 ...
# $ Centre_time   : num  3.3 12.1 10.6 25.5 162.5 ...
# $ group         : chr  "anxious" "anxious" "anxious" "anxious" ...

table(dat_groups$group)
# Anxious Less anxious 
#       9            9

cols <- c( "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2" )

#dat_groups$group <- factor(dat_groups$group, levels = c("anxious", "less anxious"), labels = c("Anxious", "Less anxious") )
#dat_groups$group <- factor(dat_groups$group, levels = c("anxious", "less anxious"), labels = c("1/3 most\nanxious", "1/3 least\nanxious") )
dat_groups$group <- factor(dat_groups$group, levels = c("anxious", "less anxious"), labels = c("Female\n1/3 most anxious", "Female\n1/3 least anxious") )

# annotation for r and n
anno <- data.frame(x = c(1.1, 1.1),
                   y = c(5, 140),
                   #group = c("Anxious", "Less anxious"),
                   #group = c("1/3 most\nanxious", "1/3 least\nanxious"),
                   group = c("Female\n1/3 most anxious", "Female\n1/3 least anxious"),
                   r = paste0("r = ", round( c(0.5847493, -0.389352),digits = 2 )),
                   n = paste0("n = ", c(9,9))
)


p <- ggplot(data=dat_groups, aes(x=percent_all_KA, y= Centre_time, color = Treatment)) + 
  geom_point(size=1) + scale_colour_manual(values = cols) +
  #stat_smooth(aes(x=percent_all_KA, y= Centre_time), inherit.aes = FALSE, show.legend = FALSE) +
  geom_smooth(aes(x=percent_all_KA, y= Centre_time), method='lm', inherit.aes = FALSE, show.legend = FALSE, fill = "#bdbdbd", size = 0.5) +
  
  #geom_text( data = anno, aes(x=x, y=y, label = r), size = 2.5, inherit.aes = FALSE) +
  geom_text( data = anno, aes(x=x, y=y, label = paste0(r,"\n",n)), size = 2.5, inherit.aes = FALSE, lineheight=0.9) +
  
  theme_classic() +
  #labs(x = "Total % OTU rel. abun. K. alysoides", y = "Time in centre (s)") +
  #labs(x = parse(text='"Total % OTU rel. abun. "*italic("K. alysoides")'), y = "Time in centre (s)") +
  labs(x = NULL, y = "Time in centre (s)") +
  
  theme(legend.position="right") + 
  #facet_grid(~group) + #, scales="free_x", space="free_x") +
  facet_wrap(~group, scales = "free_y") + #, scales="free_x", space="free_x") +
  
  theme(
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    #axis.title.x = element_text(margin=margin(t = 2,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    axis.title.x = element_blank(),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 5,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
    
  )
p

grid.text(label = "B" , x = unit(0.03, "npc") , y = unit(0.95,"npc"), gp=gpar(fontsize=11, fontface="bold"))


dev.print(tiff, file = paste0("plots/","Total-OTU-rel-abun-KA-vs-Base-anxiety-Anxious-vs-Less-anxious-B-vFINAL.tiff"), width = 12, height = 5, units = "cm", res=600, compression="lzw")








### Plot X most anxious in males
### Analyse total % of K. alysoides-like OTUs in female mice with the lowest Open Field centre times

## use % OTU relative abundance phyloseq objects
dim( relabun.fecalw7.Cont.dc.16s@sam_data ) # 18 20
dim( relabun.cecal.Cont.dc.16s@sam_data ) # 18 20

dim( relabun.fecalw7.Low.dc.16s@sam_data ) # 17 20
dim( relabun.cecal.Low.dc.16s@sam_data ) # 17 20

dim( relabun.fecalw7.High.dc.16s@sam_data ) # 18 20
dim( relabun.cecal.High.dc.16s@sam_data ) # 18 20


## identify subjects
str(of.post.COPY)
sel <- which(of.post.COPY$Sex=="Male")
beh_male <- of.post.COPY[sel, ]
str(beh_male)
#'data.frame':	27 obs. of  38 variables:



dat <- data.frame(mouseID = c(as.character(relabun.fecalw7.Cont.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.Cont.dc.16s@sam_data$mouseID),
                              as.character(relabun.fecalw7.Low.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.Low.dc.16s@sam_data$mouseID),
                              as.character(relabun.fecalw7.High.dc.16s@sam_data$mouseID),
                              as.character(relabun.cecal.High.dc.16s@sam_data$mouseID)),
                  Treatment = c(rep("Control",times=18*2),
                                rep("Low",times=17*2),
                                rep("High",times=18*2)),
                  Sample_type = c(rep("Faecal w7",times=18),rep("Caecal",times=18),
                                  rep("Faecal w7",times=17),rep("Caecal",times=17),
                                  rep("Faecal w7",times=18),rep("Caecal",times=18))
)

dat$percent_all_KA <- NA
dat$Centre_time <- NA

## list all KA-like OTUs

all_KA_OTUs <- unique( c( as.character( df_blast.Cont.all_KA$otu ), as.character( df_blast.Low.all_KA$otu ), as.character( df_blast.High.all_KA$otu ) ) )
all_KA_OTUs
# [1] "OTU_143"  "OTU_481"  "OTU_531"  "OTU_5971" "OTU_9505" "OTU_9639" "OTU_37"   "OTU_60"   "OTU_5520"
# [10] "OTU_5790"

names(dat) # "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"

for (i in 1:dim(dat)[1]) {
  # i<-1
  
  if (dat$Treatment[i]=="Control" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.Cont.dc.16s
  } else if (dat$Treatment[i]=="Control" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Cont.dc.16s
  } else if (dat$Treatment[i]=="Low" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.Low.dc.16s
  } else if (dat$Treatment[i]=="Low" & dat$Sample_type[i]=="Caecal") {
    phy_obj <- relabun.cecal.Low.dc.16s
  } else if (dat$Treatment[i]=="High" & dat$Sample_type[i]=="Faecal w7") {
    phy_obj <- relabun.fecalw7.High.dc.16s
    # else: #(dat$Treatment[i]=="High" & dat$Sample_type[i]=="Caecal")
  } else  {
    phy_obj <- relabun.cecal.High.dc.16s
  }
  
  
  this_subject <- as.character( dat$mouseID[i] )  # this_subject: "C1m3"
  
  sel_phy_obj <- prune_samples( phy_obj@sam_data$mouseID == this_subject, phy_obj )
  # sample_names(sel_phy_obj): "C1m3T16"
  
  sel_KA <- prune_taxa( taxa_names(sel_phy_obj) %in% all_KA_OTUs, sel_phy_obj )
  
  #sel_KA@otu_table
  dat$percent_all_KA[i] <- sum( taxa_sums(sel_KA@otu_table) )
  
  sel <- which(as.character(beh_male$ID) == as.character(dat$mouseID[i]))
  
  if (length(sel)==1) {
    dat$Centre_time[i] <- beh_male$Centre_time[sel]
  }
  
  print(paste0("mouse: ", dat$mouseID[i]," ",beh_male$ID[sel] ))
  
}

## now females are not present in 'dat'

ok <- complete.cases(dat)
sel <- which(ok==TRUE)
dat <- dat[sel, ]


dat$lowest_2 <- NA
dat$lowest_3 <- NA
dat$lowest_4 <- NA
dat$lowest_5 <- NA

x <- 2

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_2=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_2=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.4160884

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.6276367


# # #

x <- 3

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_3=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_3=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.05388064

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.4841859


# # #

x <- 4

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_4=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_4=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.1830594

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.6719421


# # #

x <- 5

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Control" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="Low" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Faecal w7")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Faecal w7"

sel <- which(dat$Treatment=="High" & dat$Sample_type=="Caecal")
subsel <- which(dat$Centre_time[sel] %in% sort(dat$Centre_time[sel])[1:x])
dat[sel[subsel], paste0("lowest_",x)] <- "Caecal"

choose <- which(dat$lowest_5=="Faecal w7")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

choose <- which(dat$lowest_5=="Caecal")
plot(dat$percent_all_KA[choose], dat$Centre_time[choose])

# Faecal w7
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # -0.08933896

# Caecal
cor(dat$percent_all_KA[choose], dat$Centre_time[choose]) # 0.5919562



## Poor relationship found for faecal w7.
## Strip back to Caecal samples


dat_c <- dat[ which(dat$Sample_type=="Caecal"), ]

names(dat_c)
# [1] "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"    "lowest_2"      
# [7] "lowest_3"       "lowest_4"       "lowest_5" 

sel <- which(dat_c$lowest_2=="Caecal")
dat_c$lowest_2[sel] <- 2

sel <- which(dat_c$lowest_3=="Caecal")
dat_c$lowest_3[sel] <- 3

sel <- which(dat_c$lowest_4=="Caecal")
dat_c$lowest_4[sel] <- 4

sel <- which(dat_c$lowest_5=="Caecal")
dat_c$lowest_5[sel] <- 5



## reformat for plotting with ggplot facet_grid

names(dat_c)
# [1] "mouseID"        "Treatment"      "Sample_type"    "percent_all_KA" "Centre_time"    "lowest_2"      
# [7] "lowest_3"       "lowest_4"       "lowest_5"   

new_names <- c("mouseID", "Treatment", "Sample_type", "percent_all_KA", "Centre_time", "lowest_X")

get_cols <- c("lowest_2", "lowest_3", "lowest_4", "lowest_5")
most_anxious_x <- c(2:5)
length(most_anxious_x) # 4

dat_c.plot <- data.frame(matrix(nrow = 1, ncol = length( new_names )))
names(dat_c.plot) <- new_names

for (i in 1:length(most_anxious_x)) {
  #i<-1
  # getsubset of dataframe which examines most anxious X mice in each treatment
  dat_sel <- dat_c[  ,c( names(dat_c)[1:5],get_cols[i] ) ]
  
  subsel <- which(dat_sel[ , get_cols[i] ] == most_anxious_x[i])
  
  names(dat_sel) <- names(dat_c.plot)
  
  dat_c.plot <- rbind(dat_c.plot, dat_sel[subsel, ])
  
}

# remove NA first row
dat_c.plot <- dat_c.plot[-1, ]

str(dat_c.plot)
# 'data.frame':	42 obs. of  6 variables:
# $ mouseID       : chr  "C6m3" "C1m1" "L2m2" "L4m1" ...
# $ Treatment     : chr  "Control" "Control" "Low" "Low" ...
# $ Sample_type   : chr  "Caecal" "Caecal" "Caecal" "Caecal" ...
# $ percent_all_KA: num  2.069 0.679 1.269 1.397 0.708 ...
# $ Centre_time   : num  21.8 13.2 9 11.4 13.8 21.2 37.5 21.8 13.2 13.2 ...
# $ lowest_X      : chr  "2" "2" "2" "2" ...

3*2 + 3*3 + 3*4 + 3*5 # 42

temp <- dat_c.plot

cols <- c( "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2" )

levels(factor(dat_c.plot$lowest_X)) # [1] "2" "3" "4" "5"
#dat_c.plot$lowest_X <- factor(dat_c.plot$lowest_X, levels = c("2", "3", "4", "5"), labels = c("2 most\nanxious", "3 most\nanxious", "4 most\nanxious", "5 most\nanxious") )
dat_c.plot$lowest_X <- factor(dat_c.plot$lowest_X, levels = c("2", "3", "4", "5"), labels = c("Male\n2 most anxious", "Male\n3 most anxious", "Male\n4 most anxious", "Male\n5 most anxious") )

# annotation for r and n
anno <- data.frame(x = c(2.0, 2.0, 2.0, 2.0),
                   y = c(0, 0, 0, 0),
                   #lowest_X = c("2 most\nanxious", "3 most\nanxious", "4 most\nanxious", "5 most\nanxious"),
                   lowest_X = c("Male\n2 most anxious", "Male\n3 most anxious", "Male\n4 most anxious", "Male\n5 most anxious"),
                   r = paste0("r = ", round( c(0.6276367, 0.4841859, 0.6719421, 0.5919562),digits = 2 )),
                   n = paste0("n = ", c(2*3, 3*3, 4*3, 5*3))
)


p <- ggplot(data=dat_c.plot, aes(x=percent_all_KA, y= Centre_time, color = Treatment)) + 
  geom_point(size=1) + scale_colour_manual(values = cols) +
  #stat_smooth(aes(x=percent_all_KA, y= Centre_time), inherit.aes = FALSE, show.legend = FALSE) +
  geom_smooth(aes(x=percent_all_KA, y= Centre_time), method='lm', inherit.aes = FALSE, show.legend = FALSE, fill = "#bdbdbd", size = 0.5) +
  
  #geom_text( data = anno, aes(x=x, y=y, label = r), size = 2.5, inherit.aes = FALSE) +
  geom_text( data = anno, aes(x=x, y=y, label = paste0(r,"\n",n)), size = 2.5, inherit.aes = FALSE, lineheight=0.9) +
  
  theme_classic() +
  #labs(x = "Total % OTU rel. abun. K. alysoides", y = "Time in centre (s)") +
  labs(x = parse(text='"Total % OTU rel. abun. "*italic("K. alysoides")'), y = "Time in centre (s)") +
  
  #theme(legend.position="bottom") + 
  theme(legend.position="none") + 
  
  facet_grid(~lowest_X) + #, scales="free_x", space="free_x") +
  
  theme(
    plot.margin = margin(t = 5, r = 5, b = 0, l = 5, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.ticks.length=unit(-3, "pt"),
    
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), vjust=0.5, size = rel(0.7)),
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 5,"pt"), size = rel(0.6)),
    axis.title.x = element_text(margin=margin(t = 2,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7))  #,
    
    # legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    # legend.key.size = unit(5, "pt"),
    # legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    # legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    # legend.margin = margin(t = 0,r = 0,b = 5,l = 0,"pt"),
    # legend.box.spacing = unit(5, "pt")
    
  )
p


grid.text(label = "C" , x = unit(0.03, "npc") , y = unit(0.95,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Total-OTU-rel-abun-KA-vs-Base-anxiety-Most-anxious-in-Treatments-MALES-vFINAL.tiff"), width = 12, height = 5, units = "cm", res=600, compression="lzw")










### Compare all K. alysoides that increase between Wk0 to Wk7
### examine quantity and relative abundance of OTUs

unique(clean.16s@sam_data$samp_type)
# "cecal"         "fecal"         "air"           "soil"          "fresh bedding"

fecalw0w7cecal.dc.16s <- prune_samples(  clean.16s@sam_data$samp_type %in% c("fecal","cecal"),  clean.16s )
fecalw0w7cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2730 taxa and 160 samples ]
# sample_data() Sample Data:       [ 160 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2730 taxa by 7 taxonomic ranks ]

min(sample_sums(fecalw0w7cecal.dc.16s)) # 11450
min(taxa_sums(fecalw0w7cecal.dc.16s)) # 0

# prune taxa that have zero sequence reads
fecalw0w7cecal.dc.16s <- prune_taxa(taxa = taxa_sums(fecalw0w7cecal.dc.16s) > 0, x = fecalw0w7cecal.dc.16s)
fecalw0w7cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 817 taxa and 160 samples ]
# sample_data() Sample Data:       [ 160 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 817 taxa by 7 taxonomic ranks ]

min(taxa_sums(fecalw0w7cecal.dc.16s)) # 1


table(fecalw0w7cecal.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6
#  9  9  9  9  9  9  9  9  9  9  9  9  8  8  9  9  9  9

table(fecalw0w7cecal.dc.16s@sam_data$Sex)
# female   male
#     80     80


## Normalise by converting to relative abundance ...

phy_obj <- fecalw0w7cecal.dc.16s

relabun.phy_obj <- transform_sample_counts(phy_obj, function(x) 100*x / sum(x) )

relabun.phy_obj
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 817 taxa and 160 samples ]
# sample_data() Sample Data:       [ 160 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 817 taxa by 7 taxonomic ranks ]


# prune taxa - keep only K. alysoides that increase between Wk0 to Wk7

all_KA_OTUs
# [1] "OTU_143"  "OTU_481"  "OTU_531"  "OTU_5971" "OTU_9505" "OTU_9639" "OTU_37"   "OTU_60"   "OTU_5520"
# [10] "OTU_5790"

KAonly.relabun.phy_obj <- prune_taxa( taxa_names(relabun.phy_obj) %in% all_KA_OTUs, relabun.phy_obj )
KAonly.relabun.phy_obj
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 10 taxa and 160 samples ]
# sample_data() Sample Data:       [ 160 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 10 taxa by 7 taxonomic ranks ]



dat <- data.frame(samp_id=sample_names(KAonly.relabun.phy_obj) )

dat$Treatment <- substring(text = dat$samp_id, first = 1, last = 1)
dat$Treatment <- factor(dat$Treatment, levels=c("C","L","H"), labels = c("Control","Low","High"), ordered = TRUE)
dat$mouseID <- substring(text = dat$samp_id, first = 1, last = 4)

str(mice_info)
dat <- merge.data.frame(x=dat, y=mice_info[ ,c("mouseID","Sex")], by = "mouseID" )
dat$Sex <- factor(dat$Sex, levels=c("female","male"), labels=c("Female","Male"), ordered = TRUE)

dat$samp_type <- NA
dat$Samp_group <- NA

sel <- grep(pattern = "T01", x = dat$samp_id)
dat$samp_type[sel] <- "Faecal w0"
dat$Samp_group[sel] <- "Faecal"

sel <- grep(pattern = "T16", x = dat$samp_id)
dat$samp_type[sel] <- "Faecal w7"
dat$Samp_group[sel] <- "Faecal"

sel <- grep(pattern = "Ce", x = dat$samp_id)
dat$samp_type[sel] <- "Caecal"
dat$Samp_group[sel] <- "Caecal"

#dat$group <- paste0(dat$Sex,"\n",dat$Samp_group)
dat$group <- paste0(dat$Sex," ",dat$Samp_group)

str(dat)
# 'data.frame':	160 obs. of  7 variables:
# $ mouseID   : chr  "C1m1" "C1m1" "C1m1" "C1m2" ...
# $ samp_id   : Factor w/ 160 levels "C1m1Ce","C1m1T01",..: 1 2 3 5 4 6 8 9 7 10 ...
# $ Treatment : Ord.factor w/ 3 levels "Control"<"Low"<..: 1 1 1 1 1 1 1 1 1 1 ...
# $ Sex       : Ord.factor w/ 2 levels "Female"<"Male": 2 2 2 2 2 2 2 2 2 1 ...
# $ samp_type : chr  "Caecal" "Faecal w0" "Faecal w7" "Faecal w0" ...
# $ Samp_group: chr  "Caecal" "Faecal" "Faecal" "Faecal" ...
# $ group     : chr  "Male\nCaecal" "Male\nFaecal" "Male\nFaecal" "Male\nFaecal" ...

dim(dat) #  160   7


## now get individual taxa

taxa_names(KAonly.relabun.phy_obj)
# [1] "OTU_37"   "OTU_60"   "OTU_143"  "OTU_481"  "OTU_531"  "OTU_5520" "OTU_5790" "OTU_5971" "OTU_9505"
# [10] "OTU_9639"

dat$OTU_37 <- NA
dat$OTU_60  <- NA
dat$OTU_143 <- NA
dat$OTU_481  <- NA
dat$OTU_531  <- NA
dat$OTU_5520  <- NA
dat$OTU_5790  <- NA
dat$OTU_5971  <- NA
dat$OTU_9505 <- NA
dat$OTU_9639 <- NA


# for (s in 1:length(dat$samp_id)) {
#   #s<-1
#   this_samp <- prune_samples( sample_names(KAonly.relabun.phy_obj) == as.character(dat$samp_id[s]), KAonly.relabun.phy_obj )
#   dat[ s , taxa_names(KAonly.relabun.phy_obj) ]  <- as.numeric( taxa_sums(this_samp) )
# }


KA_taxa_names <- taxa_names(KAonly.relabun.phy_obj)

for (s in 1:length(dat$samp_id)) {
  #s<-1
  this_samp <- prune_samples( sample_names(KAonly.relabun.phy_obj) == as.character(dat$samp_id[s]), KAonly.relabun.phy_obj )
  
  # consider only the K. alysoides that increase within each treatment
  
  treatment <- as.character( dat$Treatment[s] )
  
  if (treatment == "Control") {
    KA_list <- df_blast.Cont.all_KA$otu
  } else if (treatment == "Low") {
    KA_list <- df_blast.Low.all_KA$otu
  } else {
    KA_list <- df_blast.High.all_KA$otu
  }
  
  subsel <- which(KA_taxa_names %in% KA_list)
  
  dat[ s , KA_taxa_names[subsel] ]  <- as.numeric( taxa_sums(this_samp) )[subsel]
  
  # set NA values as zero
  dat[ s , KA_taxa_names[-subsel] ] <- 0
}



dat$Sex_Treatment_Samp_type <- paste0(dat$Sex,"_",dat$Treatment,"_",dat$samp_type)


batches <- unique(dat$Sex_Treatment_Samp_type)
batches
# [1] "Male_Control_Caecal"      "Male_Control_Faecal w0"   "Male_Control_Faecal w7"   "Female_Control_Caecal"   
# [5] "Female_Control_Faecal w0" "Female_Control_Faecal w7" "Male_High_Caecal"         "Male_High_Faecal w7"     
# [9] "Male_High_Faecal w0"      "Female_High_Faecal w0"    "Female_High_Faecal w7"    "Female_High_Caecal"      
# [13] "Female_Low_Caecal"        "Female_Low_Faecal w0"     "Female_Low_Faecal w7"     "Male_Low_Caecal"         
# [17] "Male_Low_Faecal w7"       "Male_Low_Faecal w0"   

## retrieve data in these batches

datnew <- data.frame(otu=NA,Treatment=NA,rel_abun_mean=NA,n_nonzero=NA,n=NA,samp_type=NA,percent_prev=NA,Sex_Treatment_Samp_type=NA,group=NA)
names(datnew)
# [1] "otu"                     "Treatment"               "rel_abun_mean"           "n_nonzero"              
# [5] "n"                       "samp_type"               "percent_prev"            "Sex_Treatment_Samp_type"
# [9] "group" 

for (i in 1:length(batches)) {
  #i<-1
  sel <- which(dat$Sex_Treatment_Samp_type %in% batches[i])
  
  for (z in 1:length( taxa_names(KAonly.relabun.phy_obj) )) {
    #z<-1
    this_otu <- taxa_names(KAonly.relabun.phy_obj)[z]
    rownew <- data.frame(otu=this_otu,Treatment=NA,rel_abun_mean=NA,n_nonzero=NA,n=NA,samp_type=NA,percent_prev=NA,Sex_Treatment_Samp_type=NA,group=NA)
    
    rownew$Treatment <- as.character( unique(dat$Treatment[sel]) )
    rownew$rel_abun_mean <- mean(dat[ sel , this_otu ])
    rownew$n_nonzero <- length(which( dat[ sel , this_otu ] >0 ))
    rownew$n <- length( dat[ sel , this_otu ] )
    rownew$samp_type <- as.character( unique(dat$samp_type[sel]) )
    rownew$percent_prev <- 100*rownew$n_nonzero/rownew$n
    rownew$Sex_Treatment_Samp_type <- as.character( unique(dat$Sex_Treatment_Samp_type[sel]) )
    rownew$group <- as.character( unique(dat$group[sel]) )
    
    datnew <- rbind(datnew, rownew)
    
    print(paste0("completed ",z, " of ", length( taxa_names(KAonly.relabun.phy_obj) )))
    
  }
  
  print(paste0("------------------------"))
  print(paste0("completed ",i, " = batch: ", batches[i]  ))
  print(paste0("------------------------"))
  
}

# remove NA 1st row
datnew <- datnew[-1, ]

# log10 mean rel. abun.
datnew$log10_mean_rel_abun <- log10(datnew$rel_abun_mean)

# lookup pident ?

lkup <- rbind(df_blast.Cont.all_KA[ ,c("otu","pident")],
              df_blast.Low.all_KA[ ,c("otu","pident")],
              df_blast.High.all_KA[ ,c("otu","pident")])

sel.dup <-which(duplicated(lkup$otu))
lkup <- lkup[-sel.dup, ]

temp <- datnew
datnew <- temp

datnew <- merge(x=datnew,y=lkup,by="otu", all.x=TRUE )

datnew$pident <- as.character(datnew$pident)

# remove cases with 0% prevalence
sel <- which(datnew$percent_prev==0)

datnew <- datnew[-sel, ]


str(datnew)
# 'data.frame':	72 obs. of  11 variables:
# $ otu                    : chr  "OTU_143" "OTU_143" "OTU_143" "OTU_143" ...
# $ Treatment              : chr  "Control" "Control" "Control" "Control" ...
# $ rel_abun_mean          : num  0.0616 0.00497 0.00171 0.04066 0.03048 ...
# $ n_nonzero              : int  6 1 2 3 4 6 5 9 9 9 ...
# $ n                      : int  9 9 9 9 9 9 9 9 9 9 ...
# $ samp_type              : chr  "Faecal w7" "Faecal w0" "Faecal w0" "Faecal w7" ...
# $ percent_prev           : num  66.7 11.1 22.2 33.3 44.4 ...
# $ Sex_Treatment_Samp_type: chr  "Male_Control_Faecal w7" "Female_Control_Faecal w0" "Male_Control_Faecal w0" "Female_Control_Faecal w7" ...
# $ group                  : chr  "Male\nFaecal" "Female\nFaecal" "Male\nFaecal" "Female\nFaecal" ...
# $ log10_mean_rel_abun    : num  -1.21 -2.3 -2.77 -1.39 -1.52 ...
# $ pident                 : chr  "95" "95" "95" "95" ...



unique(datnew$group) # "Male Faecal"   "Female Faecal" "Female Caecal" "Male Caecal"   

datnew$group <- factor(datnew$group, levels = c( "Female Faecal", "Female Caecal", "Male Faecal", "Male Caecal" ),
                       labels = c( "Female Faeces", "Female Caeca", "Male Faeces", "Male Caeca" ),
                       ordered = TRUE)
datnew$link <- paste0(datnew$otu,"_",datnew$Treatment,"_",datnew$group)


cols <- c( "Control" ="#f46d43", "Low" ="#66c2a5", "High" ="#5e4fa2" )
shapes <- c( "Faecal w0"=1, "Faecal w7"=16, "Caecal"=17 )

pid_sizes <- c("92"= 1.5, "94"= 2, "95"= 2.5, "96"= 3, "97"= 3.5)

str(datnew)

temp <- datnew


p <- ggplot(data=datnew, aes(x=log10_mean_rel_abun, y = percent_prev, color = Treatment, shape = samp_type)) +
  geom_point(aes(size = pident)) +
  geom_line(aes(group=link)) +
  scale_shape_manual(values = shapes, name = "Sample\ntype") +
  scale_size_manual(values = pid_sizes, name = "Percent\nidentity") +
  scale_color_manual(values = cols) +
  theme_classic() +
  labs(x = "log10(mean % OTU rel. abun.)", y = "Prevalence (%)" ) + 
  facet_wrap(~group) + 
  guides(colour = guide_legend(order = 1), 
         shape = guide_legend(order = 2),
         size = guide_legend(order = 3)) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.8)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.x = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 5,l = 0,"pt"), size = rel(0.8)),
    
    legend.key.size = unit(2, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    legend.box.spacing = unit(2, "pt"),
    legend.box.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.spacing = unit(4, "pt")
    
  )

p

ggsave(plot=p, filename = paste0("plots/","All_KA_pc_prevalence-vs-log10-mean-pc-OTU-rel-abun-Male-Female-Faecal-Caecal-vFINAL.tiff"), width = 10, height = 10, units = "cm", dpi = 600, compression = "lzw")


#------------------------





#### Composition Relative abundance bar plots
#------------------------

# compute % OTU Relative Abundance for:
soil.dc.16s
air.dc.16s
fecal.w1w7.dc.16s
cecal.dc.16s

### Soils
relabun.soil.dc.16s <- transform_sample_counts(soil.dc.16s, function(x) 100*x / sum(x) )
relabun.soil.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2345 taxa and 65 samples ]
# sample_data() Sample Data:       [ 65 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2345 taxa by 7 taxonomic ranks ]

# agglomerate at level of Phylum
relabun.soil.dc.16s <- tax_glom(physeq = relabun.soil.dc.16s, taxrank = "Phylum")

relabun.soil.dc.16s@sam_data$sample
relabun.soil.dc.16s@sam_data$Treatment

# compare only homogenised treatment soils
sel <- grep(pattern = "Clr|Rem", x = soil.dc.16s@sam_data$sample)
soil.dc.16s@sam_data$sample[sel]
soilw1w7 <- soil.dc.16s@sam_data$sample[-sel]
soilw1w7

relabun.soilw1w7.dc.16s <- prune_samples( sample_names(relabun.soil.dc.16s) %in% soilw1w7, relabun.soil.dc.16s )
relabun.soilw1w7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 24 taxa and 47 samples ]
# sample_data() Sample Data:       [ 47 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 24 taxa by 7 taxonomic ranks ]


p.soil<-plot_bar( relabun.soilw1w7.dc.16s , x = "sample", fill = "Phylum", facet_grid = ~Treatment + Time )

# Identify which Phyla are rare?
hist(p.soil$data$Abundance)

levels(as.factor(as.character( p.soil$data$Phylum  )))
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Armatimonadetes"  "p__Bacteroidetes"   
# [6] "p__BHI80-139"        "p__Chlamydiae"       "p__Chlorobi"         "p__Chloroflexi"      "p__Cyanobacteria"   
# [11] "p__Elusimicrobia"    "p__Fibrobacteres"    "p__Firmicutes"       "p__Gemmatimonadetes" "p__GN02"            
# [16] "p__Nitrospirae"      "p__OD1"              "p__Planctomycetes"   "p__Proteobacteria"   "p__Tenericutes"     
# [21] "p__TM6"              "p__TM7"              "p__Verrucomicrobia"  "p__WPS-2"  

levels(as.factor(as.character( p.soil$data$Phylum[which(p.soil$data$Abundance >= 0.5)]  )))
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Bacteroidetes"    "p__Chloroflexi"     
# [6] "p__Firmicutes"       "p__Gemmatimonadetes" "p__Planctomycetes"   "p__Proteobacteria"   "p__Verrucomicrobia" 
# [11] "p__WPS-2" 

pp <- p.soil +
  facet_grid(~Treatment+Time, scales="free_x") # , space="free_x"
pp


## only display phyla >= 0.5%
major_phyla <- levels(as.factor(as.character( p.soil$data$Phylum[which(p.soil$data$Abundance >= 0.5)]  )))
sel.row <- which(p.soil$data$Phylum %in% major_phyla)
100*sum( p.soil$data$Abundance[sel.row] )/sum(p.soil$data$Abundance) # 99.66032 % of relative abundance is covered
100 - 99.66032 # 0.33968 % left representing rare phyla

p.soil.plot <- p.soil
class(p.soil.plot$data$Phylum) # "factor"
p.soil.plot$data$Phylum <- as.character(p.soil.plot$data$Phylum)
p.soil.plot$data$Phylum[-sel.row] <- "Other minor phyla"
levels(factor(p.soil.plot$data$Phylum))
# [1] "Other minor phyla"   "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Bacteroidetes"   
# [6] "p__Chloroflexi"      "p__Firmicutes"       "p__Gemmatimonadetes" "p__Planctomycetes"   "p__Proteobacteria"  
# [11] "p__Verrucomicrobia"  "p__WPS-2"    
p.soil.plot$data$Phylum <- factor(p.soil.plot$data$Phylum, levels = c(
  "p__Acidobacteria" ,   "p__Actinobacteria" ,  "p__AD3"            ,  "p__Bacteroidetes" ,  
  "p__Chloroflexi"    ,  "p__Firmicutes"      , "p__Gemmatimonadetes", "p__Planctomycetes" ,  "p__Proteobacteria"  ,
  "p__Verrucomicrobia" , "p__WPS-2"    ,
  "Other minor phyla"),
  labels = c(
    "Acidobacteria" ,   "Actinobacteria" ,  "AD3"            ,  "Bacteroidetes" ,  
    "Chloroflexi"    ,  "Firmicutes"      , "Gemmatimonadetes", "Planctomycetes" ,  "Proteobacteria"  ,
    "Verrucomicrobia" , "WPS-2"    ,
    "Other minor phyla"
  ),ordered = TRUE
  
)

p.soil.plot$data$Treatment <- factor(p.soil.plot$data$Treatment, levels = c("Low","High"),ordered = TRUE)

pp <- p.soil.plot +
  theme_classic() +
  facet_grid(~Treatment+Time, scales="free_x", margins = unit(2,"pt")) +
  scale_fill_manual(values = cols.phlya.16S) +
  ylab("OTU relative abundance (%)") +
  xlab(NULL) +
  geom_bar(stat = "identity", size=0.5) +
  scale_y_continuous(expand = expand_scale(mult = .01)) +  # default .05
  scale_x_discrete(expand = expand_scale(add = .1)) +      # default .6
  theme(
    rect = element_rect(size = 0.5),
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.text.x = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), angle=90,size = rel(0.7), vjust=0.5 ), #  , hjust=0.5, 
    axis.text.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 2,l = 0,"pt"), vjust = 0, size = rel(0.7)),
    axis.line.x.bottom = element_blank(),
    axis.line.y.left = element_blank(),
    axis.ticks.x.bottom = element_blank(),
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
  )

pp$data$sample <- substring(pp$data$Sample, first =1, last =4)
pp
grid.text(label = "A", x = unit(0.015, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Phylum-barplots-Soils-vFINAL.tiff"), width = 18, height = 5.8, units = "cm", res=600, compression="lzw")





### Air
relabun.air.dc.16s <- transform_sample_counts(air.dc.16s, function(x) 100*x / sum(x) )
relabun.air.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 2073 taxa and 72 samples ]
# sample_data() Sample Data:       [ 72 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 2073 taxa by 7 taxonomic ranks ]

# agglomerate at level of Phylum
relabun.air.dc.16s <- tax_glom(physeq = relabun.air.dc.16s, taxrank = "Phylum")

relabun.air.dc.16s@sam_data$sample
relabun.air.dc.16s@sam_data$Treatment

p.air<-plot_bar( relabun.air.dc.16s , x = "sample", fill = "Phylum", facet_grid = ~Treatment + Time )

# Identify which Phyla are rare?
hist(p.air$data$Abundance)

levels(as.factor(as.character( p.air$data$Phylum  )))
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Armatimonadetes"  "p__Bacteroidetes"   
# [6] "p__BHI80-139"        "p__Chlamydiae"       "p__Chlorobi"         "p__Chloroflexi"      "p__Cyanobacteria"   
# [11] "p__Deferribacteres"  "p__Elusimicrobia"    "p__Fibrobacteres"    "p__Firmicutes"       "p__Gemmatimonadetes"
# [16] "p__GN02"             "p__Nitrospirae"      "p__OD1"              "p__Planctomycetes"   "p__Proteobacteria"  
# [21] "p__Tenericutes"      "p__TM6"              "p__TM7"              "p__Verrucomicrobia"  "p__WPS-2" 

levels(as.factor(as.character( p.air$data$Phylum[which(p.air$data$Abundance >= 0.5)]  )))
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Bacteroidetes"    "p__Chloroflexi"     
# [6] "p__Cyanobacteria"    "p__Deferribacteres"  "p__Fibrobacteres"    "p__Firmicutes"       "p__Gemmatimonadetes"
# [11] "p__GN02"             "p__Planctomycetes"   "p__Proteobacteria"   "p__Tenericutes"      "p__TM6"             
# [16] "p__TM7"              "p__Verrucomicrobia"  "p__WPS-2"   


pp <- p.air +
  facet_grid(~Treatment+Time, scales="free_x") # , space="free_x"
pp


## only display phyla >= 0.5%
major_phyla <- levels(as.factor(as.character( p.air$data$Phylum[which(p.air$data$Abundance >= 0.5)]  )))
sel.row <- which(p.air$data$Phylum %in% major_phyla)
100*sum( p.air$data$Abundance[sel.row] )/sum(p.air$data$Abundance) # 99.89774 % of relative abundance is covered
100 - 99.89774 # 0.10226 % left representing rare phyla

p.air.plot <- p.air
class(p.air.plot$data$Phylum) # "factor"
p.air.plot$data$Phylum <- as.character(p.air.plot$data$Phylum)
p.air.plot$data$Phylum[-sel.row] <- "Other minor phyla"
levels(factor(p.air.plot$data$Phylum))
# [1] "Other minor phyla"   "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Bacteroidetes"   
# [6] "p__Chloroflexi"      "p__Cyanobacteria"    "p__Deferribacteres"  "p__Fibrobacteres"    "p__Firmicutes"      
# [11] "p__Gemmatimonadetes" "p__GN02"             "p__Planctomycetes"   "p__Proteobacteria"   "p__Tenericutes"     
# [16] "p__TM6"              "p__TM7"              "p__Verrucomicrobia"  "p__WPS-2"
p.air.plot$data$Phylum <- factor(p.air.plot$data$Phylum, levels = c(
  "p__Acidobacteria" ,   "p__Actinobacteria",   "p__AD3"          ,    "p__Bacteroidetes"  , 
  "p__Chloroflexi"    ,  "p__Cyanobacteria"  ,  "p__Deferribacteres",  "p__Fibrobacteres"   , "p__Firmicutes"      ,
  "p__Gemmatimonadetes", "p__GN02"            , "p__Planctomycetes"  , "p__Proteobacteria"   ,"p__Tenericutes"     ,
  "p__TM6"              ,"p__TM7"              ,"p__Verrucomicrobia"  ,"p__WPS-2",
  "Other minor phyla"),
  labels = c(
    "Acidobacteria" ,   "Actinobacteria",   "AD3"          ,    "Bacteroidetes"  , 
    "Chloroflexi"    ,  "Cyanobacteria"  ,  "Deferribacteres",  "Fibrobacteres"   , "Firmicutes"      ,
    "Gemmatimonadetes", "GN02"            , "Planctomycetes"  , "Proteobacteria"   ,"Tenericutes"     ,
    "TM6"              ,"TM7"              ,"Verrucomicrobia"  ,"WPS-2",
    "Other minor phyla"
  ),ordered = TRUE
  
)

p.air.plot$data$Treatment <- factor(p.air.plot$data$Treatment, levels = c("Control","Low","High"),ordered = TRUE)

pp <- p.air.plot +
  theme_classic() +
  facet_grid(~Treatment+Time, scales="free_x", margins = unit(2,"pt")) +
  scale_fill_manual(values = cols.phlya.16S) +
  ylab("OTU relative abundance (%)") +
  xlab(NULL) +
  geom_bar(stat = "identity", size=0.5) +
  scale_y_continuous(expand = expand_scale(mult = .01)) +  # default .05
  scale_x_discrete(expand = expand_scale(add = .1)) +      # default .6
  theme(
    rect = element_rect(size = 0.5),
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.text.x = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), angle=90,size = rel(0.7), vjust=0.5 ), #  , hjust=0.5, 
    axis.text.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 2,l = 0,"pt"), vjust = 0, size = rel(0.7)),
    axis.line.x.bottom = element_blank(),
    axis.line.y.left = element_blank(),
    axis.ticks.x.bottom = element_blank(),
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
    
  )

pp$data$sample <- substring(pp$data$Sample, first =1, last =4)
pp
grid.text(label = "B", x = unit(0.015, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Phylum-barplots-Air-B-vFINAL.tiff"), width = 18, height = 5.8, units = "cm", res=600, compression="lzw")






### Fecal
relabun.fecal.w0w7.dc.16s <- transform_sample_counts(fecal.w0w7.dc.16s, function(x) 100*x / sum(x) )
relabun.fecal.w0w7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 107 samples ]
# sample_data() Sample Data:       [ 107 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]

# agglomerate at level of Phylum
relabun.fecal.w0w7.dc.16s <- tax_glom(physeq = relabun.fecal.w0w7.dc.16s, taxrank = "Phylum")

relabun.fecal.w0w7.dc.16s@sam_data$sample
relabun.fecal.w0w7.dc.16s@sam_data$Treatment

p.fecal<-plot_bar( relabun.fecal.w0w7.dc.16s , x = "sample", fill = "Phylum", facet_grid = ~Treatment )

# Identify which Phyla are rare?
hist(p.fecal$data$Abundance)

levels(as.factor(as.character( p.fecal$data$Phylum  )))
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__Bacteroidetes"    "p__Chloroflexi"      "p__Cyanobacteria"   
# [6] "p__Deferribacteres"  "p__Elusimicrobia"    "p__Fibrobacteres"    "p__Firmicutes"       "p__Gemmatimonadetes"
# [11] "p__OD1"              "p__Planctomycetes"   "p__Proteobacteria"   "p__Tenericutes"      "p__TM7"             
# [16] "p__Verrucomicrobia"  "p__WPS-2" 

levels(as.factor(as.character( p.fecal$data$Phylum[which(p.fecal$data$Abundance >= 0.5)]  )))
# [1] "p__Actinobacteria"  "p__Bacteroidetes"   "p__Deferribacteres" "p__Firmicutes"      "p__Proteobacteria" 

pp <- p.fecal +
  facet_grid(~Treatment + Time, scales="free_x") # , space="free_x"
pp



## only display phyla >= 0.5%
major_phyla <- levels(as.factor(as.character( p.fecal$data$Phylum[which(p.fecal$data$Abundance >= 0.5)]  )))
sel.row <- which(p.fecal$data$Phylum %in% major_phyla)
100*sum( p.fecal$data$Abundance[sel.row] )/sum(p.fecal$data$Abundance) # 99.98005 % of relative abundance is covered
100 - 99.98005 # 0.01995 % left representing rare phyla

p.fecal.plot <- p.fecal
class(p.fecal.plot$data$Phylum) # "factor"
p.fecal.plot$data$Phylum <- as.character(p.fecal.plot$data$Phylum)
p.fecal.plot$data$Phylum[-sel.row] <- "Other minor phyla"
levels(factor(p.fecal.plot$data$Phylum))
# [1] "Other minor phyla"  "p__Actinobacteria"  "p__Bacteroidetes"   "p__Deferribacteres" "p__Firmicutes"     
# [6] "p__Proteobacteria" 
p.fecal.plot$data$Phylum <- factor(p.fecal.plot$data$Phylum, levels = c(
  "p__Actinobacteria",  "p__Bacteroidetes" ,  "p__Deferribacteres" , "p__Firmicutes"  ,
  "p__Proteobacteria" ,
  "Other minor phyla"),
  labels = c(
    "Actinobacteria",  "Bacteroidetes" ,  "Deferribacteres" , "Firmicutes"  ,
    "Proteobacteria" ,
    "Other minor phyla"
  ),ordered = TRUE
  
)

p.fecal.plot$data$Treatment <- factor(p.fecal.plot$data$Treatment, levels = c("Control","Low","High"),ordered = TRUE)

pp <- p.fecal.plot +
  theme_classic() +
  facet_grid(~Treatment+Time, scales="free_x", margins = unit(2,"pt")) +
  scale_fill_manual(values = cols.phlya.16S) +
  ylab("OTU relative abundance (%)") +
  xlab(NULL) +
  geom_bar(stat = "identity", size=0.5) +
  scale_y_continuous(expand = expand_scale(mult = .01)) +  # default .05
  scale_x_discrete(expand = expand_scale(add = .1)) +      # default .6
  theme(
    rect = element_rect(size = 0.5),
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.text.x = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), angle=90,size = rel(0.6), vjust=0.5), #  , hjust=0.5, 
    axis.text.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 2,l = 0,"pt"), vjust = 0, size = rel(0.7)),
    axis.line.x.bottom = element_blank(),
    axis.line.y.left = element_blank(),
    axis.ticks.x.bottom = element_blank(),
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
    
  )

pp$data$sample <- substring(pp$data$Sample, first =1, last =4)
pp
grid.text(label = "C", x = unit(0.015, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Phylum-barplots-Fecal-C-vFINAL.tiff"), width = 18, height = 5.8, units = "cm", res=600, compression="lzw")





### Cecal
relabun.cecal.dc.16s <- transform_sample_counts(cecal.dc.16s, function(x) 100*x / sum(x) )
relabun.cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 550 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 550 taxa by 7 taxonomic ranks ]

# agglomerate at level of Phylum
relabun.cecal.dc.16s <- tax_glom(physeq = relabun.cecal.dc.16s, taxrank = "Phylum")

relabun.cecal.dc.16s@sam_data$sample
relabun.cecal.dc.16s@sam_data$Treatment

p.cecal<-plot_bar( relabun.cecal.dc.16s , x = "sample", fill = "Phylum", facet_grid = ~Treatment )

# Identify which Phyla are rare?
hist(p.cecal$data$Abundance)

levels(as.factor(as.character( p.cecal$data$Phylum  )))
# [1] "p__Acidobacteria"   "p__Actinobacteria"  "p__Bacteroidetes"   "p__Chloroflexi"     "p__Cyanobacteria"  
# [6] "p__Deferribacteres" "p__Firmicutes"      "p__Proteobacteria"  "p__Tenericutes"     "p__TM7"            
# [11] "p__Verrucomicrobia"

levels(as.factor(as.character( p.cecal$data$Phylum[which(p.cecal$data$Abundance >= 0.5)]  )))
#[1] "p__Actinobacteria" "p__Bacteroidetes"  "p__Firmicutes"     "p__Proteobacteria" "p__TM7"  


pp <- p.cecal +
  facet_grid(~Treatment, scales="free_x") # , space="free_x"
pp



## only display phyla >= 0.5%
major_phyla <- levels(as.factor(as.character( p.cecal$data$Phylum[which(p.cecal$data$Abundance >= 0.5)]  )))


sel.row <- which(p.cecal$data$Phylum %in% major_phyla)
100*sum( p.cecal$data$Abundance[sel.row] )/sum(p.cecal$data$Abundance) # 99.9694 % of relative abundance is covered
100 - 99.9694 # 0.0306 % left representing rare phyla

p.cecal.plot <- p.cecal
class(p.cecal.plot$data$Phylum) # "factor"
p.cecal.plot$data$Phylum <- as.character(p.cecal.plot$data$Phylum)
p.cecal.plot$data$Phylum[-sel.row] <- "Other minor phyla"
levels(factor(p.cecal.plot$data$Phylum))
# [1] "Other minor phyla" "p__Actinobacteria" "p__Bacteroidetes"  "p__Firmicutes"     "p__Proteobacteria"
# [6] "p__TM7" 
p.cecal.plot$data$Phylum <- factor(p.cecal.plot$data$Phylum, levels = c(
  "p__Actinobacteria", "p__Bacteroidetes",  "p__Firmicutes",    "p__Proteobacteria",
  "p__TM7" ,
  "Other minor phyla"),
  labels = c(
    "Actinobacteria", "Bacteroidetes",  "Firmicutes",    "Proteobacteria",
    "TM7" ,
    "Other minor phyla"
  ),ordered = TRUE
  
)

#p.cecal.plot$data$Treatment <- factor(p.cecal.plot$data$Treatment, levels = c("Control","Low","High"),ordered = TRUE)
p.cecal.plot$data$Treatment <- factor(p.cecal.plot$data$Treatment, levels = c("Control","Low","High"),
                                      labels = c("Control\n","Low\n","High\n"),ordered = TRUE)


pp <- p.cecal.plot +
  theme_classic() +
  facet_grid(~Treatment, scales="free_x", margins = unit(2,"pt")) +
  scale_fill_manual(values = cols.phlya.16S) +
  ylab("OTU relative abundance (%)") +
  xlab(NULL) +
  geom_bar(stat = "identity", size=0.5) +
  scale_y_continuous(expand = expand_scale(mult = .01)) +  # default .05
  scale_x_discrete(expand = expand_scale(add = .1)) +      # default .6
  theme(
    rect = element_rect(size = 0.5),
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.text.x = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), angle=90,size = rel(0.7), vjust=0.5 ), #  , hjust=0.5, 
    axis.text.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    axis.title.y = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 0,r = 0,b = 2,l = 0,"pt"), vjust = 0, size = rel(0.7)),
    axis.line.x.bottom = element_blank(),
    axis.line.y.left = element_blank(),
    axis.ticks.x.bottom = element_blank(),
    legend.box.margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.5)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.7)),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.box.spacing = unit(5, "pt")
    
  )

pp$data$sample <- substring(pp$data$Sample, first =1, last =4)
pp
grid.text(label = "D", x = unit(0.015, "npc") , y = unit(0.96,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Phylum-barplots-Cecal-D-vFINAL.tiff"), width = 18, height = 5.8, units = "cm", res=600, compression="lzw")







### for plotting purposes
all_major_phyla <- unique(
  c(
    levels(as.factor(as.character( p.soil$data$Phylum[which(p.soil$data$Abundance >= 0.5)]  ))),
    levels(as.factor(as.character( p.air$data$Phylum[which(p.air$data$Abundance >= 0.5)]  ))),
    levels(as.factor(as.character( p.fecal$data$Phylum[which(p.fecal$data$Abundance >= 0.5)]  ))),
    levels(as.factor(as.character( p.cecal$data$Phylum[which(p.cecal$data$Abundance >= 0.5)]  )))
    
  ))
length(all_major_phyla) # 18
all_major_phyla
# [1] "p__Acidobacteria"    "p__Actinobacteria"   "p__AD3"              "p__Bacteroidetes"    "p__Chloroflexi"     
# [6] "p__Firmicutes"       "p__Gemmatimonadetes" "p__Planctomycetes"   "p__Proteobacteria"   "p__Verrucomicrobia" 
# [11] "p__WPS-2"            "p__Cyanobacteria"    "p__Deferribacteres"  "p__Fibrobacteres"    "p__GN02"            
# [16] "p__Tenericutes"      "p__TM6"              "p__TM7"  



cols.phlya.16S <-c(
  # https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
  
  "Acidobacteria" ='#e6194b' , "Actinobacteria"='#3cb44b' , "AD3"='#bcf60c' , "Bacteroidetes" ='#4363d8', 
  "Chloroflexi"= '#ffe119', "Firmicutes"= '#fabebe', "Gemmatimonadetes"='#46f0f0' , "Planctomycetes"='#f032e6' , 
  "Proteobacteria" ='#f58231', "Verrucomicrobia" ='#911eb4', "WPS-2" ='#008080', 
  "Cyanobacteria"='#9a6324' , "Deferribacteres"='#fffac8' , "Fibrobacteres"='#800000' , "GN02"='#aaffc3' ,
  "Tenericutes"='#808000' , "TM6" ='#ffd8b1', "TM7"='#000075', "Other minor phyla"  = '#a9a9a9'    
)


#------------------------





#### Fecal alpha diversity comparisons
#### Includes comparison by Sex
#### Fecal Week 0, Week 7, Difference between Week 7-Week 0
#### Keep all samples except when tracking individual change Week 0 > Week 7
#------------------------

fecal.w0w7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 107 samples ]
# sample_data() Sample Data:       [ 107 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]


table(fecal.w0w7.dc.16s@sam_data$Time)
# Week 0 Week 7 
#     54     53

table(fecal.w0w7.dc.16s@sam_data$mouseID)
# C1m1 C1m2 C1m3 C2m1 C2m2 C2m3 C3m1 C3m2 C3m3 C4m1 C4m2 C4m3 C5m1 C5m2 C5m3 C6m1 C6m2 C6m3 H1m1 H1m2 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
# H1m3 H2m1 H2m2 H2m3 H3m1 H3m2 H3m3 H4m1 H4m2 H4m3 H5m1 H5m2 H5m3 H6m1 H6m2 H6m3 L1m1 L1m2 L1m3 L2m1 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    1    2    2 
# L2m2 L2m3 L3m1 L3m2 L3m3 L4m1 L4m2 L4m3 L5m1 L5m2 L5m3 L6m1 L6m2 L6m3 
# 2    2    2    2    2    2    2    2    2    2    2    2    2    2


phy_obj <- fecal.w0w7.dc.16s

phy_obj
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 800 taxa and 107 samples ]
# sample_data() Sample Data:       [ 107 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 800 taxa by 7 taxonomic ranks ]


min(sample_sums(phy_obj)) # 11450
min(taxa_sums(phy_obj)) # 1


table(phy_obj@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 6  6  6  6  6  6  6  6  6  6  6  6  5  6  6  6  6  6 

table(phy_obj@sam_data$Treatment)
# Control    High     Low 
#     36      36      35

table(phy_obj@sam_data$Time)
# Week 0 Week 7 
# 54     53 

## Use lowest number of reads to create rarefied dataset
seed <- 123
rare.phy_obj <- rarefy_even_depth(phy_obj, sample.size = min(sample_sums(phy_obj)),
                                  rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)
sample_sums(rare.phy_obj) # all 11450
ntaxa(rare.phy_obj) # 658
nsamples(rare.phy_obj) # 107



shan <- plot_richness(rare.phy_obj, measures=c("Shannon"))
str(shan)

out <- data.frame(sample=shan$data$sample,
                  samp_type=shan$data$samp_type,
                  mouseID=shan$data$mouseID,
                  Treatment=shan$data$Treatment,
                  Time=shan$data$Time,
                  Sex=shan$data$Sex,
                  shanH=shan$data$value)
out$eff_no <- exp(out$shanH)

str(out)
out$Treatment <- factor(out$Treatment, levels = c("Control", "Low", "High"), ordered = TRUE)
out$Sex <- factor(out$Sex, levels = c("female","male"), labels = c("Female","Male"))
out$mouseID <- as.character(out$mouseID)

str(out)
out$Time <- factor(out$Time, levels=c("Week 0","Week 7"), labels = c("Week 0","Week 7"))


p <- ggplot(data=out, aes(x = Treatment, y = eff_no)) +
  geom_boxplot() + 
  facet_grid(~Time + Sex)

p


## tidy plot # # # # # # # # #

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
set.seed(123)

# don't colour outliers
bp <- boxplot(out$eff_no ~ out$Treatment + out$Time + out$Sex)
bp$out #  23.31338 43.85028 15.94216 20.03916 16.81255

sel <- which(out$eff_no %in% bp$out)
out$eff_no[sel] # 23.31338 16.81255 43.85028 15.94216 20.03916
out[sel,]
#     sample samp_type mouseID Treatment   Time    Sex    shanH   eff_no
# 15 C5m3T01     fecal    C5m3   Control Week 1 Female 3.149027 23.31338
# 27 L4m2T01     fecal    L4m2       Low Week 1   Male 2.822126 16.81255
# 42 C4m1T01     fecal    C4m1   Control Week 1 Female 3.780781 43.85028
# 44 H4m3T01     fecal    H4m3      High Week 1 Female 2.768967 15.94216
# 49 H4m3T16     fecal    H4m3      High Week 7 Female 2.997688 20.03916


## exclude outliers from Kruskal-Wallis significance testing
out_exout <- out[-sel, ]

sel <- which(out_exout$Time=="Week 0")
wk0 <- out_exout[sel, ]

sel <- which(out_exout$Time=="Week 7")
wk7 <- out_exout[sel, ]


# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk0) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 1.3478, df = 2, p-value = 0.5097

# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk7) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 2.311, df = 2, p-value = 0.3149

# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk0[which(wk0$Sex=="Female"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 3.3884, df = 2, p-value = 0.1837


# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk0[which(wk0$Sex=="Male"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 4.0085, df = 2, p-value = 0.1348

# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk7[which(wk7$Sex=="Female"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 5.5777, df = 2, p-value = 0.06149

table( wk7[which(wk7$Sex=="Female"), ]$Treatment )
# Control     Low    High 
#       9       8       8


# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, wk7[which(wk7$Sex=="Male"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 0.20106, df = 2, p-value = 0.9044





## add  geom_signif() 
## see this example - https://www.rdocumentation.org/packages/ggsignif/versions/0.5.0

annotation_df <- data.frame(
  Time = c("Week 7"),
  Sex = c("Female"),
  start=c("Control"),
  end = c("High"),
  y = c(61),
  label = c("P = 0.06"))


set.seed(123)
p <- ggplot(data=out, aes(x = Treatment, y = eff_no)) +
  geom_boxplot(outlier.size = 1) + 
  geom_signif(data=annotation_df,
              aes(xmin=start, xmax=end, annotations=label, y_position=y),
              textsize = 2.5, vjust = -0.2, tip_length = 0, margin_top = 0,
              manual=TRUE) +
  facet_grid(~Time+Sex) +
  geom_jitter(data = out_exout, aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Effective OTUs (count)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 2,r = 2,b = 2,l = 2,"pt"))
    
  )

p


dev.print(tiff, file = paste0("plots/","Alpha-diversity-Fecal-Week0-Week7-bySex-vFINAL.tiff"), width = 14, height = 6, units = "cm", res=600, compression="lzw")





# # # # # # # # #


## calculate difference in alpha diversity: Week 0 > Week 7

## Need to identify which mouse cannot be traced from Week 0 > Week 7

# note from above, mouseID: L1m2 only has one entry

sel <- which(out$mouseID == "L1m2") # 57  so exclude this when examine differences


diff <- data.frame(mouseID = unique(as.character(out$mouseID[-sel])), w0 = NA, w7 = NA, diff_eff_no = NA, Treatment = NA, Sex=NA, stringsAsFactors = FALSE)
str(diff)

for (m in 1:length(diff$mouseID)) {
  # m<-1
  sel.out.w0 <- which(out$mouseID == diff$mouseID[m] & out$Time == "Week 0")
  sel.out.w7 <- which(out$mouseID == diff$mouseID[m] & out$Time == "Week 7")
  
  diff$w0[m] <- out$eff_no[sel.out.w0]
  diff$w7[m] <- out$eff_no[sel.out.w7]
  
  diff$diff_eff_no[m] <- out$eff_no[sel.out.w7] - out$eff_no[sel.out.w0]
  diff$Treatment[m] <- unique( c( as.character(out$Treatment[sel.out.w7]), as.character(out$Treatment[sel.out.w0]) )) 
  diff$Sex[m] <- unique( c( as.character(out$Sex[sel.out.w7]), as.character(out$Sex[sel.out.w0]) )) 
  print(paste0("calculating mouse # ", m, " = ",diff$mouseID[m], ": ",out$mouseID[sel.out.w7]," : ", out$mouseID[sel.out.w0]))
  
}

str(diff)
diff$Treatment <- factor(diff$Treatment, levels = c("Control", "Low", "High"), ordered = TRUE)
diff$Sex <- factor(diff$Sex)
str(diff)
# 'data.frame':	53 obs. of  6 variables:
# $ mouseID    : chr  "L3m3" "L1m3" "L5m3" "H3m1" ...
# $ w1         : num  39.1 31.3 32.6 40.8 16.2 ...
# $ w7         : num  47.8 55.1 26 54.7 57.6 ...
# $ diff_eff_no: num  8.69 23.85 -6.6 13.89 41.38 ...
# $ Treatment  : Ord.factor w/ 3 levels "Control"<"Low"<..: 2 2 2 3 3 3 3 1 1 3 ...
# $ Sex        : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 2 2 2 1 2 ...

diff

p <- ggplot(data=diff, aes(x = Treatment, y = diff_eff_no)) +
  geom_boxplot() +
  facet_grid(~Sex)

p



## tidy plot # # # # # # # # #

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue
set.seed(123)

# don't colour outliers
bp <- boxplot(diff$diff_eff_no ~ diff$Treatment + diff$Sex)
bp$out #  41.18108 56.73688

sel <- which(diff$diff_eff_no %in% bp$out)
diff$diff_eff_no[sel] # 41.18108 56.73688
diff[sel,]
#    mouseID        w1       w7  diff_eff_no Treatment  Sex
# 8     C1m3 12.473278 53.65436 41.181084388   Control Male
# 39    C1m2  4.414451 59.94410 55.529652716   Control Male


diff_exout <- diff[-sel, ]

# Kruskal-Wallis test
kt <- kruskal.test( diff_eff_no ~ Treatment, diff_exout[which(diff_exout$Sex=="Female"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  diff_eff_no by Treatment
# Kruskal-Wallis chi-squared = 1.382, df = 2, p-value = 0.5011

# Kruskal-Wallis test
kt <- kruskal.test( diff_eff_no ~ Treatment, diff_exout[which(diff_exout$Sex=="Male"), ]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  diff_eff_no by Treatment
# Kruskal-Wallis chi-squared = 1.1499, df = 2, p-value = 0.5627



set.seed(123)
p <- ggplot(data=diff, aes(x = Treatment, y = diff_eff_no)) +
  geom_boxplot(outlier.size = 1) + 
  geom_jitter(data = diff[-sel, ], aes(colour = Treatment), width=0.3, height=0, size=1) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  labs(y="Change in effective OTUs (count)", x=NULL) +
  geom_hline(yintercept = 0, col="red", linetype="dashed") +
  facet_grid(~Sex) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank")
    
  )

p


dev.print(tiff, file = paste0("plots/","Alpha-diversity-Fecal-Difference-Week0-to-Week7-bySex-vFINAL.tiff"), width = 8.3, height = 6, units = "cm", res=600, compression="lzw")

#------------------------





#### Cecal only - alpha diversity comparisons - Rarefied
#### Includes comparison by Sex
#------------------------

cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 550 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 550 taxa by 7 taxonomic ranks ]

table(cecal.dc.16s@sam_data$Cage.Name)
# C1 C2 C3 C4 C5 C6 H1 H2 H3 H4 H5 H6 L1 L2 L3 L4 L5 L6 
# 3  3  3  3  3  3  3  3  3  3  3  3  3  2  3  3  3  3 

table( cecal.dc.16s@sam_data$Treatment )
# Control    High     Low 
#      18      18      17  

table( cecal.dc.16s@sam_data$Time )
# Post-exposure 
# 53 




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.cecal.dc.16s <- rarefy_even_depth(cecal.dc.16s, sample.size = min(sample_sums(cecal.dc.16s)),
                                       rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.cecal.dc.16s) # all 14632
ntaxa(rare.cecal.dc.16s) # 496
nsamples(rare.cecal.dc.16s) # 53


shan <- plot_richness(rare.cecal.dc.16s, measures=c("Shannon"))
str(shan)

out <- data.frame(sample=shan$data$sample,
                  samp_type=shan$data$samp_type,
                  mouseID=shan$data$mouseID,
                  Treatment=shan$data$Treatment,
                  Time=shan$data$Time,
                  Sex=shan$data$Sex,
                  shanH=shan$data$value)
out$eff_no <- exp(out$shanH)

str(out)
out$Treatment <- factor(out$Treatment, levels = c("Control", "Low", "High"), ordered = TRUE)
out$Sex <- factor(out$Sex, levels = c("female", "male"), labels=c("Female","Male"))

p <- ggplot(data=out, aes(x = Treatment, y = eff_no)) +
  geom_boxplot() +
  facet_grid(~Sex)

p

# # # # # # # # # # # # #


## exclude outlier for significance test
bp <- boxplot(out$eff_no ~ out$Treatment + out$Sex)
bp$out #    15.73833

sel <- which(out$eff_no %in% bp$out)
out$eff_no[sel] #  15.73833
out_exout<-out[-sel,]



# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, out_exout[which(out_exout$Sex=="Female") ,]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 5.2925, df = 2, p-value = 0.07092

table( out_exout[which(out_exout$Sex=="Female") ,]$Treatment )
# Control     Low    High 
#       8       9       9 


# Kruskal-Wallis test
kt <- kruskal.test( eff_no ~ Treatment, out_exout[which(out_exout$Sex=="Male") ,]) # Kruskal Wallis test
kt
# Kruskal-Wallis rank sum test
# data:  eff_no by Treatment
# Kruskal-Wallis chi-squared = 2.4444, df = 2, p-value = 0.2946




## tidy plot # # # # # # # # #

cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ) # orange, aqua, dark-blue

annotation_df <- data.frame(
  Sex = c("Female"),
  start=c("Control"),
  end = c("High"),
  y = c(43),
  label = c("P = 0.07"))


set.seed(4321) # for jitter
p <- ggplot(data=out, aes(x = Treatment, y = eff_no)) +
  geom_boxplot(outlier.size = 1) + 
  geom_signif(data=annotation_df,
              aes(xmin=start, xmax=end, annotations=label, y_position=y),
              textsize = 2.5, vjust = -0.2, tip_length = 0, margin_top = 0,
              manual=TRUE) +
  geom_jitter(data = out_exout, aes(colour = Treatment), width=0.3, height=0, size =1 ) +
  scale_color_manual(values = cols, guide = FALSE) +
  theme_classic() +
  facet_grid(~Sex) +
  labs(y="Effective OTUs (count)", x=NULL) +
  theme(
    plot.margin = margin(t = 2, r = 2, b = 2, l = 5, "pt"),
    axis.ticks.length=unit(-3, "pt"),
    axis.ticks.x.bottom = element_blank(),
    axis.text.x = element_text(margin=margin(t = 5,r = 5,b = 2,l = 5,"pt"), size = rel(0.9)),  # , size = rel(0.9))
    axis.text.y = element_text(margin=margin(t = 5,r = 5,b = 5,l = 2,"pt"), size = rel(0.8)),
    axis.title.y = element_text(margin=margin(t = 5,r = 2,b = 5,l = 2,"pt"), size = rel(0.8)),
    strip.background = element_rect(fill="white", linetype = "blank"),
    strip.text = element_text(margin=margin(t = 2,r = 2,b = 2,l = 2,"pt"))
  ) #+


p

ggsave(plot=p, filename = paste0("plots/","Alpha-diversity-Cecal-vFINAL.tiff"), width = 8.3, height = 6, units = "cm", dpi = 600, compression = "lzw")

# # # # # # # # # # # # # 

#------------------------





#### Compare FECAL microbiota ordination and composition of
#### Top third vs. lowest third Open Field centre time - FEMALE only
#### Ordination
#------------------------

fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 645 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 645 taxa by 7 taxonomic ranks ]

str(of.post)
sel <- which(of.post.COPY$Sex=="Female")
beh_fem <- of.post.COPY[sel, ]
quantile(beh_fem$Centre_time, probs = c(1/3, 2/3) )
# 33.33333% 66.66667% 
#   26.56667  51.76667 

beh_fem$group <- NA 
sel <- which(beh_fem$Centre_time <= quantile(beh_fem$Centre_time, probs = c(1/3) ))
beh_fem$group[sel] <- "anxious"

sel <- which(beh_fem$Centre_time >= quantile(beh_fem$Centre_time, probs = c(2/3) ))
beh_fem$group[sel] <- "less anxious"

# remove middle third
ok <- complete.cases(beh_fem)
sel <- which(ok==TRUE)
beh_fem_groups <- beh_fem[sel, ]

dim(beh_fem_groups) # 18  39

beh_fem_groups[ ,c("ID","Treatment","Centre_time","group")]
#      ID Treatment Centre_time        group
# 4  C2m1   Control        10.4      anxious
# 5  C2m2   Control       162.5 less anxious
# 6  C2m3   Control        88.7 less anxious
# 10 C4m1   Control        12.1      anxious
# 11 C4m2   Control         3.3      anxious
# 13 C5m1   Control        10.6      anxious
# 15 C5m3   Control        25.5      anxious
# 27 H3m3      High        61.4 less anxious
# 28 H4m1      High        22.6      anxious
# 29 H4m2      High        90.0 less anxious
# 34 H6m1      High        97.6 less anxious
# 36 H6m3      High        57.7 less anxious
# 38 L1m2       Low        82.5 less anxious  # note: week 7 fecal sample for mouse L1m2 is not available
# 43 L3m1       Low        13.0      anxious
# 44 L3m2       Low       126.8 less anxious
# 45 L3m3       Low        20.4      anxious
# 50 L5m2       Low        18.5      anxious
# 51 L5m3       Low       113.0 less anxious



females <- as.character(beh_fem_groups$ID)


fem.fecalw7.dc.16s <- prune_samples(fecalw7.dc.16s@sam_data$mouseID %in% females, fecalw7.dc.16s)
fem.fecalw7.dc.16s

min(sample_sums(fem.fecalw7.dc.16s)) #  17606
min(taxa_sums(fem.fecalw7.dc.16s)) # 0

# prune taxa that have zero sequence reads
fem.fecalw7.dc.16s <- prune_taxa( taxa = taxa_sums(fem.fecalw7.dc.16s) > 0, x = fem.fecalw7.dc.16s )
fem.fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 562 taxa and 17 samples ]
# sample_data() Sample Data:       [ 17 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 562 taxa by 7 taxonomic ranks ]

table( fem.fecalw7.dc.16s@sam_data$Treatment )
# Control    High     Low 
#       7       5       5  




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.fem.fecalw7.dc.16s <- rarefy_even_depth(fem.fecalw7.dc.16s, sample.size = min(sample_sums(fem.fecalw7.dc.16s)),
                                         rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.fem.fecalw7.dc.16s) # all 17606
ntaxa(rare.fem.fecalw7.dc.16s) # 538
nsamples(rare.fem.fecalw7.dc.16s) # 17


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.fem.fecalw7.dc.16s, "NMDS", "bray")

ord
# Call:
#   metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.113773 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on 'wisconsin(sqrt(veganifyOTU(physeq)))'  

str(ord)

rare.fem.fecalw7.dc.16s@sam_data$samp_type # all "fecal"
rare.fem.fecalw7.dc.16s@sam_data$Time      # all "Week 7"


## join to group: anxious vs less anxious
dat <- rare.fem.fecalw7.dc.16s@sam_data
dat <- data.frame(dat)
class(dat)
head(dat)
# join data
dat <- merge(x = dat, y = beh_fem_groups[ ,c("ID","group")], by.x= "mouseID", by.y= "ID", all.x = TRUE )
row.names(dat) <- dat$sample
# re-assign joined data to phyloseq object
sample_data(rare.fem.fecalw7.dc.16s) <- dat

p <- plot_ordination(rare.fem.fecalw7.dc.16s, ord, type="samples", color="group", shape="Cage.Name")
p
str(p)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
p$data$Sex
#p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious") )
p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious"), labels =  c("1/3 most\nanxious", "1/3 least\nanxious") )

# cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
#           "anxious" = "#ffcc00", "less anxious" = "#3399ff") 
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
          "1/3 most\nanxious" = "#ffcc00", "1/3 least\nanxious" = "#3399ff")

shapes <- c(1:18)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  #geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  geom_point(aes(colour = group)) +
  annotate(geom="text", x= 0.43, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) ) # +

pp


## add centroids
#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot
x <- p$data$NMDS1
y <- p$data$NMDS2
group <- p$data$group
df <- data.frame(group, x, y)
centroids <- aggregate(cbind(x,y)~group,df,mean)
f         <- function(z) {sd(z)/sqrt(length(z))} # function to calculate std.err
se        <- aggregate(cbind(se.x=x,se.y=y)~group,df,f)
centroids <- merge(centroids,se, by="group")    # add std.err column to centroids
str(centroids)




## plot for publication


## https://stackoverflow.com/questions/47516448/how-to-get-ordispider-like-clusters-in-ggplot-with-nmds

# NMDS scores
scrs <- data.frame(x=p$data$NMDS1, y=p$data$NMDS2, group=p$data$group)

names(centroids) # "group" "x"     "y"     "se.x"  "se.y"

# segments
segs <- merge(scrs, centroids[ ,c("group", "x", "y") ], by = 'group', sort = FALSE)



#pp <- ggplot(data = p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = group) ) + 
pp <- ggplot() + 
    theme_bw() + #
  #ggtitle("a") +
  #geom_point(size = 1.5) +
  scale_colour_manual(values = cols, name = "Group") +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  #scale_fill_manual(values = cols) +
  
  #geom_point(data=centroids, aes(x=x, y=y, colour=group), shape=1, size=4, stroke=0.65, inherit.aes = FALSE, show.legend = FALSE) + 
  geom_segment(data = segs,
               mapping = aes(x=x.x, xend = x.y, y=y.x, yend = y.y, colour = group), size = 0.5, alpha=0.5) + # spiders
  geom_point(data = centroids, aes(x=x, y=y, colour=group), size = 3, alpha=0.5, inherit.aes = FALSE, show.legend = FALSE) +   # centroids
  
  geom_point(data = p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = group), size = 1.5) + # points
  
  guides( shape = guide_legend(ncol = 2) ) +
  #annotate(geom="text", x= 0.41, y= -0.44, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=0, size=2.2 ) +
  annotate(geom="text", x= 0.41, y= -0.44, label = paste0("Females\nStress = ",round(ord$stress,4)), hjust=1, vjust=0, size=2.2 ) +
  
  geom_text_repel(data = p$data, aes(x=NMDS1, y=NMDS2, label = mouseID), colour = "#737373" , size=2) +  # light grey:  "#999999"
  labs(x = NULL, y = NULL) +
  theme(
    
    #plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    plot.margin = margin(t = 2, r = 1, b = 2, l = 1, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.55)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    #legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    legend.box.spacing = unit(0, "pt"),
    legend.box.margin = margin(t = 0,r = 2,b = 0,l = 2,"pt"),
    legend.spacing = unit(4, "pt"),
    
    #legend.justification=c(0,0), 
    #legend.position=c(0.05, 0.05),
    #legend.background = element_blank(),
    legend.key = element_blank(),
    legend.key.size = unit(2, "pt")
    
  )

pp

grid.text(label = "A" , x = unit(0.06, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )


dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week7-Female-top-vs-low-anxiety-1xRarefy-withSpiders-vFINAL.tiff"), width = 8.7, height = 6, units = "cm", res=600, compression="lzw")








# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.fem.fecalw7.dc.16s <- phyloseq::distance(rare.fem.fecalw7.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.fem.fecalw7.dc.16s))
str(sampledf)

## Refer to PERMANOVA test below with both anxiety grouping and cage
# # Adonis test
# set.seed(123)
# adonis(bray.rare.fem.fecalw7.dc.16s ~ group, data = sampledf)
# # Call:
# # adonis(formula = bray.rare.fem.fecalw7.dc.16s ~ group, data = sampledf) 
# # 
# # Permutation: free
# # Number of permutations: 999
# # 
# # Terms added sequentially (first to last)
# # 
# #           Df SumsOfSqs MeanSqs F.Model     R2 Pr(>F)  
# # group      1   0.25285 0.25285  2.3832 0.1371  0.044 *
# # Residuals 15   1.59148 0.10610         0.8629         
# # Total     16   1.84433                 1.0000         
# # ---
# #   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


beta <- betadisper(bray.rare.fem.fecalw7.dc.16s, sampledf$group)
set.seed(123)
permutest(beta)
# Homogeneity of dispersion test
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
#           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     1 0.00439 0.0043897 0.5157    999  0.487
# Residuals 15 0.12768 0.0085120 



# Adonis test
set.seed(123)
adonis(bray.rare.fem.fecalw7.dc.16s ~ group + Cage.Name, data = sampledf)
# Call:
# adonis(formula = bray.rare.fem.fecalw7.dc.16s ~ group + Cage.Name,      data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)  
# group      1   0.25285 0.25285  3.3758 0.13710  0.017 *
# Cage.Name  7   0.99228 0.14175  1.8926 0.53802  0.018 *
# Residuals  8   0.59920 0.07490         0.32489         
# Total     16   1.84433                 1.00000         
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


table( rare.fem.fecalw7.dc.16s@sam_data$group )
# anxious less anxious 
#       9            8

table( rare.fem.fecalw7.dc.16s@sam_data$Cage.Name )
# C2 C4 C5 H3 H4 H6 L3 L5 
#  3  2  2  1  2  2  3  2 



#
#------------------------





#### Compare CECAL microbiota ordination and composition of
#### Top third vs. lowest third Open Field centre time - FEMALE only
#### Ordination
#------------------------

cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 550 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 550 taxa by 7 taxonomic ranks ]

str(of.post)
sel <- which(of.post.COPY$Sex=="Female")
beh_fem <- of.post.COPY[sel, ]
quantile(beh_fem$Centre_time, probs = c(1/3, 2/3) )
# 33.33333% 66.66667% 
#   26.56667  51.76667 

beh_fem$group <- NA 
sel <- which(beh_fem$Centre_time <= quantile(beh_fem$Centre_time, probs = c(1/3) ))
beh_fem$group[sel] <- "anxious"

sel <- which(beh_fem$Centre_time >= quantile(beh_fem$Centre_time, probs = c(2/3) ))
beh_fem$group[sel] <- "less anxious"

# remove middle third
ok <- complete.cases(beh_fem)
sel <- which(ok==TRUE)
beh_fem_groups <- beh_fem[sel, ]

dim(beh_fem_groups) # 18  39

beh_fem_groups[ ,c("ID","Treatment","Centre_time","group")]
#      ID Treatment Centre_time        group
# 4  C2m1   Control        10.4      anxious
# 5  C2m2   Control       162.5 less anxious
# 6  C2m3   Control        88.7 less anxious
# 10 C4m1   Control        12.1      anxious
# 11 C4m2   Control         3.3      anxious
# 13 C5m1   Control        10.6      anxious
# 15 C5m3   Control        25.5      anxious
# 27 H3m3      High        61.4 less anxious
# 28 H4m1      High        22.6      anxious
# 29 H4m2      High        90.0 less anxious
# 34 H6m1      High        97.6 less anxious
# 36 H6m3      High        57.7 less anxious
# 38 L1m2       Low        82.5 less anxious
# 43 L3m1       Low        13.0      anxious
# 44 L3m2       Low       126.8 less anxious
# 45 L3m3       Low        20.4      anxious
# 50 L5m2       Low        18.5      anxious
# 51 L5m3       Low       113.0 less anxious



females <- as.character(beh_fem_groups$ID)


fem.cecal.dc.16s <- prune_samples(cecal.dc.16s@sam_data$mouseID %in% females, cecal.dc.16s)
fem.cecal.dc.16s

min(sample_sums(fem.cecal.dc.16s)) #  14632
min(taxa_sums(fem.cecal.dc.16s)) # 0

# prune taxa that have zero sequence reads
fem.cecal.dc.16s <- prune_taxa( taxa = taxa_sums(fem.cecal.dc.16s) > 0, x = fem.cecal.dc.16s )
fem.cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 494 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 494 taxa by 7 taxonomic ranks ]

table( fem.cecal.dc.16s@sam_data$Treatment )
# Control    High     Low 
#       7       5       6  




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.fem.cecal.dc.16s <- rarefy_even_depth(fem.cecal.dc.16s, sample.size = min(sample_sums(fem.cecal.dc.16s)),
                                             rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.fem.cecal.dc.16s) # all 14632 
ntaxa(rare.fem.cecal.dc.16s) # 431
nsamples(rare.fem.cecal.dc.16s) # 18


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.fem.cecal.dc.16s, "NMDS", "bray")

ord
# Call:
#   metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.1208276 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on �wisconsin(sqrt(veganifyOTU(physeq)))� 

str(ord)

rare.fem.cecal.dc.16s@sam_data$samp_type # all "cecal"
rare.fem.cecal.dc.16s@sam_data$Time      # all "Post-exposure"


## join to group: anxious vs less anxious
dat <- rare.fem.cecal.dc.16s@sam_data
dat <- data.frame(dat)
class(dat)
head(dat)
# join data
dat <- merge(x = dat, y = beh_fem_groups[ ,c("ID","group")], by.x= "mouseID", by.y= "ID", all.x = TRUE )
row.names(dat) <- dat$sample
# re-assign joined data to phyloseq object
sample_data(rare.fem.cecal.dc.16s) <- dat

p <- plot_ordination(rare.fem.cecal.dc.16s, ord, type="samples", color="group", shape="Cage.Name")
p
str(p)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
p$data$Sex
#p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious") )
p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious"), labels =  c("1/3 most\nanxious", "1/3 least\nanxious") )

# cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
#           "anxious" = "#ffcc00", "less anxious" = "#3399ff") 
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
          "1/3 most\nanxious" = "#ffcc00", "1/3 least\nanxious" = "#3399ff") 

shapes <- c(1:18)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  #geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  geom_point(aes(colour = group)) +
  annotate(geom="text", x= 0.43, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) ) # +

pp






# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.fem.cecal.dc.16s <- phyloseq::distance(rare.fem.cecal.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.fem.cecal.dc.16s))
str(sampledf)


beta <- betadisper(bray.rare.fem.cecal.dc.16s, sampledf$group)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
# Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     1 0.007326 0.0073258 1.3935    999  0.257
# Residuals 16 0.084114 0.0052571  



# Adonis test
set.seed(123)
adonis(bray.rare.fem.cecal.dc.16s ~ group + Cage.Name, data = sampledf)
# Call:
#   adonis(formula = bray.rare.fem.cecal.dc.16s ~ group + Cage.Name,      data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
# Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)   
# group      1   0.09869 0.098687  1.9994 0.06973  0.064 . 
# Cage.Name  8   0.92171 0.115213  2.3343 0.65127  0.004 **
#   Residuals  8   0.39486 0.049358         0.27900          
# Total     17   1.41525                  1.00000          
# ---
#   Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1


table( rare.fem.cecal.dc.16s@sam_data$group )
# anxious less anxious 
#       9            9

table( rare.fem.cecal.dc.16s@sam_data$Cage.Name )
# C2 C4 C5 H3 H4 H6 L1 L3 L5 
# 3  2  2  1  2  2  1  3  2 


#
#------------------------





#### Compare FECAL microbiota ordination and composition of
#### Top third vs. lowest third Open Field centre time - MALE only
#### Ordination
#------------------------

fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 645 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 645 taxa by 7 taxonomic ranks ]

str(of.post)
sel <- which(of.post.COPY$Sex=="Male")
beh_male <- of.post.COPY[sel, ]
quantile(beh_male$Centre_time, probs = c(1/3, 2/3) )
# 33.33333% 66.66667% 
# 25.70000  69.06667 

beh_male$group <- NA 
sel <- which(beh_male$Centre_time <= quantile(beh_male$Centre_time, probs = c(1/3) ))
beh_male$group[sel] <- "anxious"

sel <- which(beh_male$Centre_time >= quantile(beh_male$Centre_time, probs = c(2/3) ))
beh_male$group[sel] <- "less anxious"

# remove middle third
ok <- complete.cases(beh_male)
sel <- which(ok==TRUE)
beh_male_groups <- beh_male[sel, ]

dim(beh_male_groups) # 19  39

beh_male_groups[ ,c("ID","Treatment","Centre_time","group")]
#      ID Treatment Centre_time        group
# 1  C1m1   Control        13.2      anxious
# 2  C1m2   Control       128.8 less anxious
# 3  C1m3   Control        71.6 less anxious
# 8  C3m2   Control       208.3 less anxious
# 16 C6m1   Control        86.8 less anxious
# 18 C6m3   Control        21.8      anxious
# 19 H1m1      High        21.2      anxious
# 20 H1m2      High        25.7      anxious
# 22 H2m1      High        13.8      anxious
# 23 H2m2      High       289.9 less anxious
# 24 H2m3      High        70.8 less anxious
# 32 H5m2      High        89.3 less anxious
# 33 H5m3      High       279.0 less anxious
# 40 L2m1       Low        13.2      anxious
# 41 L2m2       Low         9.0      anxious
# 42 L2m3       Low        92.0 less anxious
# 46 L4m1       Low        11.4      anxious
# 47 L4m2       Low        23.2      anxious
# 53 L6m2       Low        25.7      anxious



males <- as.character(beh_male_groups$ID)


male.fecalw7.dc.16s <- prune_samples(fecalw7.dc.16s@sam_data$mouseID %in% males, fecalw7.dc.16s)
male.fecalw7.dc.16s

min(sample_sums(male.fecalw7.dc.16s)) #  19385
min(taxa_sums(male.fecalw7.dc.16s)) # 0

# prune taxa that have zero sequence reads
male.fecalw7.dc.16s <- prune_taxa( taxa = taxa_sums(male.fecalw7.dc.16s) > 0, x = male.fecalw7.dc.16s )
male.fecalw7.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 580 taxa and 19 samples ]
# sample_data() Sample Data:       [ 19 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 580 taxa by 7 taxonomic ranks ]

table( male.fecalw7.dc.16s@sam_data$Treatment )
# Control    High     Low 
#       6       7       6




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.male.fecalw7.dc.16s <- rarefy_even_depth(male.fecalw7.dc.16s, sample.size = min(sample_sums(male.fecalw7.dc.16s)),
                                             rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.male.fecalw7.dc.16s) # all 19385
ntaxa(rare.male.fecalw7.dc.16s) # 535
nsamples(rare.male.fecalw7.dc.16s) # 19


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.male.fecalw7.dc.16s, "NMDS", "bray")

ord
# Call:
# metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     0.1110704 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on �wisconsin(sqrt(veganifyOTU(physeq)))�


str(ord)

rare.male.fecalw7.dc.16s@sam_data$samp_type # all "fecal"
rare.male.fecalw7.dc.16s@sam_data$Time      # all "Week 7"


## join to group: anxious vs less anxious
dat <- rare.male.fecalw7.dc.16s@sam_data
dat <- data.frame(dat)
class(dat)
head(dat)
# join data
dat <- merge(x = dat, y = beh_male_groups[ ,c("ID","group")], by.x= "mouseID", by.y= "ID", all.x = TRUE )
row.names(dat) <- dat$sample
# re-assign joined data to phyloseq object
sample_data(rare.male.fecalw7.dc.16s) <- dat

p <- plot_ordination(rare.male.fecalw7.dc.16s, ord, type="samples", color="group", shape="Cage.Name")
p
str(p)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
p$data$Sex
#p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious") )
p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious"), labels =  c("1/3 most\nanxious", "1/3 least\nanxious") )

# cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
#           "anxious" = "#ffcc00", "less anxious" = "#3399ff") 
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
          "1/3 most\nanxious" = "#ffcc00", "1/3 least\nanxious" = "#3399ff")

shapes <- c(1:18)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  #geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  geom_point(aes(colour = group)) +
  annotate(geom="text", x= 0.43, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) ) # +

pp


## add centroids
#https://stackoverflow.com/questions/23463324/r-add-centroids-to-scatter-plot
x <- p$data$NMDS1
y <- p$data$NMDS2
group <- p$data$group
df <- data.frame(group, x, y)
centroids <- aggregate(cbind(x,y)~group,df,mean)
f         <- function(z) {sd(z)/sqrt(length(z))} # function to calculate std.err
se        <- aggregate(cbind(se.x=x,se.y=y)~group,df,f)
centroids <- merge(centroids,se, by="group")    # add std.err column to centroids
str(centroids)



## plot for publication

## https://stackoverflow.com/questions/47516448/how-to-get-ordispider-like-clusters-in-ggplot-with-nmds

# NMDS scores
scrs <- data.frame(x=p$data$NMDS1, y=p$data$NMDS2, group=p$data$group)

names(centroids) # "group" "x"     "y"     "se.x"  "se.y"

# segments
segs <- merge(scrs, centroids[ ,c("group", "x", "y") ], by = 'group', sort = FALSE)




#pp <- ggplot(data = p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = group) ) + 
pp <- ggplot() + 
  theme_bw() + #
  #ggtitle("a") +
  #geom_point(size = 1.5) +
  scale_colour_manual(values = cols, name = "Group") +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  #scale_fill_manual(values = cols) +
  
  #geom_point(data=centroids, aes(x=x, y=y, colour=group), shape=1, size=4, stroke=0.65, inherit.aes = FALSE, show.legend = FALSE) + 
  geom_segment(data = segs,
               mapping = aes(x=x.x, xend = x.y, y=y.x, yend = y.y, colour = group), size = 0.5, alpha=0.5) + # spiders
  geom_point(data = centroids, aes(x=x, y=y, colour=group), size = 3, alpha=0.5, inherit.aes = FALSE, show.legend = FALSE) +   # centroids
  
  geom_point(data = p$data, aes(x=NMDS1, y=NMDS2, shape = Cage.Name, colour = group), size = 1.5) + # points
  
  guides( shape = guide_legend(ncol = 2) ) +
  #annotate(geom="text", x= 0.41, y= -0.44, label = paste0("Stress = ",round(ord$stress,4)), hjust=1, vjust=0, size=2.2 ) +
  annotate(geom="text", x= 0.405, y= 0.195, label = paste0("Males\nStress = ",round(ord$stress,4)), hjust=1, vjust=0, size=2.2 ) +
  
  geom_text_repel(data = p$data, aes(x=NMDS1, y=NMDS2, label = mouseID), colour = "#737373" , size=2) +  # light grey:  "#999999"
  labs(x = NULL, y = NULL) +
  theme(
    
    #plot.margin = margin(t = 2, r = 5, b = 2, l = 2, "pt"),
    plot.margin = margin(t = 2, r = 1, b = 2, l = 1, "pt"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    
    legend.text = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.55)),
    legend.title = element_text(margin=margin(t = 0,r = 0,b = 0,l = 0,"pt"), size = rel(0.6)),
    #legend.margin = margin(t = 0,r = 0,b = 0,l = 0,"pt"),
    legend.margin = margin(t = 0,r = 0,b = 0,l = 2,"pt"),
    legend.box.spacing = unit(0, "pt"),
    legend.box.margin = margin(t = 0,r = 2,b = 0,l = 2,"pt"),
    legend.spacing = unit(4, "pt"),
    
    #legend.justification=c(0,0), 
    #legend.position=c(0.05, 0.05),
    #legend.background = element_blank(),
    legend.key = element_blank(),
    legend.key.size = unit(2, "pt")
    
  )

pp

grid.text(label = "B" , x = unit(0.06, "npc") , y = unit(0.93,"npc"), gp=gpar(fontsize=11, fontface="bold") )

dev.print(tiff, file = paste0("plots/","Ordination-Fecal-Week7-MALE-top-vs-low-anxiety-1xRarefy-withSpiders-vFINAL.tiff"), width = 8.7, height = 6, units = "cm", res=600, compression="lzw")








# Test hypothesis that microbiota vary (with different centroids) by groups

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.male.fecalw7.dc.16s <- phyloseq::distance(rare.male.fecalw7.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.male.fecalw7.dc.16s))
str(sampledf)

# # Refer to PERMANOVA test below with both anxiety grouping and cage
# # Adonis test
# set.seed(123)
# adonis(bray.rare.male.fecalw7.dc.16s ~ group, data = sampledf)
# # Call:
# #   adonis(formula = bray.rare.male.fecalw7.dc.16s ~ group, data = sampledf) 
# # 
# # Permutation: free
# # Number of permutations: 999
# # 
# # Terms added sequentially (first to last)
# # 
# #           Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
# # group      1   0.02614 0.026140 0.30869 0.01783  0.962
# # Residuals 17   1.43958 0.084681         0.98217       
# # Total     18   1.46572                  1.00000 


beta <- betadisper(bray.rare.male.fecalw7.dc.16s, sampledf$group)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
# Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     1 0.001607 0.0016068 0.2727    999  0.645
# Residuals 17 0.100173 0.0058925      



# Adonis test
set.seed(123)
adonis(bray.rare.male.fecalw7.dc.16s ~ group + Cage.Name, data = sampledf)
# Call:
#   adonis(formula = bray.rare.male.fecalw7.dc.16s ~ group + Cage.Name,      data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)   
# group      1   0.02614 0.026140 0.54212 0.01783  0.772   
# Cage.Name  8   1.00562 0.125703 2.60696 0.68609  0.002 **
# Residuals  9   0.43396 0.048218         0.29607          
# Total     18   1.46572                  1.00000          
# ---
# Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1


table( rare.male.fecalw7.dc.16s@sam_data$group )
# anxious less anxious 
#      10            9 

table( rare.male.fecalw7.dc.16s@sam_data$Cage.Name )
# C1 C3 C6 H1 H2 H5 L2 L4 L6 
#  3  1  2  2  3  2  3  2  1 


#
#------------------------




#### Compare CECAL microbiota ordination and composition of
#### Top third vs. lowest third Open Field centre time - MALE only
#### Ordination
#------------------------

cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 550 taxa and 53 samples ]
# sample_data() Sample Data:       [ 53 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 550 taxa by 7 taxonomic ranks ]

str(of.post)
sel <- which(of.post.COPY$Sex=="Male")
beh_male <- of.post.COPY[sel, ]
quantile(beh_male$Centre_time, probs = c(1/3, 2/3) )
# 33.33333% 66.66667% 
# 25.70000  69.06667 

beh_male$group <- NA 
sel <- which(beh_male$Centre_time <= quantile(beh_male$Centre_time, probs = c(1/3) ))
beh_male$group[sel] <- "anxious"

sel <- which(beh_male$Centre_time >= quantile(beh_male$Centre_time, probs = c(2/3) ))
beh_male$group[sel] <- "less anxious"

# remove middle third
ok <- complete.cases(beh_male)
sel <- which(ok==TRUE)
beh_male_groups <- beh_male[sel, ]

dim(beh_male_groups) # 19  39

beh_male_groups[ ,c("ID","Treatment","Centre_time","group")]
#      ID Treatment Centre_time        group
# 1  C1m1   Control        13.2      anxious
# 2  C1m2   Control       128.8 less anxious
# 3  C1m3   Control        71.6 less anxious
# 8  C3m2   Control       208.3 less anxious
# 16 C6m1   Control        86.8 less anxious
# 18 C6m3   Control        21.8      anxious
# 19 H1m1      High        21.2      anxious
# 20 H1m2      High        25.7      anxious
# 22 H2m1      High        13.8      anxious
# 23 H2m2      High       289.9 less anxious
# 24 H2m3      High        70.8 less anxious
# 32 H5m2      High        89.3 less anxious
# 33 H5m3      High       279.0 less anxious
# 40 L2m1       Low        13.2      anxious
# 41 L2m2       Low         9.0      anxious
# 42 L2m3       Low        92.0 less anxious   # this sample L2m3Ce failed sequencing
# 46 L4m1       Low        11.4      anxious
# 47 L4m2       Low        23.2      anxious
# 53 L6m2       Low        25.7      anxious


males <- as.character(beh_male_groups$ID)


male.cecal.dc.16s <- prune_samples(cecal.dc.16s@sam_data$mouseID %in% males, cecal.dc.16s)
male.cecal.dc.16s

min(sample_sums(male.cecal.dc.16s)) #  37094
min(taxa_sums(male.cecal.dc.16s)) # 0

# prune taxa that have zero sequence reads
male.cecal.dc.16s <- prune_taxa( taxa = taxa_sums(male.cecal.dc.16s) > 0, x = male.cecal.dc.16s )
male.cecal.dc.16s
# phyloseq-class experiment-level object
# otu_table()   OTU Table:         [ 502 taxa and 18 samples ]
# sample_data() Sample Data:       [ 18 samples by 20 sample variables ]
# tax_table()   Taxonomy Table:    [ 502 taxa by 7 taxonomic ranks ]

table( male.cecal.dc.16s@sam_data$Treatment )
# Control    High     Low 
#       6       7       5 




## Use lowest number of reads to create rarefied dataset

seed <- 123
rare.male.cecal.dc.16s <- rarefy_even_depth(male.cecal.dc.16s, sample.size = min(sample_sums(male.cecal.dc.16s)),
                                            rngseed = seed, replace = FALSE, trimOTUs = TRUE, verbose = TRUE)

sample_sums(rare.male.cecal.dc.16s) # all 37094
ntaxa(rare.male.cecal.dc.16s) # 475
nsamples(rare.male.cecal.dc.16s) # 18


### ORDINATION PLOT [finished-plot] # # # # # # # # # # # # # # # 
### NMDS + Bray-Curtis

set.seed(123)
ord <- ordinate(rare.male.cecal.dc.16s, "NMDS", "bray")

ord
# Call:
#   metaMDS(comm = veganifyOTU(physeq), distance = distance) 
# 
# global Multidimensional Scaling using monoMDS
# 
# Data:     wisconsin(sqrt(veganifyOTU(physeq))) 
# Distance: bray 
# 
# Dimensions: 2 
# Stress:     6.338062e-05 
# Stress type 1, weak ties
# Two convergent solutions found after 20 tries
# Scaling: centring, PC rotation, halfchange scaling 
# Species: expanded scores based on �wisconsin(sqrt(veganifyOTU(physeq)))� 


# Warning message:
# In metaMDS(veganifyOTU(physeq), distance, ...) :
# stress is (nearly) zero: you may have insufficient data


str(ord)

rare.male.cecal.dc.16s@sam_data$samp_type # all "cecal"
rare.male.cecal.dc.16s@sam_data$Time      # all "Post-exposure"


## join to group: anxious vs less anxious
dat <- rare.male.cecal.dc.16s@sam_data
dat <- data.frame(dat)
class(dat)
head(dat)
# join data
dat <- merge(x = dat, y = beh_male_groups[ ,c("ID","group")], by.x= "mouseID", by.y= "ID", all.x = TRUE )
row.names(dat) <- dat$sample
# re-assign joined data to phyloseq object
sample_data(rare.male.cecal.dc.16s) <- dat

p <- plot_ordination(rare.male.cecal.dc.16s, ord, type="samples", color="group", shape="Cage.Name")
p
str(p)
p$data$Cage.Name <- factor(p$data$Cage.Name, levels = c( paste0("C",1:6),paste0("L",1:6),paste0("H",1:6)  ), ordered = TRUE)
p$data$Sex
#p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious") )
p$data$group <- factor(p$data$group, levels = c("anxious" , "less anxious"), labels =  c("1/3 most\nanxious", "1/3 least\nanxious") )

# cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
#           "anxious" = "#ffcc00", "less anxious" = "#3399ff") 
cols <- c("Control" = "#f46d43", "Low" = "#66c2a5" , "High" = "#5e4fa2" ,  # orange, aqua, dark-blue
          "1/3 most\nanxious" = "#ffcc00", "1/3 least\nanxious" = "#3399ff") 

shapes <- c(1:18)


pp <- p + theme_bw() +
  scale_colour_manual(values = cols) +
  scale_shape_manual(values = shapes, name = "Enclosure") +
  scale_fill_manual(values = cols) +
  #geom_polygon(aes(fill = Treatment), alpha = 0.3) +
  geom_point(aes(colour = group)) +
  annotate(geom="text", x= 0.43, y= -0.3, label = paste0("Stress = ",round(ord$stress,5)), hjust=1, vjust=0, size=3.25 ) +
  guides(
    color = guide_legend(order = 0),
    shape = guide_legend(order = 1)
  ) +
  guides( shape = guide_legend(ncol = 2) ) # +

pp






# Test hypothesis that microbiota vary (with different centroids) by Treatment

# Calculate bray curtis distance matrix
set.seed(123)
bray.rare.male.cecal.dc.16s <- phyloseq::distance(rare.male.cecal.dc.16s, method = "bray")
sampledf <- data.frame(sample_data(rare.male.cecal.dc.16s))
str(sampledf)


beta <- betadisper(bray.rare.male.cecal.dc.16s, sampledf$group)
set.seed(123)
permutest(beta)
# Permutation test for homogeneity of multivariate dispersions
# Permutation: free
# Number of permutations: 999
# 
# Response: Distances
# Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
# Groups     1 0.000289 0.0002891 0.0431    999   0.84
# Residuals 16 0.107197 0.0066998    



# Adonis test
set.seed(123)
adonis(bray.rare.male.cecal.dc.16s ~ group + Cage.Name, data = sampledf)
# Call:
# adonis(formula = bray.rare.male.cecal.dc.16s ~ group + Cage.Name,      data = sampledf) 
# 
# Permutation: free
# Number of permutations: 999
# 
# Terms added sequentially (first to last)
# 
#           Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)    
# group      1   0.10414 0.104138  2.5673 0.08109  0.067 .  
# Cage.Name  8   0.85553 0.106942  2.6364 0.66621  0.001 ***
# Residuals  8   0.32451 0.040563         0.25270           
# Total     17   1.28418                  1.00000           
# ---
# Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1


table( rare.male.cecal.dc.16s@sam_data$group )
# anxious less anxious 
#      10            8

table( rare.male.cecal.dc.16s@sam_data$Cage.Name )
# C1 C3 C6 H1 H2 H5 L2 L4 L6 
# 3  1  2  2  3  2  2  2  1 


#
#------------------------



#### END