diff --git a/.gitignore b/.gitignore index e75435c..5334e5b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,12 @@ # History files .Rhistory .Rapp.history - +.Rproj.user # Session Data files .RData .RDataTmp +.Ruserdata +.DS_Store # User-specific files .Ruserdata @@ -47,3 +49,5 @@ po/*~ # RStudio Connect folder rsconnect/ + +Chapter_three/style.css \ No newline at end of file diff --git a/Chapter_three/R_script.html b/Chapter_three/R_script.html new file mode 100644 index 0000000..6f7e640 --- /dev/null +++ b/Chapter_three/R_script.html @@ -0,0 +1,2006 @@ + + + + +
+ + + + + + + + + +This supplementary information mainly describes data analysis part +used in this manuscript. The script used has been modified from Schandry +(2017).
+A common garden experiment was conducted at a single site on 20 enset +(Ensete ventricosum) landraces of 14 months old seedlings to +assess their resistance / tolerance against Xanthomonas wilt disease +caused by Xanthomonas vasicola pv. musacearum (Xvm). Disease +infection data was recorded as count of symptomatic and total leaves per +plant for each landrace. Data recorded at 15 days interval starting from +21st days post inoculation (dpi) to 155th day for 10 time points. +Percentage of symptomatic leaves calculated and converted into disease +index of 0-4 range numeric score multiplying each record by 4 and named +as ‘disease index’ or DI.
+If the required list of packages any are not available in the +machine, then the missing package(s) will be installed from CRAN.
+# List of package required
+packages = c("tidyr", "broom", "tibble", "dplyr", "ggplot2", "lme4", "lmerTest", "multcomp", "lsmeans", "MESS", 'agricolae', 'magrittr', "rms","coxme", "survival", "stringr","grid","tinytex")
+
+# Install packages missing packages.
+installed_packages <- packages %in% rownames(installed.packages())
+if (any(installed_packages == FALSE)) {
+ install.packages(packages[!installed_packages])
+}
+The input data was named as enset_xvm.csv and need to be placed in +the working directory so the the script will be able to read it.
+# get path to the current working direcotry
+work_dir <- getwd()
+
+path_to_enset_xvm <- file.path(work_dir, basename("enset_xvm.csv"))
+
+enset_xvm<- read.csv(path_to_enset_xvm)
+head(enset_xvm) # view the first six row of imported data
+## landrace plant ptno block ind dss21 leaf21 di21 dss35 leaf35 di35 dss50
+## 1 Lemat Enset 1 1 1 0 6 0 0 5 0 0
+## 2 Lemat Enset 2 1 1 0 5 0 0 4 0 0
+## 3 Lemat Enset 3 1 1 0 7 0 0 6 0 0
+## 4 Lemat Enset 4 1 1 0 7 0 0 7 0 0
+## 5 Lemat Enset 5 1 1 0 7 0 0 7 0 0
+## 6 Lemat Enset 6 1 1 0 6 0 0 6 0 0
+## leaf50 di50 dss65 leaf65 di65 dss80 leaf80 di80 dss95 leaf95 di95 dss110
+## 1 6 0 1 6 0.67 2 6 1.33 2 7 1.14 2
+## 2 5 0 0 5 0.00 0 5 0.00 0 6 0.00 0
+## 3 7 0 1 6 0.67 1 6 0.67 1 7 0.57 2
+## 4 7 0 0 7 0.00 1 7 0.57 1 8 0.50 2
+## 5 7 0 0 7 0.00 0 7 0.00 0 7 0.00 0
+## 6 6 0 0 6 0.00 0 6 0.00 0 6 0.00 0
+## leaf110 di110 dss125 leaf125 di125 dss140 leaf140 di140 dss155 leaf155 di155
+## 1 6 1.33 2 7 1.14 1 5 0.80 1 4 1.0
+## 2 7 0.00 1 6 0.67 0 5 0.00 0 4 0.0
+## 3 6 1.33 2 6 1.33 2 6 1.33 2 5 1.6
+## 4 6 1.33 2 8 1.00 1 7 0.57 0 7 0.0
+## 5 5 0.00 1 6 0.67 1 5 0.80 1 4 1.0
+## 6 6 0.00 0 6 0.00 0 7 0.00 0 7 0.0
+In the dataframe variable dss, leaf and di refers to count of +diseased leaves, count of total leaves, and disease index, respectively. +The numbers trailing dss, leaf, and di variables indicate the day on +which data was recorded. Disease index is calculated as 4 times +percentage of diseased leaves (4*dss/leaf). For details see Material and +Method section of the manuscript.
+Inspect data structure
+## 'data.frame': 600 obs. of 35 variables:
+## $ landrace: chr "Lemat" "Lemat" "Lemat" "Lemat" ...
+## $ plant : chr "Enset" "Enset" "Enset" "Enset" ...
+## $ ptno : int 1 2 3 4 5 6 7 8 9 10 ...
+## $ block : int 1 1 1 1 1 1 1 1 1 1 ...
+## $ ind : int 1 1 1 1 1 1 1 1 1 1 ...
+## $ dss21 : int 0 0 0 0 0 0 0 0 0 0 ...
+## $ leaf21 : int 6 5 7 7 7 6 7 6 5 5 ...
+## $ di21 : num 0 0 0 0 0 0 0 0 0 0 ...
+## $ dss35 : int 0 0 0 0 0 0 0 0 0 0 ...
+## $ leaf35 : int 5 4 6 7 7 6 7 6 5 5 ...
+## $ di35 : num 0 0 0 0 0 0 0 0 0 0 ...
+## $ dss50 : int 0 0 0 0 0 0 0 0 0 0 ...
+## $ leaf50 : int 6 5 7 7 7 6 7 6 5 5 ...
+## $ di50 : num 0 0 0 0 0 0 0 0 0 0 ...
+## $ dss65 : int 1 0 1 0 0 0 0 0 0 0 ...
+## $ leaf65 : int 6 5 6 7 7 6 7 6 5 5 ...
+## $ di65 : num 0.67 0 0.67 0 0 0 0 0 0 0 ...
+## $ dss80 : int 2 0 1 1 0 0 1 0 0 0 ...
+## $ leaf80 : int 6 5 6 7 7 6 7 6 5 5 ...
+## $ di80 : num 1.33 0 0.67 0.57 0 0 0.57 0 0 0 ...
+## $ dss95 : int 2 0 1 1 0 0 1 0 0 0 ...
+## $ leaf95 : int 7 6 7 8 7 6 7 6 5 5 ...
+## $ di95 : num 1.14 0 0.57 0.5 0 0 0.57 0 0 0 ...
+## $ dss110 : int 2 0 2 2 0 0 1 0 0 0 ...
+## $ leaf110 : int 6 7 6 6 5 6 7 6 6 6 ...
+## $ di110 : num 1.33 0 1.33 1.33 0 0 0.57 0 0 0 ...
+## $ dss125 : int 2 1 2 2 1 0 1 0 0 0 ...
+## $ leaf125 : int 7 6 6 8 6 6 7 6 6 6 ...
+## $ di125 : num 1.14 0.67 1.33 1 0.67 0 0.57 0 0 0 ...
+## $ dss140 : int 1 0 2 1 1 0 2 0 0 0 ...
+## $ leaf140 : int 5 5 6 7 5 7 7 7 6 6 ...
+## $ di140 : num 0.8 0 1.33 0.57 0.8 0 1.14 0 0 0 ...
+## $ dss155 : int 1 0 2 0 1 0 2 0 0 0 ...
+## $ leaf155 : int 4 4 5 7 4 7 6 7 6 6 ...
+## $ di155 : num 1 0 1.6 0 1 0 1.33 0 0 0 ...
+The wide formatted enset_xvm was converted to long format for ease of +analysis. In order to keep the original data as it was we have grouped +identical variable together for all landraces and then regrouped into +single dataframe to convert it to long formatted data.
+#extract number of leaves showing disease symptoms at each evaluation day
+leaf_dss=enset_xvm[,c(1:6,9,12,15,18,21,24,27,30,33)]
+
+#extract total leaves at each evaluation day
+leaf_num <- enset_xvm[,c(1:5,7,10,13,16,19,22,25,28,31,34)]
+
+#extract disease index values at each evaluation day
+di_wk <- enset_xvm[,c(1:5,8,11,14,17,20,23,26,29,32,35)]
+
+library(tidyr)
+#convert the extracted data into long format
+leaf_dss_long <- gather(leaf_dss,key = "dpi",
+ value = "dss_leaf",
+ dss21:dss155,na.rm = F)# converting data of disease leaf (this is 3 to 13) into long format
+leaf_num_long <- gather(leaf_num,key = "leaf_number",
+ value = "leaf_num",
+ leaf21:leaf155,na.rm = F)#converting total leaf data (this is 3 to 13) into long format
+di_wk_long <- gather(di_wk,key = "di",
+ value = "disease_index",
+ di21:di155,na.rm = F)# converting data disease index (this is 3 to 13) into long format
+
+#Combine long formatted subset data
+enset_xvm_long <- cbind(leaf_dss_long,leaf_num_long$leaf_num,di_wk_long$disease_index)
+
+enset_xvm_long$subject <- c(1:nrow(enset_xvm_long))# add subject variable. This variable helps to assign a unique, numeric identifier to each individual.
+enset_xvm_long$count_dslf <- c(rep(1,6000))# add variable count_dslf: a repeats of 1 from 1 to the length of data set (6000). This variable will be used to count the number of observations in landrace at each evaluation period that show disease. It might be used to calculate incubation period of landraces for the disease under study.
+colnames(enset_xvm_long) <- c("landrace", "plant", "ptno", "block", "ind", "dpi", "diseased", "leaf", "di", "subject", "count_dslf")# rename column headings
+enset_xvm_long$id <- interaction(enset_xvm_long$landrace,enset_xvm_long$ptno,enset_xvm_long$block,enset_xvm_long$ind) # create id column of landrace, plant number and block.
+
+enset_xvm_long <- na.omit(enset_xvm_long) #Generate a table that does not contain missing observations
+enset_xvm_long$dpi <- na.omit(as.numeric (unlist( strsplit( as.character( enset_xvm_long$dpi ), "dss" )))) #discard the "X" and save the number as a number (instead of a factor).
+
+# Factorize variables landrace, plant and block
+enset_xvm_long$block <- as.factor(enset_xvm_long$block)
+enset_xvm_long$plant <- as.factor(enset_xvm_long$plant)
+enset_xvm_long$landrace <- as.factor(enset_xvm_long$landrace)
+
+# # view structure of long formatted data
+# str(enset_xvm_long)
+Checking overall summary for missing values, or out of range +misrecorded values. Missing values coded as NA and will be observed +print in summary function if there are any. Out of range values can be +checked by looking at the min and max values of the summary table and +make sure if they’re within the expected range.
+summary(enset_xvm_long)
+## landrace plant ptno block ind
+## Abatemerza : 300 Enset:6000 Min. : 1.0 1:2000 Min. : 1.00
+## Ado : 300 1st Qu.: 3.0 2:2000 1st Qu.: 5.75
+## Alagena : 300 Median : 5.5 3:2000 Median :10.50
+## Arkiya : 300 Mean : 5.5 Mean :10.50
+## Bededet : 300 3rd Qu.: 8.0 3rd Qu.:15.25
+## Bota Arkiya: 300 Max. :10.0 Max. :20.00
+## (Other) :4200
+## dpi diseased leaf di
+## Min. : 21.0 Min. : 0.000 Min. : 1.000 Min. :0.000
+## 1st Qu.: 50.0 1st Qu.: 0.000 1st Qu.: 5.000 1st Qu.:0.000
+## Median : 87.5 Median : 2.000 Median : 6.000 Median :1.330
+## Mean : 87.6 Mean : 2.287 Mean : 6.255 Mean :1.483
+## 3rd Qu.:125.0 3rd Qu.: 4.000 3rd Qu.: 7.000 3rd Qu.:2.400
+## Max. :155.0 Max. :15.000 Max. :20.000 Max. :4.000
+##
+## subject count_dslf id
+## Min. : 1 Min. :1 Lemat.1.1.1: 10
+## 1st Qu.:1501 1st Qu.:1 Lemat.2.1.1: 10
+## Median :3000 Median :1 Lemat.3.1.1: 10
+## Mean :3000 Mean :1 Lemat.4.1.1: 10
+## 3rd Qu.:4500 3rd Qu.:1 Lemat.5.1.1: 10
+## Max. :6000 Max. :1 Lemat.6.1.1: 10
+## (Other) :5940
+Assigning and setting proper contrast is in analysis of variance. See +for detail here. +“Treatment” contrasts specify that the first alphabetical level will be +used as a reference for all others (see landrace below), while a “sum” +constrast means that the reference value is the mean across all levels +of that variable (the grand mean).
+####Specify what should be "appropriate" contrasts
+contrasts(enset_xvm_long$landrace) <- "contr.treatment" #First alphabetical landrace will be the baseline!
+contrasts(enset_xvm_long$block) <- "contr.sum" # blockes will be averaged to generate the baseline!
+A new variable called “Useful” is added the data and is a binary +yes/no variable. The purpose of this variable is marking observations, +that are re-observations of a subject that has previously reached +disease index 4. As disease index of 4 means 100% wilting, the plant +either dead or considered as dead when it reached this degree of +wilting. Since death is permanent, continuing to observe this plant is +unlikely to provide new information. To more accurately capture the +linear phase of disease development, all observations where no symptoms +are visible, except the one on the day before disease onset, will also +be assigned to Useful = No.
+###Interesting R-related observation, the below does not work when subsetting is done with filter(),
+###because filter does not retain rownames!
+for (i in 1:max(enset_xvm_long$subject)) { ###Go by subject
+ dummy1 <- enset_xvm_long[enset_xvm_long$subject==i,] ##Create a first dummy object, that is a subset of the full data containing the current subject
+ if(min(dummy1$di) == 0){
+ #remove those observations that are before disease onset, except the one directly before disease onset.
+ dummy1 <- dummy1[dummy1$dpi %in% (max(dummy1$dpi[dummy1$di==0]):max(dummy1$dpi)),]
+ }
+ if (max(dummy1$di) == 4) { ###If this subject dies at some point
+ dummy2 <- dummy1[dummy1$di==4,] ###Make a new dummy object, that only contains those recordings where di=4
+ NEW <- enset_xvm_long[enset_xvm_long$subject==i & (enset_xvm_long$dpi %in% min(dummy1$dpi):min(dummy2$dpi)),] ###Generate data subset "NEW", which contains those observations for a subject, that are between (including) the last recording where di=0 and the first recording where disease index is 4.
+ } else { ###If this dubject does not die
+ NEW <- dummy1 ###New is the same as dummy1
+ }
+ enset_xvm_long$Useful[rownames(enset_xvm_long) %in% rownames(NEW)] <- c("Yes") ###All of those row(names) that are part of the "NEW" object are useful. Therefore these receive status "Yes" in column "Useful"
+}
+enset_xvm_long$Useful[which(is.na(enset_xvm_long$Useful))] <- c("No") ###Those that are not yes (and therefore are NA) become No
+rm(dummy1,dummy2)
+Inspecting the final data before analysis
+library(broom)
+library(tibble)
+as_tibble(enset_xvm_long)
+## # A tibble: 6,000 × 13
+## landrace plant ptno block ind dpi diseased leaf di subject
+## <fct> <fct> <int> <fct> <int> <dbl> <int> <int> <dbl> <int>
+## 1 Lemat Enset 1 1 1 21 0 6 0 1
+## 2 Lemat Enset 2 1 1 21 0 5 0 2
+## 3 Lemat Enset 3 1 1 21 0 7 0 3
+## 4 Lemat Enset 4 1 1 21 0 7 0 4
+## 5 Lemat Enset 5 1 1 21 0 7 0 5
+## 6 Lemat Enset 6 1 1 21 0 6 0 6
+## 7 Lemat Enset 7 1 1 21 0 7 0 7
+## 8 Lemat Enset 8 1 1 21 0 6 0 8
+## 9 Lemat Enset 9 1 1 21 0 5 0 9
+## 10 Lemat Enset 10 1 1 21 0 5 0 10
+## # ℹ 5,990 more rows
+## # ℹ 3 more variables: count_dslf <dbl>, id <fct>, Useful <chr>
+# tidy(enset_xvm_long) ###This can be used to assess the descriptive statistics of the data.
+Here the data is summarized to visualize area under the curve using +disease index (y) and time (x).
+Summarizing disease index recordings and generating disease area +plot
+library(dplyr)
+di_summary <- enset_xvm_long %>%
+ group_by(plant, landrace, block, dpi) %>%
+ summarise(mean=mean(di),sd=sd(di),se=sd(di)/sqrt(length(di))) ##calculate within block mean, sd, and se for each plant/landrace combination.
+##Averages within each replicate. This summary table is mainly helpful for plotting, not used for analysis...
+colnames(di_summary) <- c("plant", "landrace", "block", "dpi", "mean", "sd", "se") ### rename column names
+
+library(ggplot2)
+ggplot(di_summary,aes(x=dpi,y=mean)) + ###Use data di_summary
+ geom_area(aes(color=block,fill=block))+
+ # geom_line() +#show line by each block
+ #aes(x=dpi,y=mean,color=landrace) + ###Color by landrace, specify x and y.
+ # geom_area(aes(fill=block),position="identity",alpha=0.15) + ###Area plot, colored by landrace
+ facet_wrap(~landrace) + ###One plot per landrace
+ labs(x = "Days post infection", y = "Average disease index") + #Labels
+ ggtitle("Disease areas per landrace, for blocks represented by three filled color")+ #Title
+ theme_bw()+
+ theme(strip.text.x = element_text(size = 14,face="bold"),
+ axis.text.x = element_text(size = 10,face = "bold"),
+ axis.title.x = element_text(size = 12,face = "bold"),
+ axis.title.y = element_text(size = 12,face = "bold"),
+ axis.text.y =element_text(size = 12,face ="bold" ),
+ legend.position = "none")
+
+From this plot, one can see that in the example dataset the areas +differ quite drastically among different landraces. To calculated the +actual audps for each individual in the dataset a new data frame is +created. AUDCP is calculated using audps function in agricolae +package.
+library("MESS")
+library('agricolae') # Used to calculated audps
+library('magrittr') # Adds %>% operator for chaining commands
+
+# Calculate AUDPS
+audps_data <- enset_xvm_long %>%
+ group_by(landrace, ptno, block) %>% # do any following calculations on every combination of treatments
+ summarise(audps = round(audps(di, dpi),0))
+Linear model and linear mixed effects model were fitted to identify +and select the better fit model to area under disease progress data to +be used for final analysis.
+library("lme4")
+library("lmerTest")
+auc_lm <- lm(audps~landrace+block,data=audps_data) #Linear model to identify landrace and block effect.
+auc_lm1<-lm(audps~landrace,data=audps_data) ### linear model to identify landrace effects.
+auc_lmer <- lmer(audps ~ landrace + (1|block),data = audps_data ) #Linear mixed effects model. AUC modeled as a function of landrace and random effects of block.
+AIC(auc_lm,auc_lm1,auc_lmer) #Lower AIC, better fit, linear mixed model is slightly better. #Hence, mixed model with block as random effect will be used for further analysis.
+## df AIC
+## auc_lm 23 7164.997
+## auc_lm1 21 7194.536
+## auc_lmer 22 7021.008
+Summary of mixed linear model fit to AUDPS
+summary(auc_lmer) ### A model summary, containing information on the model.
+## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
+## lmerModLmerTest]
+## Formula: audps ~ landrace + (1 | block)
+## Data: audps_data
+##
+## REML criterion at convergence: 6977
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -3.3257 -0.6504 -0.0080 0.6636 3.6631
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## block (Intercept) 674.8 25.98
+## Residual 8642.7 92.97
+## Number of obs: 600, groups: block, 3
+##
+## Fixed effects:
+## Estimate Std. Error df t value Pr(>|t|)
+## (Intercept) 264.567 22.650 9.148 11.681 8.39e-07 ***
+## landraceAdo -66.867 24.004 578.000 -2.786 0.00552 **
+## landraceAlagena -94.400 24.004 578.000 -3.933 9.42e-05 ***
+## landraceArkiya 121.967 24.004 578.000 5.081 5.07e-07 ***
+## landraceBededet -107.900 24.004 578.000 -4.495 8.40e-06 ***
+## landraceBota Arkiya -48.667 24.004 578.000 -2.027 0.04307 *
+## landraceDirbo -19.133 24.004 578.000 -0.797 0.42572
+## landraceGefetano -30.800 24.004 578.000 -1.283 0.19996
+## landraceGenticha 12.167 24.004 578.000 0.507 0.61244
+## landraceGezewet -109.067 24.004 578.000 -4.544 6.73e-06 ***
+## landraceGinbuwe -2.033 24.004 578.000 -0.085 0.93252
+## landraceGodere 43.067 24.004 578.000 1.794 0.07331 .
+## landraceHae'la -167.633 24.004 578.000 -6.984 7.93e-12 ***
+## landraceKuro -115.200 24.004 578.000 -4.799 2.03e-06 ***
+## landraceLemat -144.067 24.004 578.000 -6.002 3.45e-09 ***
+## landraceMazia -145.200 24.004 578.000 -6.049 2.62e-09 ***
+## landraceNechuwe -11.200 24.004 578.000 -0.467 0.64097
+## landraceUnjame -5.300 24.004 578.000 -0.221 0.82533
+## landraceWachiso 4.833 24.004 578.000 0.201 0.84049
+## landraceYesha 32.200 24.004 578.000 1.341 0.18030
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+Model summaries contain information on the estimated mean (Estimate) +and standard errors, together with a t - and corresponding p-value in +the output of the summary function. Note, that the above only contains +information on differences between different levels and the reference, +which is called (Intercept) in this case it’s landrace Abatemerza. The +reference is contrast dependent.
+ranova(auc_lmer)
+## ANOVA-like table for random-effects: Single term deletions
+##
+## Model:
+## audps ~ landrace + (1 | block)
+## npar logLik AIC LRT Df Pr(>Chisq)
+## <none> 22 -3488.5 7021.0
+## (1 | block) 21 -3500.9 7043.8 24.798 1 6.368e-07 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#The result displays block has a significant random effect explaining the importance of including block variable in the model.
+As observed in the model summary R by default does contrast with the +alphabetically first landrace. This can be analyzed using a generalized +linear hypothesis test (glht), while adjusting for multiple comparisons +using Tukey’s method. Below is shown glht analysis using Tukey’s method +for area under disease progress stairs (AUDPS).
+library("multcomp")
+library("lsmeans")
+##carry out mean separation and adjust for compact letter desply (cld)###
+cld_audps_lmer <- cld(glht(auc_lmer, linfct = mcp(landrace = "Tukey")), test = adjusted("holm"))###Save letters
+cld_audps_cbind <- cbind(levels(enset_xvm_long$landrace),cld_audps_lmer$mcletters$Letters) ###bind letters to columns of respective landraces
+cld_audps_df <- data.frame(cld_audps_cbind)#format extracted cld with landraces from mean separation to data.frame
+colnames(cld_audps_df) <- c("landrace","aud_let")###Name columns
+rownames(cld_audps_df) <- NULL #remove repeated rownames
+cld_audps_df$landrace <- as.factor(cld_audps_df$landrace)
+
+#Extrating estimated mean and confidence interval of landraces using lsmeanLT
+audps_lsmeansLT <- lsmeansLT(auc_lmer,test.effs="landrace")#lsmeansLT calculates Least Squares Means and Confidence Intervals for the factors of a fixed part of mixed effects model of lmer object
+#audps_lsmeansLT=audps_lsmeansLT$lsmeans.table# extract table containing means with confidence interval
+rownames(audps_lsmeansLT) <- NULL #remove row names containing duplicates of landrace lists
+audps_lsmeansLT<-as.data.frame(audps_lsmeansLT)[,-1]
+
+library (dplyr)
+
+cld_audps <- cbind(audps_lsmeansLT,cld_audps_df[,2]) #combine extracted mean data from lmer using lsmeansLT and dataframe of letters from cld
+rownames(cld_audps) <- NULL #remove repeated rownames
+colnames(cld_audps)=c("landrace", "audps", "Stand_Error","df","t-value", "lower", "upper", "p_value", "audp_let")# rename column of combined orinal data. Here landrace is duplicated and can be remove but left as landrace
+cld_audps <- as.data.frame(cld_audps) #Coerce data containing means with confidence interval and letter display to dataframe
+cld_audps <- cld_audps[order(cld_audps$audps),]# reorder by mean value to cheat ggplot
+#cld_audps_lmerer$landrace<-factor(cld_audps_lmerer$landrace,levels=cld_audps_lmerer$landrace)# put factors in level
+
+audps_data__cld <- left_join(audps_data,cld_audps_df, by ="landrace",copy=T) ###Add letter information to raw data - audps
+head(audps_data__cld)
+## # A tibble: 6 × 5
+## # Groups: landrace, ptno [2]
+## landrace ptno block audps aud_let
+## <fct> <int> <fct> <dbl> <chr>
+## 1 Abatemerza 1 1 146 ab
+## 2 Abatemerza 1 2 386 ab
+## 3 Abatemerza 1 3 228 ab
+## 4 Abatemerza 2 1 311 ab
+## 5 Abatemerza 2 2 327 ab
+## 6 Abatemerza 2 3 286 ab
+This information can, for example, be integrated into a boxplot of +the individual disease areas. For example, using a compact letter +display, in combination with audps, combines statistical and visual +information. First, however, the grouping letters need to be calculated. +Then, these letters are added to the boxplot generated earlier.
+####Generate plot of meanCI of AUDCP with significance letters and raw data as jittered points
+cld_audps <- cld_audps[order(cld_audps$audps),]# reorder by mean value to cheat ggplot
+cld_audps$landrace <- factor(cld_audps$landrace,levels = cld_audps$landrace)#put factors in level
+
+ggplot(aes(x=landrace, y=audps, color=landrace),data=audps_data__cld) + ###Plot the audps_data
+ scale_shape_manual(values=c(9, 19, 22,26))+
+ geom_crossbar(data = cld_audps, aes(x = landrace, y = audps, ymin = lower, ymax = upper,fill=landrace), alpha=0.3) +
+ geom_jitter(aes(shape=block),data = audps_data__cld) + ###with jitter overplotted, symbol shape defined by block
+ geom_text(aes(x=landrace, y=0.039, label=aud_let),color="black", data=audps_data__cld) + ###Get the letters from auc_cld
+ #and write those to position y=-3
+ labs(y="audps") + #Y-Axis label
+ ggtitle("Mean and raw audps values per landrace and grouping letters") +#Title
+ theme_bw()+
+ theme(axis.title.x = element_text(face="bold", size=12), axis.text.x = element_text(angle=90, vjust=0.5, size=12))+
+ theme(legend.position="none")
+
+Linear mixed model (Bates, 2010) +used to handle the repeated measure data collected in this experiment +for testing landrace specific influence on the area under the disease +progression curve. Here, individual data of disease index which +formatted earlier will be used. Diease index (di) is modeled on the +fixed effects “landrace” and an interaction of landrace and time. Block +is included as a random effect, meaning it is not of direct interest, +but assumed to capture introduced variation, specifically by affecting +the intercept.
+library("lmerTest")
+###Define contrasts for lmer
+contrasts(enset_xvm_long$landrace) <- "contr.treatment"###First alphabetical landrace will be the baseline!
+
+contrasts(enset_xvm_long$block) <- "contr.sum" ###blockes will be averaged
+###Drop things that are not "Useful"
+enset_xvm_long_useful <- filter(enset_xvm_long, Useful=="Yes")
+# str(enset_xvm_long_useful)
+## only landrace as fixed effect (a simple model),
+## block and subject are assumed to have random effects.
+## landrace and time are assumed to interact, meaning that for each day, the landrace specific slope is estimated.
+## See page 6 (Table2) of the lme4 vignette on how to construct error terms
+## This model is supposed to capture disease development, so it will work with the data previously deemed "useful"
+disease_lmer <- lmer(di~landrace+(1|dpi/landrace)+(1|block/plant),data=enset_xvm_long_useful,REML = F)# Here the final optimum model is that best fits the data is used and model simplication steps not shown here.
+View summary of ananlysis of variance
+#analysis of variance
+anova(disease_lmer)# Analysis of variance shows highly significant difference among landrace for disease index
+## Type III Analysis of Variance Table with Satterthwaite's method
+## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
+## landrace 584.82 30.78 19 189.98 37.899 < 2.2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+The model can be investigated using summary functions.
+###Check model summary
+summary(disease_lmer)
+## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
+## method [lmerModLmerTest]
+## Formula: di ~ landrace + (1 | dpi/landrace) + (1 | block/plant)
+## Data: enset_xvm_long_useful
+##
+## AIC BIC logLik deviance df.resid
+## 16066.9 16234.4 -8008.4 16016.9 5975
+##
+## Scaled residuals:
+## Min 1Q Median 3Q Max
+## -3.4267 -0.6379 -0.0575 0.6076 3.5720
+##
+## Random effects:
+## Groups Name Variance Std.Dev.
+## landrace:dpi (Intercept) 0.038658 0.19662
+## dpi (Intercept) 0.575584 0.75867
+## plant:block (Intercept) 0.022272 0.14924
+## block (Intercept) 0.007284 0.08535
+## Residual 0.812169 0.90120
+## Number of obs: 6000, groups:
+## landrace:dpi, 200; dpi, 10; plant:block, 3; block, 3
+##
+## Fixed effects:
+## Estimate Std. Error df t value Pr(>|t|)
+## (Intercept) 1.76837 0.27200 14.14362 6.501 1.33e-05 ***
+## landraceAdo -0.44803 0.11466 189.97888 -3.908 0.00013 ***
+## landraceAlagena -0.63163 0.11466 189.97888 -5.509 1.16e-07 ***
+## landraceArkiya 0.82113 0.11466 189.97888 7.162 1.70e-11 ***
+## landraceBededet -0.72333 0.11466 189.97888 -6.309 1.93e-09 ***
+## landraceBota Arkiya -0.32393 0.11466 189.97888 -2.825 0.00523 **
+## landraceDirbo -0.12797 0.11466 189.97888 -1.116 0.26579
+## landraceGefetano -0.20840 0.11466 189.97888 -1.818 0.07070 .
+## landraceGenticha 0.08113 0.11466 189.97888 0.708 0.48005
+## landraceGezewet -0.73017 0.11466 189.97888 -6.368 1.40e-09 ***
+## landraceGinbuwe -0.01337 0.11466 189.97888 -0.117 0.90732
+## landraceGodere 0.28467 0.11466 189.97888 2.483 0.01390 *
+## landraceHae'la -1.12143 0.11466 189.97888 -9.781 < 2e-16 ***
+## landraceKuro -0.77153 0.11466 189.97888 -6.729 1.96e-10 ***
+## landraceLemat -0.96287 0.11466 189.97888 -8.398 1.04e-14 ***
+## landraceMazia -0.97227 0.11466 189.97888 -8.480 6.23e-15 ***
+## landraceNechuwe -0.07667 0.11466 189.97888 -0.669 0.50452
+## landraceUnjame -0.03560 0.11466 189.97888 -0.310 0.75653
+## landraceWachiso 0.03403 0.11466 189.97888 0.297 0.76692
+## landraceYesha 0.21630 0.11466 189.97888 1.887 0.06075 .
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## optimizer (nloptwrap) convergence code: 0 (OK)
+## Model failed to converge with max|grad| = 0.0344211 (tol = 0.002, component 1)
+It’s important to check statistically importance random effects in +the model so that decision to include or remove them will be made before +proceeding to any analysis.
+rand(disease_lmer)# the random effects have shown highly significance suggesting that it wise to include these random effects in the model
+## ANOVA-like table for random-effects: Single term deletions
+##
+## Model:
+## di ~ landrace + (1 | landrace:dpi) + (1 | dpi) + (1 | plant:block) + (1 | block)
+## npar logLik AIC LRT Df Pr(>Chisq)
+## <none> 25 -8008.4 16067
+## (1 | landrace:dpi) 24 -8056.8 16162 96.78 1 <2e-16 ***
+## (1 | dpi) 24 -8209.7 16467 402.51 1 <2e-16 ***
+## (1 | plant:block) 24 -8008.4 16065 0.00 1 1
+## (1 | block) 24 -8008.4 16065 0.00 1 1
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+Here function lsmeanLT from lmer package to extract means with their +confidence interval. Then multiple comparison of landrace for disease +index can be carried using general linear hypotheses (glht)and multiple +comparisons for parametric models from multicomp package. The result +adjusted to letter display for difference among landrace using compact +letter display (cld) function. ggplot2 used to plot the result combining +with original disease index data.
+di_lsmeansLT=lsmeansLT(disease_lmer,test.effs="landrace")#lsmeansLT calculates Least Squares Means and Confidence Intervals for the factors of a fixed part of mixed effects model of lmer object
+
+di_lsmeansLT<-as.data.frame(di_lsmeansLT)[,-1]
+rownames(di_lsmeansLT) <- NULL #remove row names containing duplicates of landrace lists
+
+library (dplyr)
+di_lsmeansLT <- di_lsmeansLT %>% mutate_if(is.numeric, funs(round(., 2)))#round data.frame to two digist using dplyr
+
+##carry out mean separation and adjust for compact letter desply (cld)###
+cld_di_lmer <- cld(glht(disease_lmer, linfct = mcp(landrace = "Tukey")), test = adjusted("holm"))###Save letters
+cld_di_lmer <- cbind(levels(enset_xvm_long$landrace),cld_di_lmer$mcletters$Letters) ###bind letters to columns of respective landraces
+cld_di_lmer <- data.frame(cld_di_lmer)#formatextracted cld with landraces from mean separation to data.frame
+colnames(cld_di_lmer) <- c("landrace","di_let")###Name columns
+rownames(cld_di_lmer) <- NULL #remove repeated rownames
+di_lmer_cld <- cbind(di_lsmeansLT,cld_di_lmer[,2]) #combine extracted mean data from lmer using lsmeansLT and dataframe of letters from cld
+rownames(di_lmer_cld) <- NULL #remove repeated rownames
+colnames(di_lmer_cld) <- c("landrace", "mean", "Stand_Error", "DF", "t-value", "lower", "upper", "p-value", "di_let")# rename column of combined orinal data.
+di_lmer_cld <- as.data.frame(di_lmer_cld) #Coerce data containing means with confidence interval and letter display to dataframe
+di_lmer_cld <- di_lmer_cld[order(di_lmer_cld$mean),]# reorder by mean value to cheat ggplot
+di_lmer_cld$landrace<-factor(di_lmer_cld$landrace,levels=di_lmer_cld$landrace)# put factors in level
+di_long_cld <- left_join(enset_xvm_long,cld_di_lmer,by="landrace",copy=T) ###Add letter information to raw data - disease index
+di_long_cld_copy <- di_long_cld
+di_long_cld_copy <- di_long_cld_copy[order(di_long_cld_copy$di),]# reorder by mean value to cheat ggplot
+di_lmer_cld=di_lmer_cld[order(di_lmer_cld$mean),]# reorder by mean value to cheat ggplot
+di_lmer_cld$landrace<-factor(di_lmer_cld$landrace, levels = di_lmer_cld$landrace)# put factors in level
+di_long_cld$block <- as.factor(di_long_cld$block)
+####Generate plot of mean+confidence interval of disease index with significance letters and raw data as jittered points#
+ggplot(aes(x=landrace, y=di, color=landrace),data=di_long_cld) + ###Plot the origial disease index data
+ geom_crossbar(data = di_lmer_cld, aes(x = landrace, y = mean, ymin = lower, ymax = upper,fill=landrace), alpha=0.3) +
+ geom_jitter(aes(shape=block)) + ###with jitter overplotted, symbol shape defined by block
+ scale_shape_manual(values=c(9, 19, 22,23))+
+ geom_text(aes(x=landrace, y=-0.25, label=di_let,size=10),color="black", data=di_long_cld) + ###Get the letters from di_long_cld
+ #and write those to position y=-3
+ labs(y="disease index") + #Y-Axis label
+ ggtitle("disease index raw values and mean from the LMM\ nwith 95% CI per landrace and grouping letters") +#Title
+ theme_bw()+
+ theme(axis.title.x = element_text(face="bold", size=14), axis.text.x = element_text(angle=90, vjust=0.5, size=12,face = "bold"))+
+ theme(axis.title.y= element_text(size = 12,face = "bold"))+
+ theme(legend.position="none")
+
+Apparent infection rate which is the speed at which an epidemic +develops (Meena +et al., 2011), was calculated as the slope of disease index +development.
+Disease index data though best fits to mixed model we could not able +to extract rate of disease development value for each landrace using +mixed model approach. Instead we considered linear development of +disease and used disease index as function of time and intercept for +analysis (i.e y=ax+b, where y is disease index, x is time and b is +intercept). In this way a function that goes through each landrace is +constructed to extract the slop of disease index from each landrace.
+slop_data=di_wk# take copy of the original wide formated data for analysis
+
+slop_api<-numeric(600)
+time<-c(21,35,50,65,80,95,110,125,140,155)# disease evaluation days after inoculation of the pathogen.
+for(i in 1:600) # go row by row from the first to the last row (60th)
+{
+ y<-as.numeric(slop_data[i,6:15])#drop column 1-3 from the data disease index (di)
+ l<-lm(y~time) # construct linear model for each row data (each landrace in blocks so that the coeffiecent of linear model will extract for slop analysis)
+ slop_api[i]<- l$coefficients[2]# extract slop of the model
+}
+di_wk$slop_api=round(slop_api,3)# add extract slop to original disease index data
+slop=di_wk[,c(1:6,16)]# take columns containing landrace, plant and slop
+# head(slop)#view the first six rows
+slop$block=as.factor(slop$block)#factorize block
+slop$landrace=as.factor(slop$landrace)#factorize landrace
+enset_xvm$id=interaction(enset_xvm$landrace,enset_xvm$ptno,enset_xvm$block)# create combination id of landrace plant number and block which it belongs
+contrasts(slop$landrace) <- "contr.treatment"###First alphabetical landrace will be the baseline!
+contrasts(slop$block) <- "contr.sum" ###blockes will be averaged to generate the baseline!
+#analyse data use
+slop_lm=lm(slop_api~ block+landrace, data=slop)
+slop_lmer=lmer(slop_api~ landrace +(1|block), data=slop)
+AIC(slop_lm,slop_lmer)# The lower AIC the better the model is and when comparing negative AIC model with, model with high negative value is better as indicated before in anakysis of area under disease progress. So linear model is the choice for analysis.
+## df AIC
+## slop_lm 23 -3962.235
+## slop_lmer 22 -3735.224
+summary(slop_lm)# view model summary
+##
+## Call:
+## lm(formula = slop_api ~ block + landrace, data = slop)
+##
+## Residuals:
+## Min 1Q Median 3Q Max
+## -0.0253917 -0.0060133 -0.0000358 0.0058717 0.0252050
+##
+## Coefficients:
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) 0.0209667 0.0015947 13.148 < 2e-16 ***
+## block1 -0.0027050 0.0005043 -5.364 1.18e-07 ***
+## block2 0.0024250 0.0005043 4.809 1.94e-06 ***
+## landraceAdo -0.0021000 0.0022553 -0.931 0.352165
+## landraceAlagena -0.0059000 0.0022553 -2.616 0.009127 **
+## landraceArkiya -0.0063000 0.0022553 -2.793 0.005388 **
+## landraceBededet -0.0062333 0.0022553 -2.764 0.005894 **
+## landraceBota Arkiya -0.0081333 0.0022553 -3.606 0.000337 ***
+## landraceDirbo -0.0067333 0.0022553 -2.986 0.002950 **
+## landraceGefetano 0.0005667 0.0022553 0.251 0.801701
+## landraceGenticha -0.0011333 0.0022553 -0.503 0.615490
+## landraceGezewet -0.0054667 0.0022553 -2.424 0.015659 *
+## landraceGinbuwe -0.0044667 0.0022553 -1.981 0.048117 *
+## landraceGodere 0.0065333 0.0022553 2.897 0.003911 **
+## landraceHae'la -0.0077333 0.0022553 -3.429 0.000649 ***
+## landraceKuro -0.0038667 0.0022553 -1.714 0.086974 .
+## landraceLemat -0.0137000 0.0022553 -6.075 2.25e-09 ***
+## landraceMazia -0.0072333 0.0022553 -3.207 0.001414 **
+## landraceNechuwe 0.0011333 0.0022553 0.503 0.615490
+## landraceUnjame -0.0048333 0.0022553 -2.143 0.032520 *
+## landraceWachiso -0.0046000 0.0022553 -2.040 0.041839 *
+## landraceYesha -0.0030333 0.0022553 -1.345 0.179155
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## Residual standard error: 0.008735 on 578 degrees of freedom
+## Multiple R-squared: 0.2285, Adjusted R-squared: 0.2004
+## F-statistic: 8.15 on 21 and 578 DF, p-value: < 2.2e-16
+library(lsmeans)
+
+slop_lsmeans=lsmeans(slop_lm,"landrace") #calculate estimated mean, standard error, DF, and confidence intervals
+slop_lsmeans=summary(slop_lsmeans)
+slop_lsmeans_df=data.frame(slop_lsmeans)# converting it data frame
+str(slop_lsmeans_df)# view data structure
+## 'data.frame': 20 obs. of 6 variables:
+## $ landrace: Factor w/ 20 levels "Abatemerza","Ado",..: 1 2 3 4 5 6 7 8 9 10 ...
+## $ lsmean : num 0.021 0.0189 0.0151 0.0147 0.0147 ...
+## $ SE : num 0.00159 0.00159 0.00159 0.00159 0.00159 ...
+## $ df : num 578 578 578 578 578 578 578 578 578 578 ...
+## $ lower.CL: num 0.0178 0.0157 0.0119 0.0115 0.0116 ...
+## $ upper.CL: num 0.0241 0.022 0.0182 0.0178 0.0179 ...
+colnames(slop_lsmeans_df)=c("landrace", "Estimate", "Stand_Error", "DF", "lower","upper")#rename column headings
+slop_lsmeans_df=slop_lsmeans_df %>% mutate_if(is.numeric, funs(round(., 3)))#round data.frame to two digits using dplyr
+
+###Extract estimated mean and confidence interval of landraces
+di_lsmeansLT=di_lsmeansLT %>% mutate_if(is.numeric, funs(round(., 2)))#round data.frame to two digist using dplyr
+slop_lsmeans=lsmeans(slop_lm,"landrace") #calculate estimated mean, standard error, DF, and confidence intervals
+slop_lsmeans=summary(slop_lsmeans)
+slop_lsmeans_df=data.frame(slop_lsmeans)# converting it data frame
+str(slop_lsmeans_df)# view data structure
+## 'data.frame': 20 obs. of 6 variables:
+## $ landrace: Factor w/ 20 levels "Abatemerza","Ado",..: 1 2 3 4 5 6 7 8 9 10 ...
+## $ lsmean : num 0.021 0.0189 0.0151 0.0147 0.0147 ...
+## $ SE : num 0.00159 0.00159 0.00159 0.00159 0.00159 ...
+## $ df : num 578 578 578 578 578 578 578 578 578 578 ...
+## $ lower.CL: num 0.0178 0.0157 0.0119 0.0115 0.0116 ...
+## $ upper.CL: num 0.0241 0.022 0.0182 0.0178 0.0179 ...
+colnames(slop_lsmeans_df)=c("landrace", "Estimate", "Stand_Error", "DF", "lower","upper")#rename column headings
+slop_lsmeans_df=slop_lsmeans_df %>% mutate_if(is.numeric, funs(round(., 3)))#round data.frame to two digist using dplyr
+
+##carry out mean separation and adjust for compact letter desply (cld)###
+cld_slop_lm=cld(glht(slop_lm, linfct = mcp(landrace = "Tukey")), test = adjusted("holm"))###Save letters
+cld_slop_lm <- cbind(levels(enset_xvm_long_useful$landrace),cld_slop_lm$mcletters$Letters) ###bind letters to columns of respective landraces
+cld_slop_lm=data.frame(cld_slop_lm)#formatextracted cld with landraces from mean separation to data.frame
+colnames(cld_slop_lm)=c("landrace","Letter")###Name columns
+rownames(cld_slop_lm) <- NULL #remove repeated rownames
+slop_lm_cld <- cbind(slop_lsmeans_df,cld_slop_lm$Letter) #combine extracted mean data from lmer using lsmeansLT and dataframe of letters from cld
+rownames(slop_lm_cld)=NULL
+colnames(slop_lm_cld)=c("landrace", "mean", "std_err", "DF", "lower", "upper", "Letters")# rename column of combined orinal data. slop_lmer_cld <- as.data.frame(slop_lmer_cld) #Coerce data containing means with confidence interval and letter display to dataframe
+slop_lm_cld=slop_lm_cld[order(slop_lm_cld$mean),]# reorder by mean value to cheat ggplot
+slop_lm_cld$landrace<-factor(slop_lm_cld$landrace,levels=slop_lm_cld$landrace)# put factors in level
+slop_cld <- left_join(slop,cld_slop_lm,by="landrace",copy=T) ###Add letter information to raw data - disease index
+slop_cld=slop_cld %>% mutate_if(is.numeric, funs(round(., 3)))#round data.frame to two digist using dplyr
+# head(slop_lm_cld)
+#Generate plot of mean+confidence interval of slop with significance letters and raw data as jittered points##
+ggplot(aes(x=landrace, y=slop_api, color=landrace),data=slop_cld) + ###Plot the origial disease index data
+ geom_crossbar(data = slop_lm_cld, aes(x = landrace, y = mean, ymin = lower, ymax = upper,fill=landrace), alpha=0.3) +
+ geom_jitter(aes(shape=block)) + ###with jitter overplotted, symbol shape defined by block
+ scale_shape_manual(values=c(9, 19, 22,23))+
+ geom_text(aes(x=landrace, y=0.04, label=Letters,size=10),color="black", data=slop_lm_cld) +###Get the letters from cld_di_lmer
+ #and write those to position y=-3
+ labs(y="disease index") + #Y-Axis label
+ ggtitle("Mean and raw slop of disease index per landrace and grouping letters") + #Title
+ theme_bw()+
+ theme(axis.title.x = element_text(face="bold", size=10), axis.text.x = element_text(angle=90, vjust=0.5, size=12,face = "bold"))+
+ theme(axis.title.y= element_text(size = 12,face = "bold"))+
+ theme(legend.position="none")
+
+Survival analysis, also known as time-to-event analysis, builds on a +different dataset, which can be generated from raw disease indices. +Survival analysis provides a way to analyze the time-to-event recordings +within one population. In the case of the data used here, what is of +interest is an event that can be referred to as death. +Correctly defining death in this context is crucial for the +outcome of the analysis. Death, here, is defined as a subject +reaching a certain disease index, from which it cannot recover. In this +script the disease index that defines the threshold to death is +called “cutoff”. We used a cutoff value of 2.23 (i.e 55.75% of the +leaves showing disease symptoms due to Xvm) which is the high percent +infection recorded for the least susceptible enset landrace tested here +‘Mazia’.
+In the following plot all of the observations that are above the +dotted line in the plot below are dead, at the day they cross +that line. Those that never cross the line are alive until the end of +observations, and are “right censored”, meaning that their event was not +observed during the time this subject was observed.
+cutoff <- c(2.23)
+ggplot(data=enset_xvm_long) +
+ geom_jitter(aes(x=dpi, y=di,color=landrace,shape=block)) +
+ geom_segment(aes(x=0, xend=max(dpi)+0.5,y=cutoff, yend=cutoff), linetype="dotted") +
+ labs(x = "Days post infection",
+ y = "disease indices",
+ title="Scatterplot of disease indices\n cutoff plotted as dotted line") +
+ coord_cartesian(xlim=c(19,66))
++## 7.1 Generate data for survival analysis A “survival table” can be +generated using the following code. This code works on the long table, +generated in the beginning, and the cutoff variable defined above.
+###Generate survival table
+surv_from_di <- data.frame(Subject=enset_xvm_long$subject,
+landrace=enset_xvm_long$landrace,
+plant=enset_xvm_long$plant,
+block=enset_xvm_long$block)
+###Fill survival table based on the enset_xvm_long table. This generates warnings. These can be ignored and come from the min()
+for (i in 1:max(enset_xvm_long$subject)) { #Go by subject
+ dummy <- enset_xvm_long[enset_xvm_long$subject==i,] #generate dummy for the subject
+ if (is.infinite(min(dummy$dpi[which(dummy$di >= cutoff)]))) { #If none of the di is greater than the cutoff (this is where warnings are generated, min on an empty object returns infinite and a warning!)
+ surv_from_di$End[i] <- max(dummy$dpi) #Generate a observation, censoring at the maximum dpi recorded
+ surv_from_di$Death[i] <- 0 #Still alive, because it did not pass the cutoff
+ } else { #If more than zero di are greater than the cutoff
+ surv_from_di$End[i] <- min(dummy$dpi[which(dummy$di >= cutoff)]) #Use the lowest dpi where condition is met
+ surv_from_di$Death[i] <- 1 #record as dead
+ }
+}
+rm(dummy)
+Kaplan-Meier estimates of survival are the basic tool of survival +analysis. These can be estimated using the survfit function from the +“survival” package.
+library(survival)
+library(stringr)
+library(tidyr)
+surv_di_fit <- survfit(Surv(End, Death) ~landrace +strata(block), data=surv_from_di)
+The survminer package provides the ggsurvplot() function. This works +nicely on datasets with few treatments. However, for the data presented +here, it is easier to initially generate a data frame that contains the +fit and plot with “normal” ggplot2
+###Strata dummy generation, modified from kmiddleton / rexamples
+strata_dummy <-NULL
+for(i in 1:length(surv_di_fit$strata)){
+ # add vector for one strata according to number of rows of strata
+ strata_dummy <- c(strata_dummy, rep(names(surv_di_fit$strata)[i], surv_di_fit$strata[i]))
+}
+###Data frame generation inspired by a post by Hadley Wickham to the ggplot2 Googlegroup
+surv_di_fit.df <- data.frame(
+ time = surv_di_fit$time,
+ n.risk = surv_di_fit$n.risk,
+ n.event = surv_di_fit$n.event,
+ surv = surv_di_fit$surv,
+ strata = strata_dummy,
+ upper = surv_di_fit$upper,
+ lower = surv_di_fit$lower
+)
+zeros <- data.frame(time = 0, surv = 1, strata = names((surv_di_fit$strata)),
+ upper = 1, lower = 1)
+
+surv_di_fit.df <- plyr::rbind.fill(zeros, surv_di_fit.df) ###I dont want to load plyr because i guess it will interfere with dplyr...
+rm(strata_dummy)
+rm(zeros)
+
+##The block and landrace columns are regenerated from the strata field, there are probably more elegant ways to do this
+
+surv_di_fit.df$block <- as.factor( str_split_fixed(
+ matrix( nrow=length(surv_di_fit.df$strata),ncol=2, unlist(strsplit(as.character(surv_di_fit.df$strata),", ")), byrow=T )[,2],"=",2)[,2])
+surv_di_fit.df$landrace <- as.factor( str_split_fixed(
+ matrix( nrow=length(surv_di_fit.df$strata),ncol=2, unlist(strsplit(as.character(surv_di_fit.df$strata),", ")), byrow=T )[,1],"=",2)[,2])
+###End of data frame generation
+###Start plotting
+ggplot(surv_di_fit.df, aes(time, surv, colour = landrace)) +
+ geom_step(aes(y = surv*100,linetype=block)) +
+ facet_wrap(~landrace) +
+ ggtitle("Survival estimates for all blockes")+
+ theme(legend.position="none")
+
+Different approaches to survival analysis, are based on analysing the +hazards. The hazard is the probability of experiencing an event at a +given timepoint. Many hazard based analysis assume that hazards are +proportional between treatments, meaning that they differ by a fixed +factor. Hazards were strongly influenced by Cox and a basic model is the +cox proportional hazards model.
+###Cox-Proportional hazards####
+#Build model
+srv_coxph <- coxph(Surv(End, Death) ~landrace + strata(block), data=surv_from_di)
+summary(srv_coxph)
+## Call:
+## coxph(formula = Surv(End, Death) ~ landrace + strata(block),
+## data = surv_from_di)
+##
+## n= 6000, number of events= 1786
+##
+## coef exp(coef) se(coef) z Pr(>|z|)
+## landraceAdo -0.466029 0.627489 0.154343 -3.019 0.002532 **
+## landraceAlagena -0.830802 0.435700 0.171872 -4.834 1.34e-06 ***
+## landraceArkiya 0.622440 1.863469 0.121764 5.112 3.19e-07 ***
+## landraceBededet -0.906910 0.403770 0.174340 -5.202 1.97e-07 ***
+## landraceBota Arkiya -0.574541 0.562963 0.157952 -3.637 0.000275 ***
+## landraceDirbo 0.008100 1.008133 0.135945 0.060 0.952486
+## landraceGefetano -0.138165 0.870955 0.141245 -0.978 0.327978
+## landraceGenticha 0.151447 1.163517 0.133143 1.137 0.255341
+## landraceGezewet -0.893940 0.409041 0.174314 -5.128 2.92e-07 ***
+## landraceGinbuwe 0.001661 1.001663 0.137084 0.012 0.990331
+## landraceGodere 0.406671 1.501810 0.127084 3.200 0.001374 **
+## landraceHae'la -1.301895 0.272016 0.201976 -6.446 1.15e-10 ***
+## landraceKuro -0.818899 0.440917 0.170735 -4.796 1.62e-06 ***
+## landraceLemat -1.470354 0.229844 0.212774 -6.910 4.83e-12 ***
+## landraceMazia -1.221370 0.294826 0.195242 -6.256 3.96e-10 ***
+## landraceNechuwe 0.100755 1.106006 0.134460 0.749 0.453656
+## landraceUnjame -0.033591 0.966967 0.137726 -0.244 0.807313
+## landraceWachiso 0.066963 1.069255 0.134996 0.496 0.619870
+## landraceYesha 0.283377 1.327606 0.129522 2.188 0.028679 *
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## exp(coef) exp(-coef) lower .95 upper .95
+## landraceAdo 0.6275 1.5937 0.4637 0.8491
+## landraceAlagena 0.4357 2.2952 0.3111 0.6102
+## landraceArkiya 1.8635 0.5366 1.4678 2.3657
+## landraceBededet 0.4038 2.4767 0.2869 0.5682
+## landraceBota Arkiya 0.5630 1.7763 0.4131 0.7672
+## landraceDirbo 1.0081 0.9919 0.7723 1.3159
+## landraceGefetano 0.8710 1.1482 0.6603 1.1487
+## landraceGenticha 1.1635 0.8595 0.8963 1.5104
+## landraceGezewet 0.4090 2.4447 0.2907 0.5756
+## landraceGinbuwe 1.0017 0.9983 0.7657 1.3104
+## landraceGodere 1.5018 0.6659 1.1707 1.9266
+## landraceHae'la 0.2720 3.6763 0.1831 0.4041
+## landraceKuro 0.4409 2.2680 0.3155 0.6162
+## landraceLemat 0.2298 4.3508 0.1515 0.3488
+## landraceMazia 0.2948 3.3918 0.2011 0.4323
+## landraceNechuwe 1.1060 0.9042 0.8498 1.4395
+## landraceUnjame 0.9670 1.0342 0.7382 1.2666
+## landraceWachiso 1.0693 0.9352 0.8207 1.3931
+## landraceYesha 1.3276 0.7532 1.0300 1.7113
+##
+## Concordance= 0.681 (se = 0.008 )
+## Likelihood ratio test= 524 on 19 df, p=<2e-16
+## Wald test = 455 on 19 df, p=<2e-16
+## Score (logrank) test = 519.7 on 19 df, p=<2e-16
+###Check porportionality of hazards
+cox.zph(srv_coxph)
+## chisq df p
+## landrace 46.2 19 0.00046
+## GLOBAL 46.2 19 0.00046
+From the above output, one can see that the p-levels several +landraces such as landrace Arkia, Bededet, Dirbo, Lemat, Unjame and +Wachiso show statistical significance (p<0.05) including the global +statistics. This suggest a clear violation of proportionality of hazard +and one cannot assume the proportional hazards for further analysis. +Alternative analysis would be uisng Cox-mixed model effects similar to +linear mixed-effects model, but with a different type of response +variable.
+library("coxme")
+cxm=coxme(Surv(End,Death)~landrace+(1|block),data = surv_from_di)
+anova(cxm)
+## Analysis of Deviance Table
+## Cox model: response is Surv(End, Death)
+## Terms added sequentially (first to last)
+##
+## loglik Chisq Df Pr(>|Chi|)
+## NULL -13200
+## landrace -12918 564.25 19 < 2.2e-16 ***
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+summary(cxm)
+## Cox mixed-effects model fit by maximum likelihood
+## Data: surv_from_di
+## events, n = 1786, 6000
+## Iterations= 10 63
+## NULL Integrated Fitted
+## Log-likelihood -13200.08 -12917.96 -12913.82
+##
+## Chisq df p AIC BIC
+## Integrated loglik 564.25 20.00 0 524.25 414.49
+## Penalized loglik 572.53 20.92 0 530.70 415.91
+##
+## Model: Surv(End, Death) ~ landrace + (1 | block)
+## Fixed coefficients
+## coef exp(coef) se(coef) z p
+## landraceAdo -0.467674676 0.6264573 0.1543320 -3.03 2.4e-03
+## landraceAlagena -0.833558655 0.4345003 0.1718634 -4.85 1.2e-06
+## landraceArkiya 0.623605228 1.8656420 0.1217347 5.12 3.0e-07
+## landraceBededet -0.908378038 0.4031776 0.1743350 -5.21 1.9e-07
+## landraceBota Arkiya -0.573039654 0.5638090 0.1579078 -3.63 2.8e-04
+## landraceDirbo 0.012586801 1.0126663 0.1358882 0.09 9.3e-01
+## landraceGefetano -0.137556227 0.8714853 0.1412428 -0.97 3.3e-01
+## landraceGenticha 0.151899164 1.1640429 0.1331242 1.14 2.5e-01
+## landraceGezewet -0.892740104 0.4095321 0.1742951 -5.12 3.0e-07
+## landraceGinbuwe 0.000843961 1.0008443 0.1370748 0.01 1.0e+00
+## landraceGodere 0.408629546 1.5047542 0.1270772 3.22 1.3e-03
+## landraceHae'la -1.302488708 0.2718544 0.2019726 -6.45 1.1e-10
+## landraceKuro -0.817656256 0.4414651 0.1707285 -4.79 1.7e-06
+## landraceLemat -1.470869099 0.2297257 0.2127743 -6.91 4.8e-12
+## landraceMazia -1.223905694 0.2940793 0.1952404 -6.27 3.6e-10
+## landraceNechuwe 0.097300329 1.1021913 0.1344425 0.72 4.7e-01
+## landraceUnjame -0.035060086 0.9655474 0.1377204 -0.25 8.0e-01
+## landraceWachiso 0.067332369 1.0696509 0.1349924 0.50 6.2e-01
+## landraceYesha 0.285742581 1.3307499 0.1295090 2.21 2.7e-02
+##
+## Random effects
+## Group Variable Std Dev Variance
+## block Intercept 0.19854484 0.03942005
+cxme_ph=cld(glht(cxm,linfct=mcp(landrace="Tukey")))
+cxme_ph.df=data.frame(cbind(levels(surv_from_di$landrace),cxme_ph$mcletters$Letters))
+colnames(cxme_ph.df)=c("Landrace","Letters")
+rownames(cxme_ph.df)=NULL
+cxme_ph.df
+## Landrace Letters
+## 1 Abatemerza ab
+## 2 Ado ac
+## 3 Alagena cd
+## 4 Arkiya e
+## 5 Bededet cd
+## 6 Bota Arkiya cfg
+## 7 Dirbo ab
+## 8 Gefetano afh
+## 9 Genticha bh
+## 10 Gezewet cd
+## 11 Ginbuwe ab
+## 12 Godere be
+## 13 Hae'la dg
+## 14 Kuro cd
+## 15 Lemat d
+## 16 Mazia dg
+## 17 Nechuwe bh
+## 18 Unjame abf
+## 19 Wachiso bh
+## 20 Yesha beh
+write.csv(cxme_ph.df,file = "cxme_ph.df.csv")
+Comparing the Kaplan-Meier survival estimates can be done in +different ways. Here non-parametric log-rank testing and parametric +survival regression will be used.
+The below produces all pairwise comparisons of the Kaplan Meier +estimate of survival using a logrank test.
+###Make a table of pairwise chisq pvalues, for the logrank test.
+#Based on a post to the R Mailing list by T. Therneau
+pw_logrank_test_type <- 0 ###0 for logrank, 1 for peto and peto
+pw_logrank <- matrix(0., nlevels(surv_from_di$landrace),nlevels(surv_from_di$landrace))
+for (i in 1:nlevels(surv_from_di$landrace)) {
+ for (j in (1:nlevels(surv_from_di$landrace))[-i]) {
+ datasubset <- droplevels(subset( surv_from_di,
+ surv_from_di$landrace %in% (unique(surv_from_di$landrace))[c(i,j)]))
+ temp <- survdiff(Surv(End, Death)~landrace+strata(block), data=datasubset, rho=pw_logrank_test_type)
+
+ pw_logrank[i,j] <- pchisq(temp$chisq, df=1, lower=F) ##df will always be 1 because this is pairwise
+ }
+}
+colnames(pw_logrank) <- levels(surv_from_di$landrace)
+rownames(pw_logrank) <- levels(surv_from_di$landrace)
+#Make dummy adjustment table
+pw_logrank_adjBon <- pw_logrank
+#Fill adjusted pvalue table.
+for (i in 1:ncol(pw_logrank)) {
+ pw_logrank_adjBon[,i] <- cbind(p.adjust(pw_logrank[,i], method="bonferroni"))
+}
+stargazer::stargazer(pw_logrank_adjBon,type="text",title="Pairwise Chisq p-values (Bonferroni adjusted)")
+##
+## Pairwise Chisq p-values (Bonferroni adjusted)
+## ====================================================================================================================================================================================
+## Abatemerza Ado Alagena Arkiya Bededet Bota Arkiya Dirbo Gefetano Genticha Gezewet Ginbuwe Godere Hae'la Kuro Lemat Mazia Nechuwe Unjame Wachiso Yesha
+## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+## Abatemerza 0 0 0 0 0 1 0 0 0 0.155 1 0 0.117 0.0001 0.278 0 0 0 0.001 0.324
+## Ado 0 0 1 1 1 0 1 0.262 1 0.00000 0 1 0.00000 0.001 0.00000 1 1 0.00003 0.0003 0.00000
+## Alagena 0 1 0 1 1 0 1 0.021 0.380 0.00001 0 1 0.00002 0.040 0.00001 1 1 0.00000 0.010 0.00001
+## Arkiya 0 1 1 0 1 0 1 0.103 1 0.00000 0 1 0.00000 0.012 0.00000 1 1 0.00001 0.002 0.00000
+## Bededet 0 1 1 1 0 0 1 0.187 1 0.00000 0 1 0.00000 0.003 0.00000 1 1 0.00002 0.001 0.00000
+## Bota Arkiya 1 0 0 0 0 0 0 0 0 0.544 1 0 0.407 0.0003 1 0 0 0 0.005 1
+## Dirbo 0 1 1 1 1 0 0 0.001 0.033 0.0004 0.00000 1 0.001 0.440 0.0002 0.865 1 0 0.132 0.0002
+## Gefetano 0 0.262 0.021 0.103 0.187 0 0.001 0 1 0 0 0.010 0 0 0 0.705 0.032 0.276 0 0
+## Genticha 0 1 0.380 1 1 0 0.033 1 0 0 0 0.244 0 0.00000 0 1 0.527 0.018 0.00000 0
+## Gezewet 0.155 0.00000 0.00001 0.00000 0.00000 0.544 0.0004 0 0 0 1 0.00001 1 0.760 1 0 0.00000 0 1 1
+## Ginbuwe 1 0 0 0 0 1 0.00000 0 0 1 0 0 1 0.003 1 0 0 0 0.024 1
+## Godere 0 1 1 1 1 0 1 0.010 0.244 0.00001 0 0 0.00001 0.033 0.00000 1 1 0.00000 0.009 0.00000
+## Hae'la 0.117 0.00000 0.00002 0.00000 0.00000 0.407 0.001 0 0 1 1 0.00001 0 1 1 0.00000 0.00001 0 1 1
+## Kuro 0.0001 0.001 0.040 0.012 0.003 0.0003 0.440 0 0.00000 0.760 0.003 0.033 1 0 0.428 0.0003 0.021 0 1 0.502
+## Lemat 0.278 0.00000 0.00001 0.00000 0.00000 1 0.0002 0 0 1 1 0.00000 1 0.428 0 0 0.00000 0 1 1
+## Mazia 0 1 1 1 1 0 0.865 0.705 1 0 0 1 0.00000 0.0003 0 0 1 0.0002 0.00004 0
+## Nechuwe 0 1 1 1 1 0 1 0.032 0.527 0.00000 0 1 0.00001 0.021 0.00000 1 0 0.00000 0.005 0.00000
+## Unjame 0 0.00003 0.00000 0.00001 0.00002 0 0 0.276 0.018 0 0 0.00000 0 0 0 0.0002 0.00000 0 0 0
+## Wachiso 0.001 0.0003 0.010 0.002 0.001 0.005 0.132 0 0.00000 1 0.024 0.009 1 1 1 0.00004 0.005 0 0 1
+## Yesha 0.324 0.00000 0.00001 0.00000 0.00000 1 0.0002 0 0 1 1 0.00000 1 0.502 1 0 0.00000 0 1 0
+## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+Hazards are found to be non-proportional, as is observed here, it +might be a good idea to perform survival regression analysis, or +pairwise log-rank testing (see below) instead of hazard ratio tests.
+Generally a survival regression does not assume proportionality of +hazards. A survival regression is fit to a distribution, defined by +dist=““.
+library(rms)
+####Survival Regression###
+###This is done using functions from rms.
+###psm is a survival::survreg wrapper. but the output is more handle-able for some other functions.
+ddist <- datadist(surv_from_di)
+options(datadist="ddist")
+psurv_gaus <- psm(Surv(End, Death) ~landrace, data=surv_from_di, dist="gaussian")
+psurv_logistic <- psm(Surv(End, Death) ~landrace, data=surv_from_di, dist="logistic")
+psurv_lnorm <- psm(Surv(End, Death) ~landrace, data=surv_from_di, dist="lognormal")
+psurv_wei <- psm(Surv(End, Death) ~landrace, data=surv_from_di, dist="weibull")
+###Same with survreg()
+s_reg_gaus <- survreg(Surv(End, Death) ~landrace, data=surv_from_di, dist="gaussian")
+s_reg_logistic <- survreg(Surv(End, Death) ~landrace, data=surv_from_di, dist="logistic")
+s_reg_lnorm <- survreg(Surv(End, Death) ~landrace, data=surv_from_di, dist="lognormal")
+s_reg_wei <- survreg(Surv(End, Death) ~landrace, data=surv_from_di, dist="weibull")
+aic.scores.psurv <- rbind(
+ extractAIC(psurv_wei),
+ extractAIC(psurv_gaus),
+ extractAIC(psurv_logistic),
+ extractAIC(psurv_lnorm))
+###Make useable AIC table
+rownames(aic.scores.psurv) <- c("Weibull", "Gaussian", "Logist", "Lognorm")
+colnames(aic.scores.psurv) <- c("df", "AIC")
+stargazer::stargazer(aic.scores.psurv,type="text",title="AIC Scores")
+##
+## AIC Scores
+## ======================
+## df AIC
+## ----------------------
+## Weibull 21 19,216.010
+## Gaussian 21 19,176.710
+## Logist 21 19,220.760
+## Lognorm 21 19,646.700
+## ----------------------
+###Call table
+From the table above, the model with the lowest AIC score can be +chosen. For this analysis, Gaussian model will be the choice for +analysis. Then, one can inspect this model for significant +differences.
+It is possible, but not really easy, to plot the curves generated +using parametric survival regression. These curves are the result of +fitting the data to a distribution in the earlier section. Doing this in +a manner that is compatible with ggplot2 requires a nice dataframe. +Below is code to generate plots of the KM estimates per block and the +generated fit. This is performed for the four distributions above, and +can be adapted to different distributions if necessary.
+###Step 1, extract the coefficients. These are relative to landrace1 because landrace is treatment contrasted.
+
+for (i in 1:nlevels(surv_di_fit.df$landrace)) { #For loop through landraces
+ if(i==1) { #landrace1 is relative to itself, so no change
+ coef_wei <- list()
+ coef_logistic <- list()
+ coef_gaus <- list()
+ coef_lnorm <- list()
+ coef_wei[i] <- coef(s_reg_wei)[i]
+ coef_logistic[i] <- coef(s_reg_logistic)[i]
+ coef_gaus[i] <- coef(s_reg_gaus)[i]
+ coef_lnorm[i] <- coef(s_reg_lnorm)[i]
+ } else { ###Other landraces are relative to 1
+ coef_wei[i] <- coef(s_reg_wei)[1] + coef(s_reg_wei)[i]
+ coef_logistic[i] <- coef(s_reg_logistic)[1] + coef(s_reg_logistic)[i]
+ coef_gaus[i] <- coef(s_reg_gaus)[1] + coef(s_reg_gaus)[i]
+ coef_lnorm[i] <- coef(s_reg_lnorm)[1] + coef(s_reg_lnorm)[i]
+ }
+}
+##Step 2
+####Store the coefficients and the scale in a new data frame, of parameters
+### Keep in mind that survreg.distributions$weibull is different from rweibull, hence the difference in names.
+sregparams <- data.frame(
+ landrace = rep(levels(surv_from_di$landrace),4 ), #Fill with landraces
+ scale.wei = exp(unlist(coef_wei)), #weibull fit scale parameters
+ scale.logistic = rep(s_reg_logistic$scale, nlevels(surv_from_di$landrace)), #fill with logis scales
+ scale.gaus = rep(s_reg_gaus$scale, nlevels(surv_from_di$landrace)), #fill with gaus scales
+ scale.lnorm = rep(s_reg_lnorm$scale, nlevels(surv_from_di$landrace)), #fill with lnorm scale
+ shape.wei = rep(1/s_reg_wei$scale, nlevels(surv_from_di$landrace)), #shape for weibull
+ shape.logistic = unlist(coef_logistic), #shape for logistic
+ shape.gaus = unlist(coef_gaus), #shape for gaus
+ shape.lnorm = unlist(coef_lnorm) #shape for lnorm
+ )
+##Step 3
+###Calculate the "daily" value of each curve
+for (i in 1:nlevels(surv_di_fit.df$landrace)){
+ if(i==1) {
+ wei <- list()
+ logis <- list()
+ gaus <- list()
+ lnorm <- list()
+ }
+ x <- levels(surv_di_fit.df$landrace)[i]
+ n <- c(1:max(surv_from_di$End))
+ data <- filter(sregparams, landrace==x)
+ time <- n
+ wei <- cbind(wei, pweibull(
+ q=n,
+ scale=data$scale.wei,
+ shape=data$shape.wei,
+ lower.tail=FALSE))
+ logis <- cbind(logis,plogis(
+ q=n,
+ scale=data$scale.logistic,
+ location=data$shape.logistic,
+ lower.tail=FALSE ))
+ gaus <- cbind(gaus,pnorm(
+ q=n,
+ sd=data$scale.gaus,
+ mean=data$shape.gaus,
+ lower.tail = F))
+ lnorm <- cbind(lnorm, plnorm(
+ q=n,
+ sd=data$scale.lnorm,
+ mean=data$shape.lnorm,
+ lower.tail=F))
+}
+
+##Step 4
+###Put all the curves into a data.frame that contains information on "time" and also "landrace", for compatibility with other data.frames
+sreg_curves <- data.frame(
+ wei.sreg = cbind(unlist(wei)),
+ logis.sreg = cbind(unlist(logis)),
+ gaus.sreg = cbind(unlist(gaus)),
+ lnorm.sreg = cbind(unlist(lnorm)),
+ landrace = rep(unlist(levels(surv_di_fit.df$landrace)),each=max(surv_from_di$End)),
+ time = rep(c(1:max(surv_from_di$End)), nlevels(surv_di_fit.df$landrace))
+)
+##Step 5
+###Turn that data.frame into a long data.frame (not used here but could be handy.)
+sreg_long <- sreg_curves %>%
+ group_by(landrace) %>%
+ gather(key="distribution", value = "value", logis.sreg, wei.sreg, gaus.sreg, lnorm.sreg )
+sreg_long$distribution <- as.factor(sreg_long$distribution)
+##Levels: gaus.sreg lnorm.sreg logis.sreg wei.sreg
+levels(sreg_long$distribution) <- c("Gaussian","Lognormal","Loglogistic","Weibull")
+Now, these can be plotted and inspected visually..
+###Plot of KM+Gaussian
+ ggplot(surv_di_fit.df, aes(time, surv, colour = landrace)) +
+ geom_step(aes(linetype=block)) +
+ #geom_smooth(aes(linetype=block)) +
+ geom_line(data=sreg_curves,aes(y=gaus.sreg),color="black") +
+ facet_wrap(~landrace) +
+ ggtitle("Kaplan-Meier estimates and fit to\nGaussian distribution")+
+ theme(legend.position = "none")
+
+The following scripts was used to generate figures in the
+manuscript.
+## Fig 5: Plot of actual AUDPS per landrace and, mean and CI of
+AUDPS
###For the purpose of visualization calculate summaries across all replicates
+di_summary.noblock <- enset_xvm_long %>% group_by(plant, landrace, dpi) %>% summarise(mean(di),sd(di),sd(di)/sqrt(length(di)))
+colnames(di_summary.noblock) <- c("plant", "landrace", "dpi", "mean", "sd", "se")
+
+###Generate mean and standard deviation of disease index using the pipe function ( %>% ).
+di_summary <- enset_xvm_long %>% group_by(plant, landrace, block, dpi) %>% summarise(mean(di),sd(di),sd(di)/sqrt(length(di)))
+colnames(di_summary) <- c("plant", "landrace", "block", "dpi", "mean", "sd", "se") ###Assign correct columnnames
+
+####Fig7a: Plot the actual areas per landrace
+Fig5a <-
+ ggplot(filter(di_summary),aes(x=dpi,y=mean)) + ###Use data di_summary
+ geom_area(aes(color=block,fill=block))+
+ scale_fill_manual(values = c("gray1","#999999","gray50"))+
+#geom_line(aes(linetype=block)) +#show line by each block
+#aes(x=dpi,y=mean,color=landrace) + ###Color by landrace, specify x and y.
+#geom_area(aes(fill=landrace),position="identity",alpha=0.15) + ###Area plot, colored by landrace
+facet_wrap(~landrace) + ###One plot per landrace
+ labs(x = "Days after infection", y = "Mean disease index") + #Labels
+ ggtitle("5A) Area under disease progression of landraces across blocks")+
+ theme_bw()+
+ theme(strip.text.x = element_text(size = 12,face = "bold"),
+ axis.text.x = element_text(size = 11,face = "bold"),
+ axis.title.x = element_text(size = 12,face = "bold"),
+ axis.title.y = element_text(size = 12,face = "bold"),
+ axis.text.y = element_text(size = 11,face = "bold"),
+ legend.position = "none")
+
+####Fig5b: Plot mean and confidence interval AUDPS per landrace###
+
+color_fill_gray <- c("#999999","#999999","#999999","#999999","#999999", "#999999","#999999","#999999","#999999","#999999", "#999999","#999999","#999999","#999999","#999999",
+ "#999999","#999999","#999999","#999999","#999999") # fill grey color for all landraces
+
+####Generate plot of meanCI of AUDPS with significance letters and raw data as jittered points
+cld_audps=cld_audps[order(cld_audps$audps),]# reorder by mean value to cheat ggplot
+cld_audps$landrace =factor(cld_audps$landrace,levels = cld_audps$landrace)#put factors in level
+
+Fig5b<-
+ ggplot(aes(x=landrace, y=audps, fill=landrace),data=audps_data__cld) + ###Plot the audps_data
+ scale_shape_manual(values=c(9, 19, 22))+
+ geom_crossbar(data = cld_audps, aes(x = landrace, y = audps, ymin = lower, ymax = upper,fill=landrace), alpha=0.3,fill="gray") +
+ scale_fill_manual(values = color_fill_gray)+
+ geom_jitter(aes(shape=block),data = audps_data__cld) + ###with jitter overplotted, symbol shape defined by block
+ geom_text(aes(x=landrace, y=-20, label=aud_let),color="black", size=3.5, data=audps_data__cld) + ###Get the letters from auc_cld
+ #and write those to position y=-3
+ labs(y="AUDPS") + #Y-Axis label
+ ggtitle("5B) Mean and raw audps values per landrace and grouping letters") +#Title
+ theme_bw()+
+ theme(axis.title.y = element_text(face="bold", size=12),
+ axis.title.x = element_blank(),
+ axis.text.y = element_text(size = 11,face = "bold"),
+ axis.text.x = element_text(angle=90, vjust=0.5, size=12,face = "bold"),
+ legend.position="none")
+
+library(grid)
+pushViewport(viewport(layout= grid.layout(nrow=1, ncol=2)))
+print(Fig5a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
+print(Fig5b, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
+### Fig5 done ###
+# This figure might not appear in below and can be view running in R console on non-Rmakrdown Rscript file.
+##Fig 6 - Survival Regression
+# Script from Schandry (2017) used with modfication
+survfit_to_df <- function(x) {
+ strata_dummy <-NULL
+ for(i in 1:length(x$strata)){
+ # add vector for one strata according to number of rows of strata
+ strata_dummy <- c(strata_dummy, rep(names(x$strata)[i], x$strata[i]))
+ }
+ #make x.df from x..
+ x.df <- data.frame(
+ time = x$time,
+ n.risk = x$n.risk,
+ n.event = x$n.event,
+ surv = x$surv,
+ strata = strata_dummy,
+ upper = x$upper,
+ lower = x$lower
+ )
+ zeros <- data.frame(time = 0, surv = 1, strata = names((x$strata)),
+ upper = 1, lower = 1)
+ x.df <- plyr::rbind.fill(zeros, x.df)
+ rm(strata_dummy)
+ rm(zeros)
+ return(x.df)
+}
+
+###Generate intermediate survival tables##
+surv_table1 <- data.frame(subject=enset_xvm_long$subject,
+ landrace=enset_xvm_long$landrace,
+ plant=enset_xvm_long$plant,
+ block=enset_xvm_long$block)
+###Fill survival table based on the enset_xvm_long table. This generates warnings. These can be ignored and come from the min()
+cutoff <- c(2.23)
+for (i in 1:max(enset_xvm_long$subject)) { #Go by subject
+ dummy <- enset_xvm_long[enset_xvm_long$subject==i,] #generate dummy for the subject
+ if (is.infinite(min(dummy$dpi[which(dummy$di >= cutoff)]))) { #If none of the di is greater than the cutoff (this is where warnings are generated, min on an empty object returns infinite!)
+ surv_table1$End[i] <- max(dummy$dpi) #Generate a observation, censoring at the maximum dpi recorded
+ surv_table1$Death[i] <- 0 #Still alive, because it did not pass the cutoff
+ } else { #If more than zero di are greater than the cutoff
+ surv_table1$End[i] <- min(dummy$dpi[which(dummy$di >= cutoff)]) #Use the lowest dpi where condition is met
+ surv_table1$Death[i] <- 1 #record as dead
+ }
+}
+###Make survfits with the four survival tables..
+surv_di_fit1 <- with(surv_table1, survfit(Surv(End, Death) ~landrace +strata(block), data=surv_from_di))
+
+###Make survfits into surv_df
+surv_di_fit1.df <- survfit_to_df(surv_di_fit1)
+
+###However the strata field still needs to be split manually....
+surv_di_fit1.df$block <- as.factor( str_split_fixed(
+ matrix( nrow=length(surv_di_fit1.df$strata),ncol=2, unlist(strsplit(as.character(surv_di_fit1.df$strata),", ")), byrow=T )[,2],"=",2)[,2])
+surv_di_fit1.df$landrace <- as.factor( str_split_fixed(
+ matrix( nrow=length(surv_di_fit1.df$strata),ncol=2, unlist(strsplit(as.character(surv_di_fit1.df$strata),", ")), byrow=T )[,1],"=",2)[,2])
+###End of intermediate table generation##
+
+###Landraces that showed non-significant coxme hazard ration were removed.
+
+surv_landraces0=c("Ado", "Alagena", "Arkia", "Bededet", "Bota Arkia",
+ "Gezewet", "Godere ", "Hae'la", "Kuro", "Lemat", "Mazia","Yesha")
+
+
+kmdata <- filter(surv_di_fit1.df, landrace %in% surv_landraces0)
+curvdata <- filter(sreg_curves, landrace %in% surv_landraces0)
+#Labels
+
+###Plot of KM+Gaussian
+Fig6a <- ggplot(kmdata, aes(time, surv, colour = landrace)) +
+ geom_step(aes(linetype=block)) +
+ geom_line(data=curvdata,aes(y=gaus.sreg),color="black") +
+ facet_wrap(~landrace) + theme(legend.position="none")+
+ ggtitle("Fig 6A)")+
+ theme_bw()+
+ labs(x = "Days post infection", y = "Fraction of plants alive") +
+ theme(legend.position="none")+
+ theme(axis.title.x = element_text(size = 12,face = "bold"),
+ axis.title.y = element_text(size = 12,face = "bold"),
+ axis.text.x = element_text(size = 8,face = "bold"),
+ axis.text.y = element_text(size = 8,face = "bold"),
+ strip.text = element_text(size = 12))
+
+
+cb_palette <- c("yellow", "darkorchid", "red", "darkgrey", "#CDCD00",
+ "#CDCD00", "peachpuff", "#228B22","dodgerblue4","#54FF9F",
+ "paleturquoise2","indianred")
+
+Fig6b <- ggplot(data=filter(surv_di_fit1.df), aes(x=time, y=surv,color=landrace))+
+ #geom_step(aes(linetype=block),alpha=1) +
+ geom_line(data=curvdata,
+ aes(y=lnorm.sreg,color=landrace), size=1, alpha=1) +
+ scale_color_manual(values=cb_palette)+
+ geom_point(data=curvdata,aes(y=lnorm.sreg,shape=landrace))+
+ scale_shape_manual(values = c(0:5,8,9,11,25,15,16))+
+ labs(x="Days post infection",
+ y="Estimated survival",
+ title=("Fig 6B)")) +
+ coord_cartesian(xlim=c(54,155)) +
+ scale_x_continuous(breaks=seq(50, 155, 15))+
+ #labs(color="Legend")
+ theme_bw()+
+ theme(legend.position=c(0.01,1),
+ legend.justification=c(0,1.6),
+ legend.text = element_text(size = 12),
+ legend.title = element_blank(),
+ axis.title.x = element_text(size = 12,face = "bold"),
+ axis.title.y = element_text(size = 12,face = "bold"),
+ axis.text.x = element_text(size = 8,face = "bold"),
+ axis.text.y = element_text(size = 8,face = "bold"))
+
+
+pushViewport(viewport(layout= grid.layout(nrow=1, ncol=2)))
+print(Fig6a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
+print(Fig6b, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
+# This figure might not appear in below and can be view running in R console on non-Rmakrdown Rscript file.
+####clear device
+sessionInfo()
+## R version 4.3.1 (2023-06-16 ucrt)
+## Platform: x86_64-w64-mingw32/x64 (64-bit)
+## Running under: Windows 10 x64 (build 19045)
+##
+## Matrix products: default
+##
+##
+## locale:
+## [1] LC_COLLATE=English_United Kingdom.utf8
+## [2] LC_CTYPE=English_United Kingdom.utf8
+## [3] LC_MONETARY=English_United Kingdom.utf8
+## [4] LC_NUMERIC=C
+## [5] LC_TIME=English_United Kingdom.utf8
+##
+## time zone: Europe/London
+## tzcode source: internal
+##
+## attached base packages:
+## [1] stats graphics grDevices utils datasets methods base
+##
+## other attached packages:
+## [1] rms_6.7-1 Hmisc_5.1-0 coxme_2.2-18.1 bdsmatrix_1.3-6
+## [5] stringr_1.5.0 lsmeans_2.30-0 emmeans_1.9.0 multcomp_1.4-25
+## [9] TH.data_1.1-2 MASS_7.3-60 survival_3.5-5 mvtnorm_1.2-4
+## [13] lmerTest_3.1-3 lme4_1.1-35.1 Matrix_1.6-4 magrittr_2.0.3
+## [17] agricolae_1.3-7 MESS_0.5.12 ggplot2_3.4.2 dplyr_1.1.2
+## [21] tibble_3.2.1 broom_1.0.5 tidyr_1.3.0
+##
+## loaded via a namespace (and not attached):
+## [1] tidyselect_1.2.0 farver_2.1.1 fastmap_1.1.1
+## [4] labelled_2.12.0 digest_0.6.33 rpart_4.1.19
+## [7] estimability_1.4.1 lifecycle_1.0.3 cluster_2.1.4
+## [10] compiler_4.3.1 rlang_1.1.1 sass_0.4.7
+## [13] tools_4.3.1 utf8_1.2.3 yaml_2.3.7
+## [16] data.table_1.14.8 geeM_0.10.1 knitr_1.43
+## [19] htmlwidgets_1.6.2 labeling_0.4.2 ggformula_0.12.0
+## [22] plyr_1.8.8 polspline_1.1.24 withr_2.5.0
+## [25] foreign_0.8-84 purrr_1.0.1 numDeriv_2016.8-1.1
+## [28] geepack_1.3.9 nnet_7.3-19 grid_4.3.1
+## [31] mosaicCore_0.9.4.0 fansi_1.0.4 AlgDesign_1.2.1
+## [34] xtable_1.8-4 colorspace_2.1-0 scales_1.2.1
+## [37] ggridges_0.5.5 cli_3.6.1 rmarkdown_2.23
+## [40] generics_0.1.3 rstudioapi_0.15.0 minqa_1.2.6
+## [43] cachem_1.0.8 splines_4.3.1 base64enc_0.1-3
+## [46] vctrs_0.6.3 boot_1.3-28.1 sandwich_3.1-0
+## [49] SparseM_1.81 jsonlite_1.8.7 hms_1.1.3
+## [52] Formula_1.2-5 htmlTable_2.4.1 stargazer_5.2.3
+## [55] clipr_0.8.0 jquerylib_0.1.4 glue_1.6.2
+## [58] nloptr_2.0.3 codetools_0.2-19 stringi_1.7.12
+## [61] gtable_0.3.3 munsell_0.5.0 pillar_1.9.0
+## [64] quantreg_5.97 htmltools_0.5.5 R6_2.5.1
+## [67] evaluate_0.21 lattice_0.21-8 haven_2.5.3
+## [70] highr_0.10 backports_1.4.1 bslib_0.5.1
+## [73] MatrixModels_0.5-3 Rcpp_1.0.11 gridExtra_2.3
+## [76] nlme_3.1-162 checkmate_2.2.0 xfun_0.40
+## [79] zoo_1.8-12 forcats_1.0.0 pkgconfig_2.0.3
+dev.off()
+## null device
+## 1
+