Fig04.Rmd

---
title: "Data analysis"
author: "Aoyue Bi, Daxing Xu"
date: "`r Sys.Date()`"
output: 
  html_document: # html_notebook, html_document
    toc: true
    toc_float:
      toc_collapsed: true
    toc_depth: 3
    number_sections: true
    theme: simplex # default, united, cerulean, journal, flatly, darkly, readable, spacelab, united, cosmo, lumen, paper, sandstone, simplex, and yeti
    highlight: haddock # default, tango, pygments, kate, monochrome, espresso, zenburn, haddock, and breezedark
    # bibliography: bibliography.json
---

# Fig4 Expansion Load

```{r setup, echo =F, message = T, warning = F}
library(tidyverse)
library(RColorBrewer)
# library(ggpubr)
# library(egg)
## eval 代码要不要运行
## echo 代码要不要输出
## include 图要不要输出
## warning 警告要不要输出
## message 默认Bin等信息要不要输出
```

#### === Fig4A ===
## Fig4A: DAF-plot by loadGroup deleterious
```{r}
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz")

### step2: DAF bin
df1 <- df %>% 
  rename(DAF=AAF) %>% 
  filter(DAF!=0) %>%
  filter(DAF!=1) %>%
  mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
  # mutate(MergedSub=if_else(Sub=="D","D","AB")) %>% 
  group_by(Group,GenomeType,LoadGroup,Sub) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n))

### step3: filter dataframe
df2 <- df1 %>% 
  filter(GenomeType=="AABBDD") %>% 
  filter(Sub == "D") %>%
  filter(LoadGroup=="Deleterious") %>% 
  droplevels()

### step4: set factor levels
levels_A <- c("LR_EA","OtherHexaploids","Cultivar","LR_EU","Tibetan semi-wild","LR_WA","LR_AM","LR_AF","LR_CSA","Spelt","Yunan wheat","Macha","Club wheat","Vavilovii","Indian dwarf wheat","Xinjiang wheat")

df2$Group <- factor(df2$Group,levels = levels_A)

### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
dfhash <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/021_WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GroupbyContinent,Subspecies_by22_TreeValidated,GenomeType) %>%
  mutate(GroupforExpansionLoad = if_else(Subspecies_by22_TreeValidated=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>%
  select(GenomeType,GroupforExpansionLoad) %>%
  filter(!is.na(GroupforExpansionLoad)) %>%
  distinct(.,GroupforExpansionLoad,.keep_all = T)

### 每个亚群的颜色信息
dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>% 
  filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>% 
  left_join(.,dfhash,by=c("Group"="GroupforExpansionLoad")) %>% 
  ### 过滤数据，使之和输入的数据框保持一致
  filter(GenomeType =="AABBDD") %>% 
  ### 这里是个重点，我想自定义顺序！
  mutate(Label=factor(Label,levels = levels_A)) %>% 
  arrange(as.factor(Label))

### 设置数据框和画图颜色的因子顺序，使都保持一致
df1$Group <- factor(df1$Group,levels = dfcolorDB$Label)
colB <- dfcolorDB$ColorCode


#### facet by sub
# colB <- c("#e69f00","#004680")
p <- df2 %>% 
  # filter(Sub %in% c("A","B")) %>%
  # filter(GenomeType=="AABBDD") %>% 
  # filter(LoadGroup=="Deleterious") %>% 
  # filter(Sub == "D") %>%
  # filter(bin %in% c("[0,0.1]","(0.1,0.2]")) %>% 
  ggplot(aes(x=bin,y=fre,group=Group,color=Group))+
  geom_line(alpha=0.9)+
  labs(y="Proportion of deleterious SNPs",x="Derived allele frequency")+
  scale_color_manual(values = colB)+
  # scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
  # scale_fill_manual(values = colB)+
  # scale_y_continuous(limits = c(0,1))+
  # scale_x_discrete(label=c("0.1","","0.3","","0.5","","0.7","","0.9",""))+
  # facet_grid(Sub~Group)+
  theme(strip.background = element_blank())+
  # theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
  theme(
    # plot.margin=unit(rep(1,4),'lines'),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.4),
    legend.key = element_blank(),
    legend.position = 'none',
    text = element_text(size = 6))

p
# ggsave(p,filename = "~/Documents/temp.png",height = 6,width = 8)

library("ggpubr")
ggp_legend <- get_legend(p)
ggsave(as_ggplot(ggp_legend),filename = "~/Documents/legend.pdf",height = 4,width = 1.5)

# ggsave(p,filename = "~/Documents/test.pdf",height = 3.2,width = 3.2) ### 刚好可以把所有图例展示出来
ggsave(p,filename = "~/Documents/test.pdf",height = 2.28,width = 2.28) ### 刚好可以把所有图例展示出来

```

#### === Fig4B ===
## ThetaW only hexaploid
```{r}
### 开始处理 pi 的结果
dfori <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad)) %>% 
  ### 添加过滤条件
  filter(GenomeType=="AABBDD")

dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>% 
  filter(tW_mean!=0) %>% 
  mutate(Sub=str_sub(RefChr,2,2)) %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2")) %>% 
  ### 添加过滤条件
  filter(GenomeType=="AABBDD",Sub=="D")

### 统计每个亚基因组内，按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Landrace","Cultivar","OtherHexaploids"))) %>%
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(tW_mean)) %>% 
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="D") %>% 
  select(GroupforExpansionLoad)
  # add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 30

p <- df %>% 
  ggplot(aes(x=GroupforExpansionLoad,y=tW_mean))+
  labs(y= expression(italic(theta)[italic(W)]), x = "")+ 
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  # geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  # geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  # geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  # geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=1-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=8-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  # geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    # aes(color=Sub),
    color="black",
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="red",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 7.5,linetype="dashed",color="gray")+
  # geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  # scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  # theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 7))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 2.12,width = 4.23)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 2,width = 3.5)

```


#### === Fig4CDE===
## World map
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Longitude,Latitude)

df <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>% 
  filter(GenomeType=="AABBDD") %>% 
  filter(Subgenome=="D") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  filter(Stand_LoadGroup == choiceA) %>% 
  select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) ### 802个个体 -> 543 个有经纬度信息

mp<-NULL #定义一个空的地图  
# 注意： borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()

colB <- c("#ffd702","#fc6e6e","#87cef9")
mp<-mpvoid +
  geom_point(data=df,aes(x=Longitude,y=Latitude
),shape=21,alpha=0.8,size=2,fill="orange")+
  theme(legend.position = 'none')
mp

ggsave(mp,filename = "figs/map_s1062_1.pdf", height = 2.76,width = 6.89)
```


## Bubble map
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Longitude,Latitude)

df <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>% 
  filter(GenomeType=="AABBDD") %>% 
  filter(Subgenome=="D") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  filter(Stand_LoadGroup == choiceA) %>% ###******* choice
  select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) %>%  ### 802个个体 -> 543 个有经纬度信息
  arrange(Stand_delCount)  #### 排序

mp<-NULL #定义一个空的地图  
# 注意： borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()

library(viridis)
mp<-mpvoid +
  geom_point(data = df,mapping = aes(x=Longitude,y=Latitude,
  size=Stand_delCount, color=Stand_delCount),
  alpha = 0.9)+
  scale_size_continuous(range=c(1,3))+ ## 0.5-3
  # scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))+
  # scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
  # scale_fill_gradientn(colours =rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
  scale_color_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))
  # scale_color_viridis(trans="log") +
  # scale_fill_viridis(trans="log")
  # theme(legend.position = 'none')
mp

ggsave(mp,filename = "figs/map_s1062_2.pdf", height = 2.76,width = 6.89)
```


## Spatstat world map
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Longitude,Latitude)

set.seed(123)
df1 <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>%
  filter(GenomeType=="AABBDD") %>%
  filter(Subgenome=="D") %>%
  filter(DelCountGroup == "TotalDerivedSNPCount") %>%
  filter(Stand_LoadGroup == choiceA) %>%
  select(-IfSelected,-Group_refobj) %>%
  select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>%
  filter(!is.na(Longitude)) %>%  ### 802个个体 -> 543 个有经纬度信息
  arrange(Stand_delCount)

df <- df1 %>%
  mutate(Longitude = Longitude + runif(nrow(df1),0.001,0.005)) ##通过添加随机数，避免重复
  
```

### plot
```{r}
# library(raster)
# # detach(package:raster,unload = T)
# library(spatstat)
# library(RColorBrewer)
# 
# ppp.df <- ppp(df$Longitude, df$Latitude, c(-180, 180), c(-60, 90))
# marks(ppp.df) <- df[, 2] ### load ratio 做为 marks 信息
# # plot(density(ppp.df)) ###画出点的聚集程度 测试
# ppp.df2 <- Smooth(ppp.df) ### 求每个点对应的load值的大小
# ppp.df2.m <- ppp.df2$v  ### 获取矩阵内的密度数据
# ###矩阵上下颠倒128 -> 1 ,依次减一
# ppp.df2.m1 <- ppp.df2.m[seq(128, 1, by = -1), ]
# 
# ### 创建 raster 类，并将密度的结果填充进去
# df.raster <- raster(nrows = 128, ncols = 128, xmn = -180, xmx = 180, ymn = -60, ymx = 90)
# values(df.raster) <- as.vector(t(as.matrix(ppp.df2.m1)))
# plot(df.raster)
# 
# world.shp <- shapefile("data/002_spatstat/spatstat/shapefile/TM_WORLD_BORDERS_SIMPL-0.3/TM_WORLD_BORDERS_SIMPL-0.3.shp")
# df.raster.final <- mask(df.raster, world.shp)
# par(mfrow = c(1,1))
# 
# # pdf(file = str_c("figs/Fig4/misc",choiceA,"_wholeWorld.pdf"),height = 3,width = 4)
# ### whole world
# png(filename = str_c("~/Documents/",choiceA,"_wholemap.png"),units = 'in',height = 4,width = 5.3,res = 300)
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.5,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(-150,180,60))
# axis(2,at=seq(-90,90,60))
# dev.off()
# 
# ### 亚欧大陆和非洲
# png(filename =str_c("~/Documents/",choiceA,"_Eurasia.png"),units = 'in',height = 4,width = 5.3,res = 300)
# # pdf( "~/Documents/test.pdf")
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-20,150),ylim=c(-35,62),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(0,150,30))
# axis(2,at=seq(-20,62,20))
# # axis(1,at=seq(0,150,30),labels = c("0","","","","",""))
# # axis(2,at=seq(0,62,20))
# dev.off()
# 
# ### 美洲和非洲
# png(filename = str_c("~/Documents/",choiceA,"_American.png"),units = 'in',height = 4,width = 5.3,res = 300)
# plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-150,60),ylim=c(-60,62),xaxt="n",yaxt="n")
# # points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
# axis(1,at=seq(-150,60,30))
# axis(2,at=seq(-60,62,30))
# dev.off()

```


#### === Fig4F===
## boxplot load
- 可以选择定义有害突变的方式 
unique(dfdel$Stand_LoadGroup)
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 
### ********************** Section1 *********************************###
### taxa 属性数据框
factorA <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata")

labelsA <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata")

dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA) 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
  left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>% 
  mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad)

### ********************** Section2 *********************************###
### del load 数据框
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  filter(Subgenome=="B") %>% 
  filter(DelCountGroup=="TotalDerivedSNPCount") %>% 
  filter(GenomeType=="AABBDD") %>% 
  select(-c(DelCountGroup,IfSelected,Group_refobj,GenomeType)) %>% 
  filter(!is.na(GroupforExpansionLoad))

### 统计每个亚基因组内，以亚洲为单位的均值，按照D亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
  summarise(mean = mean(Stand_delCount)) %>%
  arrange(.,Subgenome,Stand_LoadGroup,mean) %>% 
  ungroup() %>%  
  filter(Subgenome=="B",Stand_LoadGroup== choiceA) %>% 
  select(GroupforExpansionLoad)
### 因子排序,排序后计数
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数
### 千万注意数据框的分组情况，如sub VariantsGroup ，计算后核查确认！！！
dftaxaCount <- df %>% 
  group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
  count() %>% ungroup() %>%  
  distinct(.,GroupforExpansionLoad,.keep_all = T) %>%
  select(-Subgenome) %>% 
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>%
  select(-n)
```

### plot
```{r}
library(RColorBrewer)
p <- df %>% 
  filter(Stand_LoadGroup==choiceA) %>%
  ggplot(aes(x=GroupforExpansionLoad,y= Stand_delCount))+
  labs(y=str_c("Mutation load per accession\nby ",choiceA),x="")+
  geom_boxplot(aes(fill=GroupforExpansionLoad), outlier.color =NA,alpha=0.6)+ ### width=0.1
  geom_point(position = position_jitterdodge(0.6),alpha = 0.4,aes(color=Subgenome),size=0.2)+
  stat_summary(aes(group = Subgenome), fun=mean, geom="point", color="blue",position = position_dodge(1),size=0.7)+
  scale_color_manual(values =c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_fill_manual(values = colorRampPalette(rev(brewer.pal(n = 1, name = "RdBu")))(16))+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(strip.background = element_blank())+
  # theme(plot.margin=unit(rep(1.3,4),'lines'))+
  coord_flip()+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.4),
    text = element_text(size = 7),legend.position = 'none')
p
### 竖版
# size=7
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.2,width = 2.6)
### 横版
# size=14
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4.2,width = 2.6) 

```


#### === Fig4G===
## TajimaD
```{r message=FALSE, warning=FALSE}
df <- read_tsv("data/014_PopParameters/003_angsd/001_angsd_subspecies26_geneRegion_RefChr.txt.gz") %>% 
  mutate(Sub=str_sub(RefChr,2))

### taxa 属性数据框
factorA <- c("OtherHexaploids","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata", "LR_WA","LR_EA","LR_EU","LR_CSA","LR_AM","LR_AF")

labelsA <- c("OtherHexaploids","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata","LR_WA","LR_EA","LR_EU","LR_CSA","LR_AM","LR_AF")

dfhash <- tibble(Group=factorA,Subspecies22=labelsA) 

### 得到3列数据，GenomeType,GroupforExpansionLoad,Group
dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
  mutate(Group=if_else(Subspecies_by22_TreeValidated=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>% 
  left_join(.,dfhash,by="Group") %>% 
  mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>% 
  select(GenomeType,GroupforExpansionLoad,Group) %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) 

### ********************** Section2 *********************************###
### 
df <- df %>% left_join(.,dftaxaDB,by="Group")

```


### plot density_ridges TajimaD

```{r}

fun_getLoadOrder <- function(){
  ### ********************** Section1 *********************************###
### 求作图的顺序和load一致
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 
### taxa 属性数据框
factorA <- c("Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","strangulata")

labelsA <- c("Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Strangulata")

dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA) 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent) %>%
  left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>% 
  mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad)

### del load 数据框
df <- dfdel %>% left_join(.,dftaxaDB,by="Taxa") %>% 
  filter(Subgenome=="D") %>% 
  filter(DelCountGroup=="TotalDerivedSNPCount") %>% 
  filter(GenomeType=="AABBDD") %>% 
  select(-c(DelCountGroup,IfSelected,Group_refobj,GenomeType)) %>% 
  filter(!is.na(GroupforExpansionLoad))

### 统计每个亚基因组内，以亚洲为单位的均值，按照D亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  group_by(Subgenome,Stand_LoadGroup,GroupforExpansionLoad) %>%
  summarise(mean = mean(Stand_delCount)) %>%
  arrange(.,Subgenome,Stand_LoadGroup,mean) %>% 
  ungroup() %>%  
  filter(Subgenome=="D",Stand_LoadGroup== choiceA) %>% 
  select(GroupforExpansionLoad)

order <- dfV1$GroupforExpansionLoad
### 因子排序,排序后计数
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = order)

return(order)
}

labelA <- fun_getLoadOrder()

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = labelA)

```


```{r}
library(reshape2)
library(ggridges)

### ********************** Section2 *********************************###

 df1 <- df %>% 
  filter(Sub=="D") %>% 
  filter(GenomeType=="AABBDD") %>% 
  select(Chr,WinCenter,tW,tP,Tajima,nSites,GroupforExpansionLoad,RefChr,RefPos,Sub) %>% 
  # filter(tW>0) %>% 
  filter(!is.na(Tajima)) %>% 
  mutate(num=as.numeric(factor(GroupforExpansionLoad)))

p <-
 # ggplot(df1,aes(x=Tajima,y=GroupforExpansionLoad,fill = 0.5 - abs(0.5 - stat(ecdf))))+
  ggplot(df1,aes(x=Tajima,y=GroupforExpansionLoad,fill =  stat(ecdf)))+

  # stat_density_ridges(quantile_lines = TRUE, quantiles = 2
  #                      ,scale=4,alpha=0.6, color='gray'
  #                      )+
  stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE,scale=4,alpha=0.6, color='gray') +
  # geom_density_ridges_gradient(scale = 3,size=0.3) +
    scale_fill_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
  # scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  # scale_y_discrete(expand = c(0.01, 0)) +
  # scale_x_continuous(expand = c(0, 0),limits = c(-2,4),breaks = seq(-2,4,2))+
  coord_cartesian(xlim = c(-1.5,4))+
  # theme_ridges(grid = FALSE,center_axis_labels = TRUE)+ 
  # theme_pubr()+
  # labs(x= expression(paste("Tajima's ",italic(D))), y = "Group number")+
  labs(x= expression(paste("Tajima's ",italic(D))), y = "")+
  geom_vline(xintercept = 0, linetype="dotted", color = "blue")+
  theme(strip.background = element_blank())+
  theme(panel.grid= element_blank(), 
        panel.background = element_blank(),
        axis.line = element_line(size=0.4, colour = "black"),
        text = element_text(size = 14.5),legend.position = 'none')

p

### AI size=7
ggsave(p,filename = "~/Documents/test.pdf",height = 3.9,width = 1.96)

### PPT size=14.5
ggsave(p,filename = "~/Documents/test.pdf",height = 6,width = 6)

```


#### =======================
#### ==== Extended Data ====
#### =======================
#### ====== 
## ****Map by subspecies
- 这里只画有经纬度信息的
```{r}
### ********************** Section1 *********************************###
### taxa 属性数据框
factorA <- c("dicoccoides","dicoccum","durum","turanicum","polonicum","turgidum","karamyschevii","ispahanicum","carthlicum","Landrace","Cultivar","OtherHexaploids","compactum","sphaerococcum","tibeticum","vavilovii","petropavlovskyi","yunna-nense","macha","spelta","strangulata")

labelsA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","Landrace","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")

dfhash <- tibble(Subspecies_by22_TreeValidated=factorA,Subspecies22=labelsA) 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Subspecies_by22_TreeValidated,GroupbyContinent,Latitude,Longitude,orgCty,Country) %>%
  left_join(.,dfhash,by="Subspecies_by22_TreeValidated") %>% 
  mutate(GroupforExpansionLoad = if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies22)) %>% 
  mutate(GroupforExpansionLoad2=
if_else(Subspecies22=="Landrace",GroupbyContinent,Subspecies_by22_TreeValidated)) %>%
  select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Latitude,Longitude,orgCty,Country) 

# write_tsv(dftaxaDB,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/002_taxaDB_GroupforExpansionLoad.txt")


### ********************** Section 2 *********************************###
### 先将有经纬度信息的材料进行画图查看，将异常点的位置信息进行校正
dftaxaDB2 <- dftaxaDB %>% 
  filter(!is.na(Latitude))

### 写出没有经纬度信息的336份材料，根据国家添加人为估计的经纬度信息
dftaxaDB3_noPos <- dftaxaDB %>% 
  filter(is.na(Latitude))

# write_csv(dftaxaDB3_noPos,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/001_taxa_noPosition.csv")

### 写出具体的亚群，手动添加地区注释
dftable <- dftaxaDB %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) %>% 
  select(GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2)
# write_csv(dftable,file = "~/Documents/taxaDB_category.csv")

plotfun_map <- function(fun_df,fun_factor){
  mp <- NULL #定义一个空的地图
  # 注意： borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
  mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0") #绘制基本地图
  mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
  
  mp <- mpvoid +
    geom_point(
      data = fun_df,
      aes(x = Longitude, y = Latitude),
      shape = 21,
      alpha = 0.8,
      size = 1.5,
      fill = "orange"
    ) +
    labs(title = fun_factor)+
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(text = element_text(size = 6))+
    theme(legend.position = 'none')
  mp
  
  ggsave(mp,filename = str_c("~/Documents/",fun_factor,"_map.pdf"),height = 1.38,width = 3.445)
  
  return(mp)

}

#### 需要修饰的方面
# 1.调节因子顺序
# 2.标题居中，调节字体大小
# 3.调节圆点大小

### 建立2个一模一样的表格，为创建矩阵做准备
df2 <- dftaxaDB2 %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  # mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorAo,labels = factorAo)) %>%
  # arrange(factor(GroupforExpansionLoad,levels = factorAo)) %>%
  group_by(as.character(GroupforExpansionLoad)) %>%
  group_map(~plotfun_map(.x,.y))

### 图片保存
q <- ggpubr::ggarrange(plotlist=df2,ncol = 4,nrow = 6)

q

cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)


# factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")

# factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat ","Polish wheat","Rivet wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Xinjiang wheat","Macha","Spelt","Strangulata")

### "Georgian wheat","Vavilovii","Yunan wheat"


# q <- ggpubr::ggarrange(plotlist=list(df2[[22]],df2[[3]],df2[[4]],df2[[7]],df2[[17]],
#                                      df2[[18]],df2[[]],df2[[]],df2[[]],df2[[]],
#                                      df2[[]],df2[[]],df2[[]],df2[[]],df2[[]],
#                                      df2[[]],df2[[]],df2[[]],df2[[]],df2[[]],
#                                      df2[[]],df2[[]],df2[[]])
#                        ,ncol = 4,nrow = 6)

```

## 重要 MAP & Latitude estimation by manual
- 这里根据国家信息手动估计了经纬度
```{r}
### ********************** Section1 *********************************###
# df_all <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/002_taxaDB_GroupforExpansionLoad.txt")
#
### step1: 写出没有经纬度信息的336份材料，根据国家添加人为估计的经纬度信息
### "Georgian wheat","Vavilovii","Yunan wheat"  47个国家
#
# df_noPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/001_taxa_noPosition.csv")
#
### step2: 输出没有地理位置信息的国家，手动添加信息
# dftest <- df_noPosition %>% 
#   distinct(Country,.keep_all = T) %>% 
#   select(4:7)
### write_csv(dftest,file = "~/Documents/country_noLatitude.csv")
#
### step3: 336份没有经纬度信息的，加上经纬度信息和新的一列（是否是自己手动添加经纬度）
# df_manualPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/002_country_noLatitude.csv")
#
# df_336 <- df_noPosition %>% 
#   select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Country) %>% 
#   left_join(df_manualPosition,by="Country") %>% 
#   mutate(IfLatitudebyManually="Yes") %>% 
#   relocate(Country,.after = orgCty)
### write_csv(df_336,file = "/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/003_taxa_noPosition_addbyManually.csv")
#
### step4: 有些没有国家的材料，有种群信息，没有国家信息，但是可以根据种群信息手动添加经纬度信息。
### 这一步进行剩余10几份材料(15)的手动添加
### df_336_withManuallyPosition 
#  
### step5: 1062-336= 726 份材料添加注释信息
# df_726_withPosition <- df_all %>% 
#   anti_join(df_336,by="Taxa") %>% 
#   mutate(IfLatitudebyManually="No") %>% 
#   mutate(IfHasCountryInfo="Yes")
#
# df_336_withManuallyPosition <- read_csv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/026_heter_load_distance/001_noPosition_table/004_taxa_noPosition_addbyManually_secondTime.csv")
# 
# df_1062_withPosition <- df_726_withPosition %>% 
#   bind_rows(df_336_withManuallyPosition) %>% 
#   arrange(Taxa)
#
### write_tsv(df_1062_withPosition,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt")

### write_csv(df_1062_withPosition,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.csv")

### step6: 添加 6个群的信息
# dftaxaDB <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/021_WheatVMap2_GermplasmInfo.txt") %>% select(Taxa=VcfID,Subspecies_by6_TreeValidated)
# 
# df_1062_withPosition <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt")
# 
# df_1062_update <- df_1062_withPosition %>% 
#   left_join(dftaxaDB,by="Taxa") %>% 
#   relocate(Subspecies_by6_TreeValidated,.after = GroupforExpansionLoad2)

###write_tsv(df_1062_update,file = "/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt")

### ------------ 开始画地图 ------------ ###
### 先建立函数
plotfun_map <- function(fun_df,fun_factor){
  mp <- NULL #定义一个空的地图
  # 注意： borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
  mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0") #绘制基本地图
  mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
  
  mp <- mpvoid +
    geom_point(
      data = fun_df,
      aes(x = Longitude, y = Latitude),
      shape = 21,
      alpha = 0.8,
      size = 1.5,
      fill = "orange"
    ) +
    labs(title = fun_factor)+
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(text = element_text(size = 6))+
    theme(legend.position = 'none')
  mp
  
  ggsave(mp,filename = str_c("~/Documents/",fun_factor,"_map.pdf"),height = 1.38,width = 3.445)
  
  return(mp)

}

#### 需要修饰的方面
# 1.调节因子顺序
# 2.标题居中，调节字体大小
# 3.调节圆点大小

### 建立2个一模一样的表格，为创建矩阵做准备
### 写出具体的亚群，手动添加地区注释

df2 <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/003_taxaDB_GroupforExpansionLoad.txt") %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  # mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorAo,labels = factorAo)) %>%
  # arrange(factor(GroupforExpansionLoad,levels = factorAo)) %>%
  group_by(as.character(GroupforExpansionLoad)) %>%
  group_map(~plotfun_map(.x,.y))

### 图片保存
# q <- ggpubr::ggarrange(plotlist=df2,ncol = 4,nrow = 7)
# 
# q
# 
# cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

### 按照自己定义的顺序进行图片保存
q <- ggpubr::ggarrange(plotlist=list(df2[[24]],df2[[3]],df2[[4]],df2[[8]],df2[[18]],
                                     df2[[19]],df2[[5]],df2[[7]],df2[[17]],
                                     df2[[15]],df2[[20]],
                                     df2[[9]],df2[[10]],df2[[11]],df2[[12]],df2[[13]],
                                     df2[[14]],df2[[2]],df2[[16]],
                                     df2[[1]],df2[[6]],df2[[22]],df2[[23]],df2[[25]],df2[[26]],
                                     df2[[21]]
                                     ),
                       labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 4,nrow = 7)

cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```

#### ======
## ****DAF
### plot by 642 nonsyn and syn
- 注意：该方法将 del 列为 nonsyn，不画出 del
```{r}
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz",show_col_types = FALSE)

### step2: DAF bin
df1 <- df %>% 
  mutate(VariantsType = ifelse(LoadGroup=="Synonymous","Synonymous","Nonsynonymous")) %>%
  select(-LoadGroup) %>% 
  rename(LoadGroup=VariantsType) %>% 
  rename(DAF=AAF) %>% 
  filter(DAF!=0) %>%
  filter(DAF!=1) %>%
  mutate(bin=cut_width(DAF,width = 0.1,boundary = 0)) %>%
  group_by(GenomeType,Sub,LoadGroup,Group) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n)) 

df3 <- df %>% 
  mutate(VariantsType = ifelse(LoadGroup=="Synonymous","Synonymous","Nonsynonymous")) %>%
  select(-LoadGroup) %>% 
  rename(LoadGroup=VariantsType) %>% 
  rename(DAF=AAF) %>% 
  filter(DAF!=0) %>%
  filter(DAF!=1) %>%
  mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
  group_by(GenomeType,Sub,LoadGroup,Group) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n)) 

### step3: plot_function
fun_plotDAF <- function(df_fun,factor_fun){
  
  ### df_fun 提取因子信息
  fun_genometype <- factor_fun$GenomeType
  fun_sub <- factor_fun$Sub
  fun_loadgroup <- factor_fun$LoadGroup
  
  ### 设置因子顺序,由于 设置的 0.1 bin 导致有些亚群没有颜色信息
  factorA <- df3 %>% 
    filter(GenomeType == fun_genometype,
           Sub == fun_sub,
           LoadGroup == fun_loadgroup) %>% 
    filter(bin=="[0,0.2]") %>% 
    arrange(-fre) %>% 
    pull(Group)
    
  ### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
  dfhash <-
    read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt",show_col_types = FALSE) %>%
    select(GenomeType, GroupforExpansionLoad,GroupforExpansionLoad2) %>%
    filter(!is.na(GroupforExpansionLoad)) %>%
    distinct(., GroupforExpansionLoad, .keep_all = T)
  
  ### 每个亚群的颜色信息
  dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt",show_col_types = FALSE) %>%
    filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>%
    left_join(dfhash, by = c("Group" = "GroupforExpansionLoad2")) %>%
    ### 过滤数据，使之和输入的数据框 df_fun 保持一致
    filter(GenomeType == fun_genometype) %>% 
    mutate(Label=factor(Label,levels = factorA)) %>% 
    arrange(as.factor(Label))
  
  ### 设置数据框和画图颜色的因子顺序，使都保持一致
  colB <- dfcolorDB$ColorCode
  df_fun$Group <- factor(df_fun$Group,levels = factorA)
  
  
  q <- df_fun %>%
    ggplot(aes(
      x = bin,
      y = fre,
      group = Group,
      color = Group
    )) +
    geom_line(alpha = 0.9) +
    labs(y=str_c("Proportion of ",str_to_lower(fun_loadgroup)," SNPs"),
         x="Derived allele frequency")+
    # labs(y = "Proportion of nonsynonymous SNPs", x = "Derived allele frequency") +
    scale_color_manual(values = colB) +
    # scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
    # scale_fill_manual(values = colB)+
    # scale_y_continuous(limits = c(0,1))+
    scale_x_discrete(label = c("0.1", "", "0.3", "", "0.5", "", "0.7", "", "0.9", "")) +
    # facet_grid(Sub~Group)+
    theme(strip.background = element_blank()) +
    # theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
    theme(
      # plot.margin=unit(rep(1,4),'lines'),
      panel.background = element_blank(),
      panel.border = element_rect(
        colour = "black",
        fill = NA,
        size = 0.7
      ),
      legend.key = element_blank(),
      legend.position = 'none',
      text = element_text(size = 9)
    )
  
  q
  # ggsave(q,filename = "~/Documents/temp.png",height = 6,width = 8)
  
  fun_genometype <- factor_fun$GenomeType
  fun_sub <- factor_fun$Sub
  fun_loadgroup <- factor_fun$LoadGroup
  
  filename_fun <- str_c("~/Documents/",fun_genometype,"_",fun_sub,"Sub","_",fun_loadgroup,".pdf")
  ggsave(q,
         # filename = "~/Documents/test.pdf",
         filename = filename_fun,
         height = 4.2,
         width = 7) ### 刚好可以把所有图例展示出来
  
  ### 打出出图顺序
  print(filename_fun)
  
  return(q)
}

### step4: invoke the plot function
df2 <- df1 %>% 
  group_by(GenomeType,Sub,LoadGroup) %>% 
  group_map(~fun_plotDAF(.x,.y))

## step5: combine figs automatically or manually

### 先保存6倍体的，再保存4倍体的，最后2倍体
q <- ggpubr::ggarrange(plotlist=df2,ncol = 3)

q

### 六倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[6]],df2[[8]],df2[[10]],df2[[5]],df2[[7]],df2[[9]]),
                       ncol = 3,nrow = 2)

q

cowplot::save_plot(q,filename = "~/Documents/AABBDD.pdf",base_height = 4.45,base_width = 7.2)

### 四倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[2]],df2[[4]],
                       df2[[1]],df2[[3]]),
                       ncol = 2,nrow = 2)

q

cowplot::save_plot(q,filename = "~/Documents/AABB.pdf",base_height = 4.45,base_width = 7.2)

```

### plot by 642 del nonsyn and syn
- 注意：该方法将 del 列为 nonsyn，不画出 del

```{r}
### step1: df
df <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz",show_col_types = FALSE)

### step2: DAF bin
df1 <- df %>% 
  rename(DAF=AAF) %>% 
  filter(DAF!=0) %>%
  filter(DAF!=1) %>%
  mutate(bin=cut_width(DAF,width = 0.1,boundary = 0)) %>%
  group_by(GenomeType,Sub,LoadGroup,Group) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n))

df3 <- df %>% 
  rename(DAF=AAF) %>% 
  filter(DAF!=0) %>%
  filter(DAF!=1) %>%
  mutate(bin=cut_width(DAF,width = 0.2,boundary = 0)) %>%
  group_by(GenomeType,Sub,LoadGroup,Group) %>% 
  count(bin) %>% 
  mutate(fre=n/sum(n)) 

### step3: plot_function
fun_plotDAF <- function(df_fun,factor_fun){
  
  ### df_fun 提取因子信息
  fun_genometype <- factor_fun$GenomeType
  fun_sub <- factor_fun$Sub
  fun_loadgroup <- factor_fun$LoadGroup
  
  ### 设置因子顺序,由于 设置的 0.1 bin 导致有些亚群没有颜色信息
  factorA <- df3 %>% 
    filter(GenomeType == fun_genometype,
           Sub == fun_sub,
           LoadGroup == fun_loadgroup) %>% 
    filter(bin=="[0,0.2]") %>% 
    arrange(-fre) %>% 
    pull(Group)
    
  ### 为每个亚群添加倍性信息,dfhash 为 dfcolorDB 服务
  dfhash <-
    read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt",show_col_types = FALSE) %>%
    select(GenomeType, GroupforExpansionLoad,GroupforExpansionLoad2) %>%
    filter(!is.na(GroupforExpansionLoad)) %>%
    distinct(., GroupforExpansionLoad, .keep_all = T)
  
  ### 每个亚群的颜色信息
  dfcolorDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt",show_col_types = FALSE) %>%
    filter(ColorCategory %in% c("GroupforExpansionLoad")) %>% droplevels() %>%
    left_join(dfhash, by = c("Group" = "GroupforExpansionLoad2")) %>%
    ### 过滤数据，使之和输入的数据框 df_fun 保持一致
    filter(GenomeType == fun_genometype) %>% 
    mutate(Label=factor(Label,levels = factorA)) %>% 
    arrange(as.factor(Label))
  
  ### 设置数据框和画图颜色的因子顺序，使都保持一致
  colB <- dfcolorDB$ColorCode
  df_fun$Group <- factor(df_fun$Group,levels = factorA)
  
  
  q <- df_fun %>%
    ggplot(aes(
      x = bin,
      y = fre,
      group = Group,
      color = Group
    )) +
    geom_line(alpha = 0.9) +
    labs(y=str_c("Proportion of ",str_to_lower(fun_loadgroup)," SNPs"),
         x="Derived allele frequency")+
    # labs(y = "Proportion of nonsynonymous SNPs", x = "Derived allele frequency") +
    scale_color_manual(values = colB) +
    # scale_color_manual(values = colorRampPalette(rev(brewer.pal(n = 12, name = "Set3")))(16))+
    # scale_fill_manual(values = colB)+
    # scale_y_continuous(limits = c(0,1))+
    scale_x_discrete(label = c("0.1", "", "0.3", "", "0.5", "", "0.7", "", "0.9", "")) +
    # facet_grid(Sub~Group)+
    theme(strip.background = element_blank()) +
    # theme(axis.text.x = element_text(vjust = 1, hjust = 1, angle = 45))+
    theme(
      # plot.margin=unit(rep(1,4),'lines'),
      panel.background = element_blank(),
      panel.border = element_rect(
        colour = "black",
        fill = NA,
        size = 0.7
      ),
      legend.key = element_blank(),
      legend.position = 'none',
      text = element_text(size = 9)
    )
  
  q
  # ggsave(q,filename = "~/Documents/temp.png",height = 6,width = 8)
  
  fun_genometype <- factor_fun$GenomeType
  fun_sub <- factor_fun$Sub
  fun_loadgroup <- factor_fun$LoadGroup
  
  filename_fun <- str_c("~/Documents/",fun_genometype,"_",fun_sub,"Sub","_",fun_loadgroup,".pdf")
  ggsave(q,
         # filename = "~/Documents/test.pdf",
         filename = filename_fun,
         height = 4.2,
         width = 7) ### 刚好可以把所有图例展示出来
  
  ### 打出出图顺序
  print(filename_fun)
  
  return(q)
}

### step4: invoke the plot function
df2 <- df1 %>% 
  group_by(GenomeType,Sub,LoadGroup) %>% 
  group_map(~fun_plotDAF(.x,.y))

## step5: combine figs automatically or manually

# ### 先保存6倍体的，再保存4倍体的，最后2倍体
# q <- ggpubr::ggarrange(plotlist=df2,ncol = 3)
# 
# q

### 六倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[9]],df2[[12]],df2[[15]],
                                     df2[[8]],df2[[11]],df2[[14]],
                                     df2[[7]],df2[[10]],df2[[13]]),
                       ncol = 3,nrow = 3)

q

cowplot::save_plot(q,filename = "~/Documents/AABBDD_syn_nonsyn_del.pdf",base_height = 7.2,base_width = 7.2)

### 四倍体
q <- ggpubr::ggarrange(plotlist=list(df2[[3]],df2[[6]],
                       df2[[2]],df2[[5]],
                       df2[[1]],df2[[4]]),
                       ncol = 2,nrow = 3)

q

cowplot::save_plot(q,filename = "~/Documents/AABB_syn_nonsyn_del.pdf",base_height = 7.2,base_width = 7.2)


```

#### ======
## ****Heter by individual
```{r}

factorAo <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_EA","LR_CSA","LR_WA","LR_AF","LR_EU","LR_AM","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")

### 该数据框是为了根据26亚群找到对应的基因组类型
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(Taxa,GroupforExpansionLoad,Subspecies_by6_TreeValidated)

### 该数据框是画图 heter 时所用的数据
df1 <- read.delim("data/003_VCF_QC/AABBDD_taxa_QC.txt.gz")
df2 <- read.delim("data/003_VCF_QC/AABB_taxa_QC.txt.gz")
df3 <- read.delim("data/003_VCF_QC/DD_taxa_QC.txt.gz")
df <- rbind(df1,df2,df3) %>%
  mutate(GenomeType = factor(GenomeType,levels = c("AABB","AABBDD","DD"))) %>% 
  left_join(dfhash,by = "Taxa") %>% 
  filter(!is.na(GroupforExpansionLoad))
  
### 统计个体按照 heter 的排序
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
    levels = c( "Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids",
                      "OtherTetraploids",
                      "Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>% 
  ### 下一步记得排序的时候加上大群分类
  group_by(GenomeType,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(HeterozygousProportion = mean(HeterozygousProportion)) %>% 
  arrange(GenomeType,Subspecies_by6_TreeValidated,-HeterozygousProportion) %>% 
  distinct(GroupforExpansionLoad,.keep_all = F)

### 因子排序
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)
  
### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- df %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%  
  distinct(.,GroupforExpansionLoad,.keep_all = T) %>% 
  mutate(GroupforExpansionLoad_AddCount = 
           str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 0.17

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=HeterozygousProportion))+
  ylab("Heterozygous genotype\nproportion by individual")+
  xlab("")+
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(position = position_dodge(0.7),
               outlier.color = 'black',
               outlier.alpha = 0.8,
               outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(fun=mean, geom="point", color="red",position = position_dodge(1),size=0.5)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c('#ffd702','#fc6e6e',"#87cef9"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    text = element_text(size = 10),legend.position = 'none')
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 3.5,width = 7)


### 求四倍体 六倍体 二倍体的杂合度平均值
dfave <- df %>% 
  group_by(GenomeType) %>% 
  summarise(mean_heter=mean(HeterozygousProportion))

dfave
# AABB	0.014694332			
# AABBDD	0.006543548			
# DD	0.016855904	
```
#### ======
## ****ANGSD
## thetaW_ANGSD
```{r}
### 对区间的 pi 进行总结，求平均值
# df <- read_tsv("data/014_PopParameters/003_angsd/001_angsd_subspecies26_geneRegion_RefChr.txt.gz")
# 
# df1 <- df %>%
#   group_by(RefChr,Group) %>%
#   summarise(across(.cols = c(4:13),.fns = ~ mean(.x, na.rm = TRUE),.names = "{.col}_mean"))
# 
### write_tsv(df1,file = "data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") ### 注意结果中含有0 NA

### 分组数据
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

### 画图数据
df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>% 
  filter(tW_mean!=0) %>% 
  mutate(Sub=str_sub(RefChr,2,2)) %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照A亚基因组排序
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(tW_mean)) %>% 
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
### 计数数据 
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad))

dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 90

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=tW_mean))+
  labs(y= expression(italic(theta)[italic(W)]), x = "")+ 
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    # legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/thetaW_ANGSD.pdf",height = 3.5,width = 7)

```


## thetaP_ANGSD
```{r}
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)


df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>% 
  filter(tP_mean!=0) %>% 
  mutate(Sub=str_sub(RefChr,2,2)) %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(tP_mean)) %>% 
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad))

dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 150

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=tP_mean))+
  labs(y= expression(italic(theta)[italic(pi)]), x = "")+ 
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    # legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/thetaP_ANGSD.pdf",height = 3.5,width = 7)
```


## TajimaD_ANGSD
```{r}
dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

df <- read_tsv("data/014_PopParameters/003_angsd/002_angsd_summary.txt.gz") %>% 
  filter(tW_mean!=0) %>% 
  mutate(Sub=str_sub(RefChr,2,2)) %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照A亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>%
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(Tajima_mean)) %>% 
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad))

dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 4.5

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=Tajima_mean))+
  labs(y= expression(paste("Tajima's ",italic(D))), x = "")+
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    # legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/TajimaD_ANGSD.pdf",height = 3.5,width = 7)
```

#### ======
## ****VCFtools
## pi_VCFtools by individual
```{r}
### 对区间的 pi 进行总结，求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/001_pi/001_pi.txt.gz")
# 
# df1 <- df %>% 
#   group_by(CHROM,Group) %>% 
#   summarise(mean_pi = mean(PI))
# 
# write_tsv(df1,file = "data/014_PopParameters/001_pi/002_pi_summary.txt.gz")

### 开始处理 pi 的结果
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  filter(!is.na(GroupforExpansionLoad))

dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

df <- read_tsv("data/014_PopParameters/001_pi/002_pi_summary.txt.gz") %>% 
  mutate(Sub=str_sub(CHROM,2,2)) %>% 
  filter(Group!="No1B1R", Group != "With1B1R") %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照B亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
                                             levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>% 
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(mean_pi)) %>% 
  ### 下一步记得排序的时候加上大群分类
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>%
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 2.6

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_pi*1000))+
  labs(y = expression(italic(theta)[pi]~"("~10^-3~")"),x="")+
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    # legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/pi_VCFtools.pdf",height = 3.5,width = 7)

```


### pi_VCFtools by individual 竖版
```{r}
### 对区间的 pi 进行总结，求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/001_pi/001_pi.txt.gz")
# 
# df1 <- df %>% 
#   group_by(CHROM,Group) %>% 
#   summarise(mean_pi = mean(PI))
# 
# write_tsv(df1,file = "data/014_PopParameters/001_pi/002_pi_summary.txt.gz")

### 开始处理 pi 的结果
dfori <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  filter(!is.na(GroupforExpansionLoad))

dfhash <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

df <- read_tsv("data/014_PopParameters/001_pi/002_pi_summary.txt.gz") %>% 
  mutate(Sub=str_sub(CHROM,2,2)) %>% 
  filter(Group!="No1B1R", Group != "With1B1R") %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照B亚基因组排序
### dfV2 为排序后的亚洲列表
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
                                             levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>% 
  group_by(GenomeType,Sub,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(mean_pi)) %>% 
  ### 下一步记得排序的时候加上大群分类
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>%  
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 2.6

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_pi*1000))+
  labs(y = expression(italic(theta)[pi]~"("~10^-3~")"),x="")+
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  geom_hline(yintercept = 1,linetype="dashed",color="black")+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  coord_flip()+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/pi_VCFtools_竖版.pdf",height = 7,width = 3.5)

```


### TajiamaD_VCFtools
```{r}
### 对区间的 pi 进行总结，求平均值
# df <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/002_dataAnalysis/RProject/R4VMap2FigsS1062/data/014_PopParameters/002_tajimaD/001_TajimaD.txt.gz")
# 
# df1 <- df %>%
#   group_by(CHROM,Group) %>%
#   summarise(mean_tajimaD = mean(TajimaD,na.rm = T))
# 
# write_tsv(df1,file = "data/014_PopParameters/002_tajimaD/002_TajimaD_summary.txt.gz")

### 开始处理 pi 的结果
dfori <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  filter(!is.na(GroupforExpansionLoad))

dfhash <- read_tsv("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/Group/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(GroupforExpansionLoad,GroupforExpansionLoad2,GenomeType,Subspecies_by6_TreeValidated) %>% 
  distinct(GroupforExpansionLoad,.keep_all = T)

df <- read_tsv("data/014_PopParameters/002_tajimaD/002_TajimaD_summary.txt.gz") %>% 
  mutate(Sub=str_sub(CHROM,2,2)) %>% 
  left_join(dfhash,by=c("Group"="GroupforExpansionLoad2"))

### 统计每个亚基因组内，按照B亚基因组排序
dfV1 <- df %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,levels = c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids","Landrace","Cultivar","OtherHexaploids","Strangulata"))) %>% 
  group_by(Sub,GenomeType,Subspecies_by6_TreeValidated,GroupforExpansionLoad) %>% 
  summarise(mean = mean(mean_tajimaD)) %>% 
  arrange(.,GenomeType,Sub,Subspecies_by6_TreeValidated,-mean) %>% 
  ungroup() %>% 
  filter(Sub=="A") %>% 
  select(GroupforExpansionLoad) %>% 
  add_row(GroupforExpansionLoad="Strangulata")

### 因子排序
dfori$GroupforExpansionLoad <- factor(dfori$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = dfV1$GroupforExpansionLoad)

### 计算每个亚群的个数，并添加到taxa数据库后面
dftaxaCount <- dfori %>% 
  select(GroupforExpansionLoad) %>% 
  group_by(GroupforExpansionLoad) %>% 
  count() %>% 
  ungroup() %>%
  mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad," (", n, ")",sep = "")) %>% 
  select(-n)

yvalue <- 1.6

p <- ggplot(df,aes(x=GroupforExpansionLoad,y=mean_tajimaD))+
  labs(y= expression(paste("Tajima's ",italic(D))), x = "")+
  ### 加阴影
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5,color="white")+
  geom_rect(aes(xmin=3-0.5,xmax=3+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=5-0.5,xmax=5+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=7+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=9-0.5,xmax=9+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=11-0.5,xmax=11+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=13-0.5,xmax=13+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=15-0.5,xmax=15+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=17+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=19-0.5,xmax=19+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=21-0.5,xmax=21+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=23-0.5,xmax=23+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  geom_rect(aes(xmin=25-0.5,xmax=25+0.5,ymin=-Inf,ymax=Inf),fill="#f2f2f2",alpha=0.5)+
  # 最上面的条纹
  geom_rect(aes(xmin=1-0.5,xmax=1+0.5,ymin=yvalue,ymax=Inf),fill="#ffd702",alpha=0.5)+
  geom_rect(aes(xmin=2-0.5,xmax=2+0.5,ymin=yvalue,ymax=Inf),fill="#7f5701",alpha=0.5)+
  geom_rect(aes(xmin=3-0.5,xmax=6+0.5,ymin=yvalue,ymax=Inf),fill="#016699",alpha=0.5)+
  geom_rect(aes(xmin=7-0.5,xmax=9+0.5,ymin=yvalue,ymax=Inf),fill="#00f3ff",alpha=0.5)+
  geom_rect(aes(xmin=10-0.5,xmax=15+0.5,ymin=yvalue,ymax=Inf),fill="#fc6e6e",alpha=0.5)+
  geom_rect(aes(xmin=16-0.5,xmax=16+0.5,ymin=yvalue,ymax=Inf),fill="#9900ff",alpha=0.5)+
  geom_rect(aes(xmin=17-0.5,xmax=25+0.5,ymin=yvalue,ymax=Inf),fill="#fe63c2",alpha=0.5)+
  geom_rect(aes(xmin=26-0.5,xmax=26+0.5,ymin=yvalue,ymax=Inf),fill="#87cef9",alpha=0.5)+
  # 开始画图
  # stat_boxplot(geom = "errorbar",width=0.15,position = position_dodge(1))+ # add error bar
  # geom_point(position = position_jitterdodge(0.5),alpha = 0.6,aes(col=Sub),size=0.3)+
  # stat_summary(fun.data=mean_sdl,geom="pointrange",size=0.3)+
  ### boxplot
  geom_boxplot(
    aes(color=Sub),
    position = position_dodge(0.8),
               outlier.color = NA,
               # outlier.alpha = 0.8,
               # outlier.size = 0.7,
               alpha=0.8,width=0.7)+ #,width=0.0.7
  stat_summary(
              aes(group=Sub),
              color="black",
              fun=mean, geom="point",position = position_dodge(0.8),size=0.4)+
  ### boxplot and point
  # geom_boxplot(position = position_dodge(0.9),outlier.color = 'white',alpha=0.5,outlier.size = 0.5)+
  # geom_point(position = position_jitterdodge(0.6),alpha = 0.6,aes(color=GenomeType),size=0.8)+
  geom_vline(xintercept = 9.5,linetype="dashed",color="black")+
  geom_vline(xintercept = 25.5,linetype="dashed",color="black")+
  scale_color_manual(values = c("#fd8582","#967bce","#4bcdc6"),name='')+
  scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount)+
  theme(axis.text.x= element_text(angle=45, vjust=1,hjust = 1))+
  theme(plot.margin=unit(rep(1.3,4),'lines'))+
  theme(
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5),
    # legend.position = 'none',
    # legend.position = c(0.9,0.6),
    text = element_text(size = 10))
p
# ggsave(p,filename = "/Users/Aoyue/Documents/test.pdf",height = 4,width = 7.2)
ggsave(p,filename = "/Users/Aoyue/Documents/TajimaD_VCFtools.pdf",height = 3.5,width = 7)

```

#### ======
## PCA

### Hexaploid by subspecise by founder effect
```{r}
library(factoextra)
library(FactoMineR)

df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>% 
  filter(ColorCategory=="GroupforExpansionLoad") %>% 
  select(Label,ColorCode)

dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad)) %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>% 
  left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))

df_factor <- dftaxaDB %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) %>% 
  filter(GenomeType=="AABBDD") %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("Landrace","Cultivar","OtherHexaploids"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>% 
  mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") | GroupforExpansionLoad=="OtherHexaploids" ,"gray90",ColorCode)) %>% 
  mutate(ColorCode=if_else(GroupforExpansionLoad=="Club wheat","black",ColorCode))

df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>% 
  left_join(dftaxaDB,by="Taxa") %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("Landrace","Cultivar","OtherHexaploids"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
 

### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)

colB <- df_factor$ColorCode

df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")


### 提取变异解释度并合并数据
variance1 <-  paste(round(df_eig[1,2],3),"%",sep="")  ##第一个成分
variance2 <-  paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分

### start to plot
p <- df %>% 
  ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
  # geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
  geom_point(alpha=0.8,size=2,color="gray",shape=21)+
  geom_hline(yintercept=0, linetype="dashed", color = "black")+
  geom_vline(xintercept = 0, linetype="dashed", color = "black")+
  # scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
  # scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
  # scale_color_manual(values = colB,name="Genetic group")+
  scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
  # scale_shape_manual(values = shapeB)+
  labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      # ,legend.position = 'none'
      ,text = element_text(size = 12))  
p
ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)

library(plotly)
fig <- df %>%
  # filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
  # filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
  # filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
  # filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
  plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
               text = ~paste(Taxa),
               colors = colB)

fig

```


### Hexaploid by 26pops 只加 landrace
```{r}
library(factoextra)
library(FactoMineR)

df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>% 
  filter(ColorCategory=="GroupforExpansionLoad") %>% 
  select(Label,ColorCode)

dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad)) %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>% 
  left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))

df_factor <- dftaxaDB %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) %>% 
  filter(GenomeType=="AABBDD") %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("Landrace","Cultivar","OtherHexaploids"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>% 
  mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") ,ColorCode ,"gray90"))

df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>% 
  left_join(dftaxaDB,by="Taxa") %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("OtherHexaploids","Cultivar","Landrace"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
 

### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)

colB <- df_factor$ColorCode

df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")


### 提取变异解释度并合并数据
variance1 <-  paste(round(df_eig[1,2],3),"%",sep="")  ##第一个成分
variance2 <-  paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分

### start to plot
p <- df %>% 
  ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
  # geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
  geom_point(alpha=0.8,size=2,color="gray",shape=21)+
  geom_hline(yintercept=0, linetype="dashed", color = "black")+
  geom_vline(xintercept = 0, linetype="dashed", color = "black")+
  # scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
  # scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
  # scale_color_manual(values = colB,name="Genetic group")+
  scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
  # scale_shape_manual(values = shapeB)+
  labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      # ,legend.position = 'none'
      ,text = element_text(size = 12))  
p
ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)

library(plotly)
fig <- df %>%
  # filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
  # filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
  # filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
  # filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
  plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
               text = ~paste(Taxa),
               colors = colB)

fig

```


## macha
### A subgenome
```{r}
library(factoextra)
library(FactoMineR)
### 导入种质数据库,并选取特定的列
dataFile <- c("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt")
dftaxaDB <- read.delim(dataFile) %>% 
  select(Taxa=VcfID,GenomeType,Subspecies_by22, Subspecies_by22_TreeValidated,Subspecies_by6_TreeValidated,Continent_forTree,Group,GroupByOriPublication_Abbre)

### import matrix
dataFile <- c("data/002_pca/004_matrix_Asub.txt.gz")
df <- read.delim(dataFile)

### 建立Taxa对应的属性变量数据框
dfTaxaInfo <- df %>% select(Taxa) %>% left_join(.,dftaxaDB,by="Taxa")

### pca analysis
c1 <- which(colnames(df) == 'Taxa')
df.pca <- PCA(df[,-c(c1)], graph = F)

### 提取变异解释度并合并数据
variance1 <-  paste(round(df.pca$eig[1,3],2),"%",sep="")  ##第一个成分
variance2 <-  paste(round(df.pca$eig[2,3] - df.pca$eig[1,3],2),"%",sep="") ##第二个成分
dfpca <- as.data.frame(df.pca$ind$coord)
df <- cbind(dfTaxaInfo,dfpca)

factorA <- c("Wild_emmer","Domesticated_emmer","Free_threshing_tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")

labelsA <- c("Wild emmer","Domesticated emmer","Free-threshing tetraploids","OtherTetraploids", "Landrace","Cultivar","OtherHexaploids")

colB <- c("#ffd702","#7f5701","#016699","#00f3ff", "#fc6e6e","#9900ff","#fe63c2")

df$Subspecies_by6_TreeValidated <- factor(df$Subspecies_by6_TreeValidated,levels=factorA,labels=labelsA)

```

```{r}
### start to plot
p <- df %>% 
  # filter(! Taxa %in%  c("AB_183","AB_185","AB_039")) %>%
  ggplot(aes(x=Dim.1,y=Dim.2,fill= Subspecies_by22_TreeValidated))+
  # geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
  geom_point(alpha=0.8,size=2,color="gray",shape=21)+
  geom_hline(yintercept=0, linetype="dashed", color = "black")+
  geom_vline(xintercept = 0, linetype="dashed", color = "black")+
  # scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
  # scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
  # scale_color_manual(values = colB,name="Genetic group")+
  # scale_fill_manual(values = colB,name="Genetic group")+
  # scale_shape_manual(values = shapeB)+
  labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      ,legend.position = 'none'
      ,text = element_text(size = 12))  
p
ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)


library(plotly)
fig <- df %>%
  mutate(Subspecies_by22_TreeValidated=if_else(Subspecies_by6_TreeValidated=="Free-threshing tetraploids","OtherTetraploids",Subspecies_by22_TreeValidated)) %>% 
  plot_ly(x = ~ Dim.1, y = ~ Dim.2,color = ~ Subspecies_by22_TreeValidated,
               text = ~paste(Taxa),
               colors = colB)

fig
```
### macha 聚类查看
```{r}
library(factoextra)
library(FactoMineR)

df_colDB <- read_tsv("data/001_TaxaDB/003_VMap2_colorDB.txt") %>% 
  filter(ColorCategory=="GroupforExpansionLoad") %>% 
  select(Label,ColorCode)

dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>%
  filter(!is.na(GroupforExpansionLoad)) %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2,Subspecies_by6_TreeValidated) %>% 
  left_join(df_colDB,by=c("GroupforExpansionLoad"="Label"))

df_factor <- dftaxaDB %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) %>% 
  filter(GenomeType=="AABBDD") %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("Landrace","Cultivar","OtherHexaploids"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad) %>% 
  mutate(ColorCode=if_else(Subspecies_by6_TreeValidated %in% c("Landrace","Cultivar") | GroupforExpansionLoad=="OtherHexaploids" ,"gray90",ColorCode)) %>% 
  mutate(ColorCode=if_else(GroupforExpansionLoad=="Club wheat","black",ColorCode))

df <- read_tsv("data/016_mds/hexaploid_mds.txt") %>% 
  left_join(dftaxaDB,by="Taxa") %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  mutate(Subspecies_by6_TreeValidated=factor(Subspecies_by6_TreeValidated,
         levels = c("Landrace","Cultivar","OtherHexaploids"))) %>% 
  arrange(as.factor(Subspecies_by6_TreeValidated),GroupforExpansionLoad)
 

### 设置因子顺序及颜色
df$GroupforExpansionLoad <- factor(df$GroupforExpansionLoad,levels = df_factor$GroupforExpansionLoad)

colB <- df_factor$ColorCode

df_eig <- read_tsv("data/016_mds/hexaploid_mds_eigenvalues.txt")


### 提取变异解释度并合并数据
variance1 <-  paste(round(df_eig[1,2],3),"%",sep="")  ##第一个成分
variance2 <-  paste(round(df_eig[2,2],3),"%",sep="") ##第二个成分

### start to plot
p <- df %>% 
  ggplot(aes(x=PC1,y=PC2,fill= GroupforExpansionLoad))+
  # geom_point(aes(color=Subspecies),alpha=0.5,size=2)+
  geom_point(alpha=0.8,size=2,color="gray",shape=21)+
  geom_hline(yintercept=0, linetype="dashed", color = "black")+
  geom_vline(xintercept = 0, linetype="dashed", color = "black")+
  # scale_shape_manual(values=c(23, 22,21,24,25), name="Continent")+
  # scale_shape_manual(values=c(15, 16,17,18,19), name="Continent")+
  # scale_color_manual(values = colB,name="Genetic group")+
  scale_fill_manual(values = colB,name="Hexaploid by subspecies")+
  # scale_shape_manual(values = shapeB)+
  labs(x=paste("PC1 ","(",variance1,")",sep = ""), y=paste("PC2 ","(",variance2,")",sep = ""))+
  theme_classic()+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()
      ,panel.background = element_blank()
      ,axis.line = element_line(size=0.3, colour = "black")
      # ,legend.position = 'none'
      ,text = element_text(size = 12))  
p
ggsave(p,filename = "~/Documents/p4.pdf",height = 3,width = 3)
ggsave(p,filename = "~/Documents/test.pdf",height = 3,width = 6)

library(plotly)
fig <- df %>%
  # filter(Subspecies %in% c("Landrace_Yunnan","yunna-nense")) %>%
  # filter(Subspecies == "Spelt_CAU" |Subspecies == "spelta") %>%
  # filter(Subspecies %in% c("petropavlovskyi","Landrace_Xinjiang")) %>%
  # filter(Subspecies == "Landrace",Group %in% c("JiaoLab","LuLab","CAAS") ) %>%
  plot_ly(x = ~ PC1, y = ~ PC2,color = ~ GroupforExpansionLoad,
               text = ~paste(Taxa),
               colors = colB)

fig

```

#### ====== Load Map and boxplot
## plot Bubble map vivid 主题
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz") 

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Longitude,Latitude)

# dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
#   select(Taxa,GenomeType,Longitude,Latitude,GroupforExpansionLoad,Subspecies_by6_TreeValidated)

df <- dfdel %>% 
  left_join(dftaxaDB,by="Taxa") %>% 
  filter(GenomeType=="AABBDD") %>% 
  filter(Subgenome=="D") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  filter(Stand_LoadGroup == choiceA) %>% ### choice
  select(-c("GenomeType","Subgenome","DelCountGroup","Stand_LoadGroup")) %>% filter(!is.na(Longitude)) %>%  ### 802个个体 -> 543 个有经纬度信息
  filter(!Taxa=="ABD_0047") %>% 
  arrange(Stand_delCount)  #### 排序

mp<-NULL #定义一个空的地图  
# 注意： borders函数中的colour是地图国家边界线的颜色,fill是国家的填充颜色
mapworld<-borders("world",colour = "#dedfe0",fill="#dedfe0") #绘制基本地图
# mapworld<-borders("world",colour = "#dedfe0",fill="gray") #绘制基本地图
mpvoid<-ggplot()+ mapworld + ylim(-55,90)+theme_void()
# mpvoid<-ggplot()+ mapworld + ylim(0,70)+xlim(-20,147)+theme_void()

library(viridis)
mp<-mpvoid +
  geom_point(data = df,mapping = aes(x=Longitude,y=Latitude,
  size=Stand_delCount, color=Stand_delCount),
  alpha = 0.6)+
  scale_size_continuous(range=c(0.35,1.2))+ ## 0.5-3
  # scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))+
  # scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
  # scale_fill_gradientn(colours =rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))+
  # scale_color_gradientn(colours = rev(colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100)))
  scale_color_viridis(trans="log") +
  # scale_fill_viridis(trans="log")+
    # scale_color_gradient2(low = "blue", high = "red", mid = "white", 
    #   midpoint = 0.082, limit = c(0.07,0.0865),name="test") +
  # theme(legend.position = 'none')
mp

ggsave(mp,filename = "figs/map_s1062_2.pdf", height = 2.76,width = 6.89)
      
```


## plot Bubble map 循环画图
- 附件将 nonsyn  pph2 gerp1.5 gerp2.14 gerp3.1 vep  画出来
```{r}
### plot function
fun_plotMapLoad <- function(df_fun,factor_fun){
  # factor_fun <- str_sub(factor_fun,11)
  
  mp <- NULL #定义一个空的地图
  mapworld <- borders("world", colour = "#dedfe0", fill = "#dedfe0")
  mpvoid <- ggplot() + mapworld + ylim(-55, 90) + theme_void()
  
  library(viridis)
  mp <- mpvoid +
    geom_point(
      data = df_fun,
      mapping = aes(
        x = Longitude,
        y = Latitude,
        size = Stand_delCount,
        color = Stand_delCount
        # fill = Stand_delCount
      ),
      alpha = 0.8
    ) +
    labs(title = factor_fun)+
    theme(plot.title = element_text(hjust = 0.5))+
    scale_size_continuous(range = c(1, 3)) + ## 0.5-3
    # scale_color_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
    # scale_fill_manual(values = colorRampPalette(brewer.pal(n = 3, name = "RdBu"))(100))
    scale_color_viridis(trans = "log")+
    # scale_fill_viridis(trans = "log")+
    guides(color = guide_colorbar(order = 1,title = "Mutation load"),
           size = "none"
           # size = guide_legend(order = 2,reverse = T)
           )

  
  mp
  
  filename_fun <- str_c("~/Documents/","map_",factor_fun,".pdf") 
  print(filename_fun)
  ggsave(mp,
         filename = filename_fun,
         height = 2.76,
         width = 6.89)
  
  return(mp)
}


dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")

dftaxaDB <- read.delim("/Users/Aoyue/project/wheatVMap2_1000/001_germplasm/020_WheatVMap2_GermplasmInfo.txt") %>% select(Taxa=VcfID,GenomeType,Longitude,Latitude)

factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")

factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")

df <- dfdel %>% 
  right_join(dftaxaDB,by="Taxa") %>% 
  filter(GenomeType=="AABBDD") %>% 
  filter(Subgenome=="D") %>% 
  filter(DelCountGroup == "TotalDerivedSNPCount") %>% 
  # filter(Stand_LoadGroup == choiceA) %>% ### choice
  filter(Stand_LoadGroup %in% factorA) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>% 
  mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>% 
  filter(Stand_delCount!= 0) %>% 
  select(-c("GenomeType","Subgenome","DelCountGroup")) %>%
  filter(!is.na(Longitude)) %>%  ### 802个个体 -> 543 个有经纬度信息
  group_by(Stand_LoadGroup) %>% 
  arrange(Stand_delCount) %>%  #### 排序
  group_map(~fun_plotMapLoad(.x,.y))


q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
                                     df[[1]],df[[2]],df[[3]],
                                     df[[7]]),
                       # labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 2,nrow = 4)
  
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```


## boxplot load hexaploid 多种情况循环画图
- 可以选择定义有害突变的方式 
unique(dfdel$Stand_LoadGroup)
```{r}

fun_plotMutationLoadBoxplot <- function(df,factor_fun){
  
  dfV1 <- df %>%
    group_by(Subgenome, GroupforExpansionLoad) %>%
    summarise(mean = mean(Stand_delCount)) %>%
    arrange(Subgenome, mean) %>%
    ungroup() %>%
    filter(Subgenome == "D") %>%
    select(GroupforExpansionLoad)

  ### 因子排序,排序后计数
  df$GroupforExpansionLoad <-
    factor(df$GroupforExpansionLoad, levels = dfV1$GroupforExpansionLoad)

  ### 计算每个亚群的个数
  ### 千万注意数据框的分组情况，如sub VariantsGroup ，计算后核查确认！！！
  dftaxaCount <- df %>%
    group_by(Subgenome, GroupforExpansionLoad) %>%
    count() %>% ungroup() %>%
    distinct(GroupforExpansionLoad, .keep_all = T) %>%
    select(-Subgenome) %>%
    mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad, " (", n, ")", sep = "")) %>%
    select(-n)
  
  library(RColorBrewer)
  p <- df %>%
    ggplot(aes(x = GroupforExpansionLoad, y = Stand_delCount)) +
    labs(y = str_c("Individual-based mutation load"),
         x = "",
         title = factor_fun) +
    geom_boxplot(aes(fill = GroupforExpansionLoad),
                 outlier.color = NA,
                 alpha = 0.6) + ### width=0.1
    geom_point(
      position = position_jitterdodge(0.6),
      alpha = 0.4,
      aes(color = Subgenome),
      size = 0.2
    ) +
    stat_summary(
      aes(group = Subgenome),
      fun = mean,
      geom = "point",
      color = "blue",
      position = position_dodge(1),
      size = 0.7
    ) +
    scale_color_manual(values = c("#fd8582", "#967bce", "#4bcdc6"),
                       name = '') +
    scale_fill_manual(values = colorRampPalette(rev(brewer.pal(
      n = 1, name = "RdBu"
    )))(16)) +
    scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount) +
    theme(strip.background = element_blank()) +
    # theme(plot.margin=unit(rep(1.3,4),'lines'))+
    coord_flip() +
    theme(
      plot.title = element_text(hjust = 0.5),
      panel.background = element_blank(),
      panel.border = element_rect(
        colour = "black",
        fill = NA,
        size = 0.4
      ),
      text = element_text(size = 7),
      legend.position = 'none'
    )
  p
  
  filename_fun <- str_c("~/Documents/",factor_fun,".pdf")
  ggsave(p,
         # filename = "/Users/Aoyue/Documents/test.pdf",
         filename = filename_fun,
         height = 4.2,
         width = 2.8)
  
  print(filename_fun)
  
  return(p)
  
}


### Begin
factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")

factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")


### ********************** Section1 *********************************###
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad)

### ********************** Section2 *********************************###
### del load 数据框
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
df <- dfdel %>%
  left_join(dftaxaDB, by = "Taxa") %>%
  filter(GenomeType == "AABBDD") %>%
  filter(Subgenome == "D") %>%
  filter(DelCountGroup == "TotalDerivedSNPCount") %>%
  select(Taxa,Subgenome,Stand_LoadGroup,Stand_delCount,GroupforExpansionLoad) %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  filter(Stand_LoadGroup %in% factorA) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>% 
  mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>% 
  group_by(Stand_LoadGroup) %>% 
  group_map(~fun_plotMutationLoadBoxplot(.x,.y))

    
q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
                                     df[[1]],df[[2]],df[[3]],
                                     df[[7]]),
                       # labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 3,nrow = 3)
  
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```


## boxplot load tetraploid 多种情况循环画图
- 可以选择定义有害突变的方式 
unique(dfdel$Stand_LoadGroup)
```{r}

fun_plotMutationLoadBoxplot <- function(df,factor_fun){
  
  dfV1 <- df %>%
    group_by(Subgenome, GroupforExpansionLoad) %>%
    summarise(mean = mean(Stand_delCount)) %>%
    arrange(Subgenome, mean) %>%
    ungroup() %>%
    filter(Subgenome == "A") %>%
    select(GroupforExpansionLoad)

  ### 因子排序,排序后计数
  df$GroupforExpansionLoad <-
    factor(df$GroupforExpansionLoad, levels = dfV1$GroupforExpansionLoad)

  ### 计算每个亚群的个数
  ### 千万注意数据框的分组情况，如sub VariantsGroup ，计算后核查确认！！！
  dftaxaCount <- df %>%
    group_by(Subgenome, GroupforExpansionLoad) %>%
    count() %>% ungroup() %>%
    distinct(GroupforExpansionLoad, .keep_all = T) %>%
    select(-Subgenome) %>%
    mutate(GroupforExpansionLoad_AddCount = str_c(GroupforExpansionLoad, " (", n, ")", sep = "")) %>%
    select(-n)
  
  library(RColorBrewer)
  p <- df %>%
    ggplot(aes(x = GroupforExpansionLoad, y = Stand_delCount)) +
    labs(y = str_c("Individual-based mutation load"),
         x = "",
         title = factor_fun) +
    geom_boxplot(aes(fill = GroupforExpansionLoad),
                 outlier.color = NA,
                 alpha = 0.6) + ### width=0.1
    geom_point(
      position = position_jitterdodge(0.6),
      alpha = 0.4,
      aes(color = Subgenome),
      size = 0.2
    ) +
    stat_summary(
      aes(group = Subgenome),
      fun = mean,
      geom = "point",
      color = "blue",
      position = position_dodge(1),
      size = 0.7
    ) +
    scale_color_manual(values = c("#fd8582", "#967bce", "#4bcdc6"),
                       name = '') +
    scale_fill_manual(values = colorRampPalette(rev(brewer.pal(
      n = 3, name = "RdBu"
    )))(9)) +
    scale_x_discrete(labels = dftaxaCount$GroupforExpansionLoad_AddCount) +
    theme(strip.background = element_blank()) +
    # theme(plot.margin=unit(rep(1.3,4),'lines'))+
    coord_flip() +
    theme(
      plot.title = element_text(hjust = 0.5),
      panel.background = element_blank(),
      panel.border = element_rect(
        colour = "black",
        fill = NA,
        size = 0.4
      ),
      text = element_text(size = 7),
      legend.position = 'none'
    )
  p
  
  filename_fun <- str_c("~/Documents/",factor_fun,".pdf")
  ggsave(p,
         # filename = "/Users/Aoyue/Documents/test.pdf",
         filename = filename_fun,
         height = 4.2,
         width = 2.8)
  
  print(filename_fun)
  
  return(p)
  
}


### Begin
factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")

factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")


### ********************** Section1 *********************************###
dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  select(Taxa,GenomeType,GroupforExpansionLoad)

### ********************** Section2 *********************************###
### del load 数据框
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")
df <- dfdel %>%
  left_join(dftaxaDB, by = "Taxa") %>%
  filter(GenomeType == "AABB") %>%
  filter(Subgenome == "A") %>%
  filter(DelCountGroup == "TotalDerivedSNPCount") %>%
  select(Taxa,Subgenome,Stand_LoadGroup,Stand_delCount,GroupforExpansionLoad) %>% 
  filter(!is.na(GroupforExpansionLoad)) %>% 
  filter(Stand_LoadGroup %in% factorA) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>% 
  mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>% 
  group_by(Stand_LoadGroup) %>% 
  group_map(~fun_plotMutationLoadBoxplot(.x,.y))

    
q <- ggpubr::ggarrange(plotlist=list(df[[4]],df[[5]],df[[6]],
                                     df[[1]],df[[2]],df[[3]],
                                     df[[7]]),
                       # labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 3,nrow = 3)
  
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```


#### ====== Proportion in intron 3_UTR 5_UTR
## dftotal
```{r}
dftotal <- read_tsv("data/005_SNPAnnoDB/geneSNPAnno.txt.gz") %>%
  mutate(Sub=str_sub(Transcript, 9, 9))

  # filter(!is.na(Ancestral),Ancestral==Ref|Ancestral==Alt) %>%
  # mutate(IfRefisAnc = if_else(Ref == Ancestral,"Anc","Der")) %>% 
  # mutate(Sub=str_sub(Transcript, 9, 9)) %>% 

dftotal_ancestral <- dftotal %>% 
  filter(!is.na(Ancestral),Ancestral == Ref | Ancestral == Alt) 
```


```{r}
### 每个位点的注释信息
df1 <- dftotal %>% 
  group_by(Region) %>% 
  filter(!is.na(Region)) %>%
  filter(Region %in% c("Intron","UTR_3","UTR_5")) %>% 
  select(ID,Chr,Pos,Sub,Variants_Group=Region)

df2 <- dftotal %>% 
  anti_join(df1,by="ID") %>% 
  filter(!is.na(Variant_type)) %>% 
  filter( Variant_type %in%  c("NONSYNONYMOUS","SYNONYMOUS","STOP-GAIN")) %>% 
  select(ID,Chr,Pos,Sub,Variants_Group=Variant_type)

df3 <- dftotal %>% 
  anti_join(df1,by="ID") %>% 
  anti_join(df2,by="ID") %>% 
  filter(Impact_VEP=="HIGH") %>%
  filter(str_detect(Effect_VEP,"splice_")) %>% 
  mutate(Variants_Group="Essential splice") %>% 
  select(ID,Chr,Sub,Pos,Variants_Group)

factorA <- c("Intron","UTR_5","UTR_3","SYNONYMOUS","NONSYNONYMOUS","Essential splice","STOP-GAIN")
labelsA <- c("Intron","UTR_5","UTR_3","Synonymous","Nonsynonymous","Essential splice","Nonsense")

df_anno <- rbind(df1,df2,df3) %>% 
  mutate(Variants_Group=factor(Variants_Group,levels=factorA,labels = labelsA)) %>% 
  select(ID,Sub,Variants_Group)

### 合计 2670892 个位点得到注释

### ---------------------------------------------------------------------------
###每个亚群的AAF
### 注意这里的AAF中，有 ancestral 状态的为DAF，没有ancestral 状态的 AAF
### 因为使用的是 ancestralVCF 计算的AAF
dftotal_AAF <- read_tsv("data/015_AAF/001_aaf_bySubspecies/source/aaf_bySubspecies_filterAAF0.txt.gz") 

dftotal_AAF_filter1 <- dftotal_AAF %>% 
    filter(AAF!=1)
### 合计 5343891 AAF 在0-1之间

### 联合上文求出的 ID,Chr,Pos,Variants_Group 对每个点进行注释,
### 联合 dftaxaDB,对每个亚群进行倍性的注释

dftaxaDB <- read_tsv("data/001_TaxaDB/004_taxaDB_GroupforExpansionLoad.txt") %>% 
  distinct(GroupforExpansionLoad,.keep_all = T) %>% 
  select(GenomeType,GroupforExpansionLoad,GroupforExpansionLoad2)

dfSNPdb <- dftotal_AAF_filter1 %>% 
  mutate(ID=str_c(Chr,"-",Pos)) %>% 
  left_join(df_anno,by="ID") %>% 
  filter(!is.na(Variants_Group))

dfSNPdb_count <- dfSNPdb %>% 
  group_by(Group,Variants_Group) %>% 
  count() %>% 
  left_join(dftaxaDB,by=c("Group"="GroupforExpansionLoad2"))
```


### plot 全基因组各种变异类型的比例
```{r}

factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")

df <- dfSNPdb_count %>% 
  mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>% 
  mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>% 
  group_by(GenomeType2) %>% 
  group_map(~{
    ### 画图函数
   p <- .x %>% 
      ggplot(aes(x=GroupforExpansionLoad, y=n,fill=Variants_Group)) +
      geom_bar(
        stat = "identity",
        # alpha = 1,
        position = "fill",
        alpha = 0.9
      ) +
      # geom_vline(xintercept = 4.5,linetype = "solid",color = "black") +
      # geom_text(aes(label=  paste(prettyNum(ObservedCount,big.mark = ","))),position = position_fill(),hjust=1, size=3) +
      # scale_fill_manual(values=colB)+
      scale_fill_brewer(palette = "Accent") +
      labs(y = "Frequency of categories by subspecies", x = "") +
      coord_flip() +
      theme(
        panel.background = element_rect(
          size = 0.7,
          colour = 'black',
          fill = 'white'
        ),
        panel.grid = element_blank(),
        text = element_text(size = 12)
        # ,legend.position = 'none'
      )
    
    p
    ggsave(p,
           filename = str_c("~/Documents/",.y,"_proportion_VariantsGroup.pdf"),
           height = 4,
           width = 6)
    
    return(p)
    
  })
  
  q <- ggpubr::ggarrange(plotlist=list(df[[2]],df[[1]]),
                       # labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 1,nrow = 2)
  
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```

## plot 不同变异类型，不同亚群的比较-分面图
```{r}
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")


df <- dfSNPdb_count %>% 
  mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>% 
  mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>% 
  group_by(GenomeType2) %>% 
  group_map(~{
    group1 <- c("Intron","UTR_5","UTR_3","Syn","Nonsyn","Essential splice","Nonsense")
    
    p <- .x %>%
      mutate(Group = factor(Variants_Group, levels = group1)) %>%
      ggplot(aes(x = GroupforExpansionLoad, y = n, fill = Variants_Group)) +
      geom_bar(stat = "identity") +
      scale_fill_brewer(palette = "Accent") +
      facet_wrap( ~ Variants_Group, scales = "free",ncol = 2) +
      labs(x = "Subspecies", y = "Number of sites observed\nto be variable") +
      scale_y_continuous(labels = scales::comma_format(big.mark = ',')) +
      theme_bw() +
      theme(legend.position = "none") +
      theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1,size = 9))
    p
    
       ggsave(p,
           filename = str_c("~/Documents/",.y,"_count_VariantsGroup.pdf"),
           height = 7,
           width = 7)
    
    return(p)
    
  })
```

#### ====== Proportion in del non syn
```{r}
###每个亚群的AAF
### 注意这里的AAF中，指得是DAF，全是有 ancestral 状态的位点
### 因为使用的是 ancestralVCF 计算的AAF
dftotal_withAncestral_AAF <- read_tsv("data/015_AAF/001_aaf_bySubspecies/001_aaf_bySubspecies.txt.gz")

dftotal_withAncestral_AAF_filter1 <- dftotal_withAncestral_AAF %>% 
  filter(AAF!=1)

dfSNPdb_count <- dftotal_withAncestral_AAF_filter1 %>% 
  group_by(Group,LoadGroup,GenomeType) %>% 
  count() %>% 
  rename(GroupforExpansionLoad=Group,Variants_Group=LoadGroup)

```


### plot derived 各种变异类型的比例
```{r}

factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")

df <- dfSNPdb_count %>% 
  mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>% 
  mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>% 
  group_by(GenomeType2) %>% 
  group_map(~{
    ### 画图函数
   p <- .x %>% 
     mutate(Variants_Group=factor(Variants_Group,levels = c("Deleterious","Nonsynonymous","Synonymous"))) %>% 
      ggplot(aes(x=GroupforExpansionLoad, y=n,fill=Variants_Group)) +
      geom_bar(
        stat = "identity",
        # alpha = 1,
        position = "fill",
        alpha = 0.9
      ) +
      # geom_vline(xintercept = 4.5,linetype = "solid",color = "black") +
      # geom_text(aes(label=  paste(prettyNum(ObservedCount,big.mark = ","))),position = position_fill(),hjust=1, size=3) +
      # scale_fill_manual(values=colB)+
      scale_fill_manual(values = c("#d5311c","#e69f00","#004680"))+
      labs(y = "Frequency of categories by subspecies", x = "") +
      coord_flip() +
      theme(
        panel.background = element_rect(
          size = 0.7,
          colour = 'black',
          fill = 'white'
        ),
        panel.grid = element_blank(),
        text = element_text(size = 12)
        # ,legend.position = 'none'
      )
    
    p
    ggsave(p,
           filename = str_c("~/Documents/",.y,"_proportion_VariantsGroup.pdf"),
           height = 4,
           width = 6)
    
    return(p)
    
  })
  
  q <- ggpubr::ggarrange(plotlist=list(df[[2]],df[[1]]),
                       # labels = as.character(c(1:26)),
                       font.label = list(size = 7),
                       ncol = 1,nrow = 2)
  
cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```

## plot 不同LoadGroup变异类型，不同亚群的比较-分面图
```{r}
factorA <- c("Wild emmer","Domesticated emmer","Durum","Khorasan wheat","Polish wheat","Rivet wheat","Georgian wheat","Ispahanicum","Persian wheat","LR_WA","LR_AF","LR_EU","LR_AM","LR_EA","LR_CSA","Cultivar","OtherHexaploids","Club wheat","Indian dwarf wheat","Tibetan semi-wild","Vavilovii","Xinjiang wheat","Yunan wheat","Macha","Spelt","Strangulata")


df <- dfSNPdb_count %>% 
  mutate(GroupforExpansionLoad=factor(GroupforExpansionLoad,levels = factorA)) %>% 
  mutate(GenomeType2=if_else(GenomeType=="AABBDD",GenomeType,"AABB_DD")) %>% 
  group_by(GenomeType2) %>% 
  group_map(~{
    group1 <- c("Intron","UTR_5","UTR_3","Syn","Nonsyn","Essential splice","Nonsense")
    
    p <- .x %>%
     mutate(Variants_Group=factor(Variants_Group,levels = c("Deleterious","Nonsynonymous","Synonymous"))) %>% 
      mutate(Group = factor(Variants_Group, levels = group1)) %>%
      ggplot(aes(x = GroupforExpansionLoad, y = n, fill = Variants_Group)) +
      geom_bar(stat = "identity") +
      scale_fill_manual(values = c("#d5311c","#e69f00","#004680"))+
      facet_wrap( ~ Variants_Group, scales = "free",ncol = 2) +
      labs(x = "Subspecies", y = "Number of sites observed\nto be variable") +
      scale_y_continuous(labels = scales::comma_format(big.mark = ',')) +
      theme_bw() +
      theme(legend.position = "none") +
      theme(axis.text.x = element_text(angle = 45,vjust = 1,hjust = 1,size = 8))
    p
    
       ggsave(p,
           filename = str_c("~/Documents/",.y,"_count_VariantsGroup.pdf"),
           height = 7,
           width = 7)
    
    return(p)
    
  })
```


#### ============
## spatstat
## Spatstat world map
- 画出7种类型的 spatstat 地图
```{r}
### 先构建函数
fun_plotSpatstat <- function(df,fun_factor){
  library(raster)
# detach(package:raster,unload = T)
library(spatstat)
library(RColorBrewer)

  choiceA <- fun_factor
  
ppp.df <- ppp(df$Longitude, df$Latitude, c(-180, 180), c(-60, 90))
marks(ppp.df) <- df[, 2] ### load ratio 做为 marks 信息
# plot(density(ppp.df)) ###画出点的聚集程度 测试
ppp.df2 <- Smooth(ppp.df) ### 求每个点对应的load值的大小
ppp.df2.m <- ppp.df2$v  ### 获取矩阵内的密度数据
###矩阵上下颠倒128 -> 1 ,依次减一
ppp.df2.m1 <- ppp.df2.m[seq(128, 1, by = -1), ]

### 创建 raster 类，并将密度的结果填充进去
df.raster <- raster(nrows = 128, ncols = 128, xmn = -180, xmx = 180, ymn = -60, ymx = 90)
values(df.raster) <- as.vector(t(as.matrix(ppp.df2.m1)))
plot(df.raster)

world.shp <- shapefile("data/002_spatstat/spatstat/shapefile/TM_WORLD_BORDERS_SIMPL-0.3/TM_WORLD_BORDERS_SIMPL-0.3.shp")
df.raster.final <- mask(df.raster, world.shp)
par(mfrow = c(1,1))

# pdf(file = str_c("figs/Fig4/misc",choiceA,"_wholeWorld.pdf"),height = 3,width = 4)
### whole world
png(filename = str_c("~/Documents/",choiceA,"_wholemap.png"),units = 'in',height = 4,width = 5.3,res = 300)
plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xaxt="n",yaxt="n")
points(df$Longitude, df$Latitude, pch = 16, cex = 0.5,col=rgb(252, 110, 110,90,maxColorValue=255))
axis(1,at=seq(-150,180,60))
axis(2,at=seq(-90,90,60))
dev.off()

### 亚欧大陆和非洲
png(filename =str_c("~/Documents/",choiceA,"_Eurasia.png"),units = 'in',height = 4,width = 5.3,res = 300)
# pdf( "~/Documents/test.pdf")
plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-20,150),ylim=c(-35,62),xaxt="n",yaxt="n")
points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
axis(1,at=seq(0,150,30))
axis(2,at=seq(-20,62,20))
# axis(1,at=seq(0,150,30),labels = c("0","","","","",""))
# axis(2,at=seq(0,62,20))
dev.off()

### 美洲和非洲
png(filename = str_c("~/Documents/",choiceA,"_American.png"),units = 'in',height = 4,width = 5.3,res = 300)
plot(df.raster.final, col = colorRampPalette(rev(brewer.pal(n = 3, name = "RdBu")))(100),xlim=c(-150,60),ylim=c(-60,62),xaxt="n",yaxt="n")
points(df$Longitude, df$Latitude, pch = 16, cex = 0.8,col=rgb(252, 110, 110,90,maxColorValue=255))
axis(1,at=seq(-150,60,30))
axis(2,at=seq(-60,62,30))
dev.off()
}


```

### plot
```{r}
choiceA <- "Ratio_031_GERP16way_1.5andDerived_PolyPhen2_0.5"

factorA <- c("Ratio_002_nonsynonymous","Ratio_033_Derived_PolyPhen2_0.5","Ratio_026_Derived_PolyPhen2_probably","Ratio_032_GERP16way_1.5","Ratio_014_GERP16way_2.14_max","Ratio_025_GERP16way_3.1","Ratio_007_VEP")

factorA_label <- c("Nonsynonymous","PPH2","PPH2_Probably","GERP>1.5","GERP>2.14","GERP>3.1","VEP_HighImpact")

### 先处理del文件
dfdel <- read_tsv("data/004_delCount/delRatio.txt.gz")

dftaxaDB <- read_tsv("data/001_TaxaDB/WheatVMap2_GermplasmInfo.txt") %>%
  select(Taxa=VcfID,GenomeType,Longitude,Latitude)

set.seed(123)
df1 <- dfdel %>% right_join(.,dftaxaDB,by="Taxa") %>%
  filter(GenomeType=="AABBDD") %>%
  filter(Subgenome=="D") %>%
  filter(DelCountGroup == "TotalDerivedSNPCount") %>%
  # filter(Stand_LoadGroup == choiceA) %>%
  ### filter data
  filter(Stand_LoadGroup %in% factorA) %>% 
  mutate(Stand_LoadGroup=factor(Stand_LoadGroup,levels = factorA,labels = factorA_label)) %>% 
  mutate(Stand_LoadGroup=as.character(Stand_LoadGroup)) %>% 
  select(-IfSelected,-Group_refobj) %>%
  select(-c("GenomeType","Subgenome","DelCountGroup")) %>%
  filter(!is.na(Longitude)) %>%  ### 802个个体 -> 543 个有经纬度信息
  arrange(Stand_delCount)

df <- df1 %>%
  mutate(Longitude = Longitude + runif(nrow(df1),0.001,0.005)) %>% ##通过添加随机数，避免重复
  group_by(Stand_LoadGroup) %>% 
  group_map(~fun_plotSpatstat(.x,.y))
  
### M2
# q <- ggpubr::ggarrange(plotlist=list(df[[]],df[[]],df[[]],df[[]],
#                                      df[[]],df[[]],df[[]]),
#                        labels = as.character(c("")),
#                        font.label = list(size = 7),
#                        ncol = 4,nrow = 7)
# cowplot::save_plot(q,filename = "~/Documents/test.pdf",base_height = 7.2,base_width = 7.2)

```