准备数据
需要准备两个数据:一个是基因表达谱,另一个是基因的注释(可以为KO注释,也可以是别的什么注释)
基因表达谱
sample1 | sample2 | sample3 | ... | |
gene1 | 1.0 | 2.0 | 2.0 | ... |
gene2 | 3.0 | 3.0 | 4.0 | ... |
gene3 | 5.0 | 5.0 | 5.0 | ... |
gene4 | 6.0 | 7.0 | 9.0 | ... |
... | ... | ... | ... | ... |
通路信息
gene | KO | pathway |
gene1 | KO1 | pathway1 |
gene2 | KO2 | pathway1 |
gene3 | KO2 | pathway2 |
... | ... | ... |
模拟数据
library(tidyverse)
library(magrittr)
library(circlize)
#模拟数据
## Data1
fpkm <- rbind(cbind(matrix(rnorm(500*3, mean = 1), nr = 500),
matrix(rnorm(500*3, mean = 2), nr = 500),
matrix(rnorm(500*3, mean = 3), nr = 500)))
fpkm <- fpkm[sample(500, 500), ] # randomly permute rows
rownames(fpkm) <- paste0("gene", seq(500))
colnames(fpkm) <- c("A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3")
fpkm %<>% as.data.frame() %>% mutate(gene = row.names(.))
# Data2
pathways <- rep(paste0("pathway", seq(6)), sample(12:20, size = 6)) %>% sample(70)
KOs <- rep(paste0("KO", seq(20)), sample(5:20, size = 20, replace = TRUE)) %>% sample(70)
KOannotation <- data.frame(KO=KOs, pathway=pathways)
KOannotation <- KOannotation[sample(70, 200, TRUE),]
KOannotation$gene <- sample(paste0("gene", seq(500)),200)
# 假设你有富集到的想要可视化的通路
maps <- c("pathway1", "pathway2", "pathway3")
samples <- c("A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3")
思考画图数据类型
首先简单介绍一下circos对象。正如普通的图有x轴和Y轴,你可以理解为一个circos图有很多个轴(具体数量由你数据确定)。那么每个轴上自然就有对应位置(如下图中的1、2、3、4)。
那么自然而然的很容易想象,如果要画link,需要一个类似这样的数据
from_axis | from_position | to_axis | to_position |
A | 1 | B | 4 |
A | 2 | C | 5 |
A | 3 | D | 6 |
进一步的,如果要控制link的宽度,那应该需要规定每个link的起始和结束的位置
from_axis | from_position_start | from_position_end | to_axis | to_position_start | to_position_end |
A | 0.5 | 1.5 | B | 3.5 | 4.5 |
A | 1.5 | 2.5 | C | 4.5 | 5.5 |
A | 2.5 | 3.5 | D | 5.5 | 6.5 |
考虑完link之后,考虑热图。首先确认是用circos.heatmap()
画的热图(如上图)。这个数据比较简单,因此考虑先画出热图,再考虑怎么画中间的和弦图。
热图
数据清洗
# 准备热图颜色
col_fun1 = colorRamp2(c(-2, 0, 2), c("#247ab5", "white", "#fda1a0"))
# 需要画图的基因
plot_gene <- KOannotation %>%
filter(pathway %in% maps) %>%
pull(gene) %>%
{filter(fpkm, row.names(fpkm) %in% .)}
# 需要画图的KO
plot_KO <- plot_gene %>%
left_join(KOannotation) %>%
filter(pathway %in% maps) %>% # There are some unenriched map
group_by(KO) %>%
summarise(across(samples,sum))
# 需要画图的Pathway
plot_map <- plot_gene %>%
left_join(KOannotation) %>%
filter(pathway %in% maps) %>% # There are some unenriched map
group_by(pathway) %>%
summarise(across(samples,sum))
plot_data1 <- bind_rows(plot_gene %>% rename(id=gene),
plot_KO %>% rename(id=KO),
plot_map %>% rename(id=pathway)) %>%
`row.names<-`(.$id) %>%
select(-id)
plot_data1 <- t(scale(t(plot_data1))) %>% as.data.frame()
# 在热图上划分为gene、KO、pathway
lev_split = row.names(plot_data1) %>% str_match("[a-zA-Z]+") %>% factor()
画图
circos.clear()
circos.par(gap.degree=10, track.height=0.1)
# 分多次画表达谱数据,更有层次
circos.heatmap(plot_data1[samples[1:3]], split = lev_split , col = col_fun1, rownames.side = "outside", cluster = TRUE)
circos.heatmap(plot_data1[samples[7:9]], split = lev_split , col = col_fun1)
circos.heatmap(plot_data1[samples[4:6]], split = lev_split , col = col_fun1)
根据lev_split
,这个热图划分为gene
,KO
,和pathway
三个轴。这里需要指出的是,每个gene、KO、pathway在每个轴上占的长度都是1,例如gene67在gene轴上的位置为0-1,pathway2在pathway轴上的位置为0-1.因此我们要画link的话,根据之前的讨论,如果有三个KO需要连接到pathway2,应该需要一个类似下面的数据:
from_axis | from_position_start | from_position_end | to_axis | to_position_start | to_position_end |
KO | 0.5 | 1.5 | pathway | 0 | 0.333 |
KO | 1.5 | 2.5 | pathway | 0.333 | 0667 |
KO | 2.5 | 3.5 | pathway | 0.667 | 1 |
注意上表,可能存在多个KO连接到一个pathway的情况,因此我们要对start和end位置进行合理的拆分,以避免重叠。这样做的另一个好处是可以让link线有粗有细,看上去美观许多。所以
另一方面我们要注意,由于circos热图上的基因的是根据聚类结果排布的,所以和数据框里的数据顺序已经不一样了,因此我们首先需要获取画图后的每个gene、KO、pathway在其对应轴上的坐标,这时就需要用到circlize
的get.cell.meta.data
从图上获得相应信息
和弦图
数据清洗
根据上面的讨论结果,我们的数据清洗要实现两个目的:
- 获取热图生成后的gene、KO、pathway上每个轴的信息,可以整理成如下格式:
id | sector | position |
gene1 | gene | 23 |
KO1 | KO | 45 |
pathway1 | pathway | 56 |
表A
- 将一对多的gene-KO关系、KO-pathway关系进行计算,以获得相对的位置
from_axis | from_position_start | from_position_end | to_axis | to_position_start | to_position_end |
gene | 0.5 | 1.5 | KO | 5 | 5.33 |
gene | 1.5 | 2.5 | KO | 5.33 | 5.66 |
gene | 2.5 | 3.5 | KO | 5.66 | 6 |
表B
注意观察上表,由于3个基因连接到了相同的KO,所以我将他们分别连接到了KO不同的位置
进一步,想要获得这个表,可以把过程拆分为以下几步:
2.1生成一个连接对象表
from | to |
gene1 | KO2 |
gene2 | KO1 |
KO1 | pathway1 |
2.2 根据连接对象在表中出现的次数,再计算一个位移
from | to | from_start | from_end | to_start | to_end |
gene1 | KO2 | 0 | 1 | 0 | 0.333 |
gene2 | KO1 | 0 | 1 | 0.333 | 0.667 |
KO1 | pathway1 | 0 | 0.5 | 0.667 | 1 |
KO1 | pathway2 | 0.5 | 1 | 0 | 0.0333 |
表C
2.3 结合表A和表C,计算得到表B
明确思路以后,下面是代码
# 1 获得gene、KO、pathway在每个轴上的位置
plot_data3 = data.frame()
for(lev in levels(lev_split)){
a <- rownames(plot_data1)[lev_split==lev][get.cell.meta.data("row_order", sector.index = lev)]
a <- seq(length(a)) %>% `names<-`(a) %>% enframe("id", "position")
a$sector = lev
plot_data3 <- rbind(plot_data3, a)
}
plot_data3$position <- plot_data3$position - 1 # 因为每个元素的范围是0~1,所以统一减一方便后面相加
#2.1 获取点对点连接表
plot_data2 <- KOannotation %>%
filter(gene %in% row.names(plot_data1), KO %in% row.names(plot_data1)) %>%
select(gene, KO) %>%
rename(from=gene, to=KO)
tmp_obj1 <- KOannotation %>%
filter(KO %in% row.names(plot_data1), pathway %in% row.names(plot_data1)) %>%
select(KO, pathway) %>%
rename(from=KO, to=pathway)
plot_data2 %<>% bind_rows(tmp_obj1) %>% distinct(from, to)
#2.2 计算位移
tmp_obj1<-plot_data2 %>%
group_by(from) %>%
mutate(V1=1/n()) %>%
group_by(from) %>%
mutate(from_end = cumsum(V1),
from_start = from_end-V1) %>%
select(from, to, from_start, from_end)
tmp_obj2<-plot_data2 %>%
group_by(to) %>%
mutate(V1=1/n()) %>%
group_by(to) %>%
mutate(to_end = cumsum(V1),
to_start = to_end-V1) %>%
select(from, to, to_start, to_end)
plot_data2 %<>% left_join(tmp_obj1) %>% left_join(tmp_obj2)
# 2.3 合并上述两表
plot_data2 %<>%
left_join(plot_data3, by=c('from'='id')) %>%
rename('from_position'=position, 'from_sector'=sector) %>%
left_join(plot_data3, by=c('to'='id')) %>%
rename('to_position'=position, 'to_sector'=sector)
由于上述plot_data2
包含了全部点之间的连线关系,我们可能不需要这么多,所以可以挑选自己想要展示的数据。这一步可能需要反复画图几次确定需要展示的数据
#3. 进一步筛选想要展示的连线
# highlight the pathway I wanted
highlight_pathway <- c("pathway1", "pathway2")
highlight_KO <- c("KO5", "KO6", "KO20")
highlight_gene <- c("gene292", "gene256", "gene67", "gene146", "gene52", "gene391", "gene139", "gene327", "gene218", "gene142", "gene375", "gene194")
plot_link <- plot_data2 %>% filter(to %in% c(highlight_pathway, highlight_KO)) %>% mutate(col="#dcdcdc80")
highlight_link <- plot_link %>% filter (from %in% c(highlight_gene, highlight_KO)) %>% mutate(col="#fb9b9a80")
plot_link <- plot_link %>% filter (!from %in% c(highlight_gene, highlight_KO))
plot_data2<-rbind(plot_link, highlight_link)
使用circos.link
循环作图
for ( idx in seq(nrow(plot_data2))){
tmp_obj <- plot_data2[idx,]
circos.link(tmp_obj[['from_sector']],
c(tmp_obj[['from_position']] + tmp_obj[['from_start']],
tmp_obj[['from_position']] + tmp_obj[['from_end']]),
tmp_obj[['to_sector']],
c(tmp_obj[['to_position']] + tmp_obj[['to_start']],
tmp_obj[['to_position']] + tmp_obj[['to_end']]),
col = tmp_obj[['col']],
border = NA
)
}
全部代码
library(tidyverse)
library(magrittr)
library(circlize)
#模拟数据
## Data1
fpkm <- rbind(cbind(matrix(rnorm(500*3, mean = 1), nr = 500),
matrix(rnorm(500*3, mean = 2), nr = 500),
matrix(rnorm(500*3, mean = 3), nr = 500)))
fpkm <- fpkm[sample(500, 500), ] # randomly permute rows
rownames(fpkm) <- paste0("gene", seq(500))
colnames(fpkm) <- c("A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3")
fpkm %<>% as.data.frame() %>% mutate(gene = row.names(.))
# Data2
pathways <- rep(paste0("pathway", seq(6)), sample(12:20, size = 6)) %>% sample(70)
KOs <- rep(paste0("KO", seq(20)), sample(5:20, size = 20, replace = TRUE)) %>% sample(70)
KOannotation <- data.frame(KO=KOs, pathway=pathways)
KOannotation <- KOannotation[sample(70, 200, TRUE),]
KOannotation$gene <- sample(paste0("gene", seq(500)),200)
# 假设你有富集到的想要可视化的通路
maps <- c("pathway1", "pathway2", "pathway3")
samples <- c("A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3")
# 准备热图颜色
col_fun1 = colorRamp2(c(-2, 0, 2), c("#247ab5", "white", "#fda1a0"))
# 需要画图的基因
plot_gene <- KOannotation %>%
filter(pathway %in% maps) %>%
pull(gene) %>%
{filter(fpkm, row.names(fpkm) %in% .)}
# 需要画图的KO
plot_KO <- plot_gene %>%
left_join(KOannotation) %>%
filter(pathway %in% maps) %>% # There are some unenriched map
group_by(KO) %>%
summarise(across(samples,sum))
# 需要画图的Pathway
plot_map <- plot_gene %>%
left_join(KOannotation) %>%
filter(pathway %in% maps) %>% # There are some unenriched map
group_by(pathway) %>%
summarise(across(samples,sum))
plot_data1 <- bind_rows(plot_gene %>% rename(id=gene),
plot_KO %>% rename(id=KO),
plot_map %>% rename(id=pathway)) %>%
`row.names<-`(.$id) %>%
select(-id)
plot_data1 <- t(scale(t(plot_data1))) %>% as.data.frame()
# 在热图上划分为gene、KO、pathway
lev_split = row.names(plot_data1) %>% str_match("[a-zA-Z]+") %>% factor()
circos.clear()
circos.par(gap.degree=10, track.height=0.1)
# 分多次画表达谱数据,更有层次
circos.heatmap(plot_data1[samples[1:3]], split = lev_split , col = col_fun1, rownames.side = "outside", cluster = TRUE)
circos.heatmap(plot_data1[samples[7:9]], split = lev_split , col = col_fun1)
circos.heatmap(plot_data1[samples[4:6]], split = lev_split , col = col_fun1)
plot_data3 = data.frame()
for(lev in levels(lev_split)){
a <- rownames(plot_data1)[lev_split==lev][get.cell.meta.data("row_order", sector.index = lev)]
a <- seq(length(a)) %>% `names<-`(a) %>% enframe("id", "position")
a$sector = lev
plot_data3 <- rbind(plot_data3, a)
}
plot_data3$position <- plot_data3$position - 1 # 因为每个元素的范围是0~1,所以统一减一方便后面相加
#2.1 获取点对点连接表
plot_data2 <- KOannotation %>%
filter(gene %in% row.names(plot_data1), KO %in% row.names(plot_data1)) %>%
select(gene, KO) %>%
rename(from=gene, to=KO)
tmp_obj1 <- KOannotation %>%
filter(KO %in% row.names(plot_data1), pathway %in% row.names(plot_data1)) %>%
select(KO, pathway) %>%
rename(from=KO, to=pathway)
plot_data2 %<>% bind_rows(tmp_obj1) %>% distinct(from, to)
#2.2 计算位移
tmp_obj1<-plot_data2 %>%
group_by(from) %>%
mutate(V1=1/n()) %>%
group_by(from) %>%
mutate(from_end = cumsum(V1),
from_start = from_end-V1) %>%
select(from, to, from_start, from_end)
tmp_obj2<-plot_data2 %>%
group_by(to) %>%
mutate(V1=1/n()) %>%
group_by(to) %>%
mutate(to_end = cumsum(V1),
to_start = to_end-V1) %>%
select(from, to, to_start, to_end)
plot_data2 %<>% left_join(tmp_obj1) %>% left_join(tmp_obj2)
# 2.3 合并上述两表
plot_data2 %<>%
left_join(plot_data3, by=c('from'='id')) %>%
rename('from_position'=position, 'from_sector'=sector) %>%
left_join(plot_data3, by=c('to'='id')) %>%
rename('to_position'=position, 'to_sector'=sector)
highlight_pathway <- c("pathway1", "pathway2")
highlight_KO <- c("KO5", "KO6", "KO20")
highlight_gene <- c("gene292", "gene256", "gene67", "gene146", "gene52", "gene391", "gene139", "gene327", "gene218", "gene142", "gene375", "gene194")
plot_link <- plot_data2 %>% filter(to %in% c(highlight_pathway, highlight_KO)) %>% mutate(col="#dcdcdc80")
highlight_link <- plot_link %>% filter (from %in% c(highlight_gene, highlight_KO)) %>% mutate(col="#fb9b9a80")
plot_link <- plot_link %>% filter (!from %in% c(highlight_gene, highlight_KO))
plot_data2<-rbind(plot_link, highlight_link)
for ( idx in seq(nrow(plot_data2))){
tmp_obj <- plot_data2[idx,]
circos.link(tmp_obj[['from_sector']],
c(tmp_obj[['from_position']] + tmp_obj[['from_start']],
tmp_obj[['from_position']] + tmp_obj[['from_end']]),
tmp_obj[['to_sector']],
c(tmp_obj[['to_position']] + tmp_obj[['to_start']],
tmp_obj[['to_position']] + tmp_obj[['to_end']]),
col = tmp_obj[['col']],
border = NA
)
}
成品
注意点
- 如果画多层圈图,会根据行号对齐,但是不会根据行名对齐。所以所有组的表达谱数据必须一步准备。
- 热图和和弦图代码之间进行了大量数据清洗和计算,不要怕,没关系。因为只有拿到热图之后才能获取对应gene、KO、pathway的位置
- 每次画图之前习惯性的用
circos.clear()
清理一下缓存。