3. 散点图
- 3.10 向散点图添加边际地毯
- 3.11 向散点图添加标签
- 3.12 绘制气泡图
- 3.13 绘制散点图矩阵
- 往期文章
- 参考书籍
3.10 向散点图添加边际地毯
# 使用 geom_rug() 函数添加边际地毯
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug()
# 通过向边际地毯线的位置坐标添加扰动并设定size减小线宽可以减轻边际地毯线的重叠程度
ggplot(faithful, aes(x=eruptions, y=waiting)) +
geom_point() +
geom_rug(position="jitter", size=.2)
3.11 向散点图添加标签
library(gcookbook)
# 以countries数据集为例,对各国医疗保健支出与婴儿死亡率之间的关系进行可视化
# 选取人均支出大于2000美元的国家的数据子集进行分析
subset(countries, Year==2009 & healthexp>2000)
sp <- ggplot(subset(countries, Year==2009 & healthexp>2000),
aes(x=healthexp, y=infmortality)) +
geom_point()
# annotate()函数指定标签坐标和标签文本
sp + annotate("text", x=4350, y=5.4, label="Canada") +
annotate("text", x=7400, y=6.8, label="USA")
# geom_text()函数自动添加数据标签
sp + geom_text(aes(label=Name), size=4)
调整标签位置,大家自行尝试。
# 对标签的位置进行调整
sp + geom_text(aes(label=Name), size=4, vjust=0)
sp + geom_text(aes(y=infmortality+.1, label=Name), size=4, vjust=0)
sp + geom_text(aes(label=Name), size=4, hjust=0)
sp + geom_text(aes(x=healthexp+100, label=Name), size=4, hjust=0)
如何只对自己想要的数据点添加标签。
注:有很多人在后台问我如何在火山图里给自己想要的基因添加注释。这里提供了一个思路。
# 新建一个数据
cdat <- subset(countries, Year==2009 & healthexp>2000)
cdat$Name1 <- cdat$Name
# 用%in%运算符找出绘图时希望抱怨的标签
idx <- cdat$Name1 %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland")
idx
# 根据上面的逻辑向量用 NA 重写变量 Name1 中的其它取值
cdat$Name1[!idx] <- NA
cdat
ggplot(cdat, aes(x=healthexp, y=infmortality)) +
geom_point() +
geom_text(aes(x=healthexp+100, label=Name1), size=4, hjust=0) +
xlim(2000, 10000)
3.12 绘制气泡图
调用 geom_point()
和 scale_size_area()
函数即可绘制气泡图。
# 示例数据
library(gcookbook) # For the data set
cdat <- subset(countries, Year==2009 &
Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "Luxembourg",
"Netherlands", "Switzerland"))
cdat
> cdat
Name Code Year GDP laborrate healthexp infmortality
1733 Canada CAN 2009 39599.04 67.8 4379.761 5.2
4436 Iceland ISL 2009 37972.24 77.5 3130.391 1.7
4691 Ireland IRL 2009 49737.93 63.6 4951.845 3.4
4946 Japan JPN 2009 39456.44 59.5 3321.466 2.4
5864 Luxembourg LUX 2009 106252.24 55.5 8182.855 2.2
7088 Netherlands NLD 2009 48068.35 66.1 5163.740 3.8
7190 New Zealand NZL 2009 29352.45 68.6 2633.625 4.9
9587 Switzerland CHE 2009 63524.65 66.9 7140.729 4.1
10454 United Kingdom GBR 2009 35163.41 62.2 3285.050 4.7
10505 United States USA 2009 45744.56 65.0 7410.163 6.6
p <- ggplot(cdat, aes(x=healthexp, y=infmortality, size=GDP)) +
geom_point(shape=21, colour="black", fill="cornsilk")
# 将GDP 映射给半径 (scale_size_continuous)
p
# 将GDP 映射给面积
p + scale_size_area(max_size=15)
如果x轴,y轴皆是分类变量,气泡图可以用来表示网格上的变量值。
# 对男性组和女性组求和
hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
# 转化为长格式(long format)
library(reshape2)
hec <- melt(hec, value.name="count")
ggplot(hec, aes(x=Eye, y=Hair)) +
geom_point(aes(size=count), shape=21, colour="black", fill="cornsilk") +
scale_size_area(max_size=20, guide=FALSE) +
geom_text(aes(y=as.numeric(Hair)-sqrt(count)/22, label=count), vjust=1,
colour="grey60", size=4)
3.13 绘制散点图矩阵
散点图矩阵是一种对多个变量两两之间关系进行可视化的有效方法。pairs()函数可以绘制散点图矩阵。
注:现在散点图矩阵有现成的R包(如
GGally_ggpairs
)。以下内容仅供了解。
# 示例数据
library(gcookbook) # For the data set
c2009 <- subset(countries, Year==2009,
select=c(Name, GDP, laborrate, healthexp, infmortality))
head(c2009
> head(c2009)
Name GDP laborrate healthexp infmortality
50 Afghanistan NA 59.8 50.88597 103.2
101 Albania 3772.605 59.5 264.60406 17.2
152 Algeria 4022.199 58.5 267.94653 32.0
203 American Samoa NA NA NA NA
254 Andorra NA NA 3089.63589 3.1
305 Angola 4068.576 81.3 203.80787 99.9
pairs(c2009[,2:5])
# 定义一个panel.cor函数来展示变量两两之间的相关系数以代替默认的散点图
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y, use="complete.obs"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
# 定义 panel.hist 函数展示各个变量的直方图
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="white", ...)
}
pairs(c2009[,2:5], upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.smooth)
# 线性模型替代lowess 模型
panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
cex = 1, col.smooth = "black", ...) {
points(x, y, pch = pch, col = col, bg = bg, cex = cex)
abline(stats::lm(y ~ x), col = col.smooth, ...)
}
pairs(c2009[,2:5], pch=".",
upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.lm)