# Read the input files# “header=T” means that the data has a title, and sep="\t" is used as the separatordata <-read.table("input/box_plots_mtcars.txt",header=T,sep="\t")df <- data[, c("mpg", "cyl", "wt")]df2 <-read.table("input/histogram_plots.txt",header=T,sep="\t")df3 <-read.table("input/volcano_plots.txt", header=T)df4 <-read.table("input/manhattan_plots_gwasResults.txt",header=T,sep="\t")df5 <-read.table("input/heatmaps.txt",header=T,sep="\t")# Covert data into matrix format# nrow(df5) and ncol(df5) return the number of rows and columns of matrix df5 respectively.dm <-data.matrix(df5[1:nrow(df5),2:ncol(df5)])# Get the row namesrow.names(dm) <- df5[,1]df6 <-read.table("input/ballon_plots_GO.txt", header=T, sep="\t")df7 <-read.table("input/box_plots_David_GO.txt",header=T,sep="\t")df7 <- df7[1:10,]
library(ggplot2) # R语言中最常用的基于grid的可视化工具# 另外两个比较常见的作图packagelibrary(gplots) library(plotrix)library(qqman) # 用于GWAS数据可视化library(pheatmap) #用于绘制热图,ComplexHeatmap也是另外一个常用的packagelibrary(scales) # map numeric value to colorlibrary(RColorBrewer) #提供常见的配色方案# reshape data in Rlibrary(reshape2) library(plyr)
# 指定输出pdf,路径为output/1.1.Basic_boxplot.pdf,高度宽度均为3pdf("output/1.1.Basic_boxplot.pdf", height =3, width =3)# ggplot从数据框df中读取作图所需的数据# aes(x=cyl, y=mpg)告诉ggplot2将数据框中的cyl列作为x轴,mpg列作为y轴ggplot(df, aes(x=cyl, y=mpg))+# 加号在ggplot中意思是在当前的ggplot对象上进行修改# draw the boxplot and fill it with graygeom_boxplot(fill="gray")+# Use the labs function to set the title and modify x and ylabs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+# Set the theme styletheme_classic()# Save the plotdev.off()
# Begin to plotp <-ggplot(df, aes(x=cyl, y=mpg))+geom_boxplot(fill="gray")+labs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+theme_classic()# Sava as pdfggsave("output/1.1.Basic_boxplot.pdf", plot=p, height =3, width =3)
ggplot(df, aes(x=cyl, y=mpg))+geom_boxplot(fill="gray")+labs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+theme_classic()
1b) Change continuous color by groups
ggplot(df, aes(x=cyl, y=mpg, fill=cyl))+# fill=cyl: 用颜色表示cyl一列的数值geom_boxplot()+labs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+scale_fill_brewer(palette="Blues")+# palette="Blues": 定义了一种数值到颜色的对应关系,数值越大蓝色的颜色越深theme_bw()
#Read the data tabledata=read.csv("boxplot_example.csv")####################I.Prepare the data#1.Normalize the data, etcfor (i in12:17){ data[,i]=log(data[,i]+1e-3) # log some expression values}for (i in9:17) { maxValue=max(data[,i]) #scale the data into 0-1 minValue=min(data[,i]) range=maxValue-minValue data[,i]=(data[,i]-minValue)/range}data$X8.Identity=data$X8.Identity/100#2.Make the new matrix for boxplot: cleaning the data tablelibrary("reshape")m=melt(data[,c(2,7:12,14:17)], id=1)# remove some columns not to show and reshape the matrix into 3 columns for boxplot drawing in bwplotcolnames(m)=c("Type","Feature","Normalized_Value") #define the new column names#3.Clean the names of each type and each feature#Merge sub-types of different elementsm[,1]=sub ("ncRNA_selected","RNAI", m[,1])m[,1]=sub ("ncRNA_3019","RNAII", m[,1])m[,1]=sub ("exon_CCDS","CDS", m[,1])m[,1]=sub ("five_prime_UTR","UTR", m[,1])m[,1]=sub ("three_prime_UTR","UTR", m[,1])m[,1]=sub ("ancestral_repeat","AP", m[,1])#Rename the featurem[,2]=sub('X7.GC','01.GC Content',m[,2])m[,2]=sub('X8.Identity','02.DNA Conservation',m[,2])m[,2]=sub('X9.z_score','03.RNA Struc. Free Energy',m[,2])m[,2]=sub('X10.SCI','04.RNA Struc. Cons.',m[,2])m[,2]=sub('X11.tblastx_score','05.Protein Conservation',m[,2])m[,2]=sub('X12.polyA_RNAseq_MAX','06.PolyA+ RNA-seq',m[,2])m[,2]=sub('X14.small_RNAseq_MAX','07.Small RNA-seq',m[,2])m[,2]=sub('X15.Array_totalRNA_MAX','08.Total RNA Array',m[,2])m[,2]=sub('X16.Array_polyA_MAX','09.PolyA+ RNA Array',m[,2])m[,2]=sub('X17.Array_nonpolyA_MAX','10.PolyA- RNA Array',m[,2])############################Making Boxplotlibrary("lattice")png("boxplot.png",width=1500,height=500) # pdf is recommended for most cases, or png for figure with huge amount of data points#pdf("boxplot.pdf") attach(m)bwplot(Normalized_Value ~ Type|Feature,fill=c("green","red","yellow","blue","light blue"),layout=c(10,1))dev.off()
ggplot(df, aes(x=cyl, y=mpg))+geom_violin(trim=FALSE)+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")
2b) Add summary statistics on a violin plot
(2b.1) Add median and quartile
ggplot(df, aes(x=cyl, y=mpg))+geom_violin(trim=FALSE)+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")+stat_summary(fun.y=mean, geom="point", shape=23, size=2, color="red")
or
ggplot(df, aes(x=cyl, y=mpg))+geom_violin(trim=FALSE)+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")+geom_boxplot(width=0.1)
(2b.2) Add mean and standard deviation
ggplot(df, aes(x=cyl, y=mpg))+geom_violin(trim=FALSE)+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")+stat_summary(fun.data="mean_sdl", fun.args =list(mult =1), geom="crossbar", width=0.1)
or
ggplot(df, aes(x=cyl, y=mpg))+geom_violin(trim=FALSE)+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")+stat_summary(fun.data=mean_sdl, fun.args =list(mult =1), geom="pointrange", color="red")
2c) Change violin plot fill colors
ggplot(df, aes(x=cyl, y=mpg, fill=cyl))+geom_violin(trim=FALSE)+geom_boxplot(width=0.1, fill="white")+labs(title="Plot of mpg per cyl", x="Cyl", y ="Mpg")+scale_fill_brewer(palette="Blues")+theme_classic()
##Use the plyr package plyr to calculate the average weight of each group :mu <-ddply(df2, "sex", summarise, grp.mean=mean(weight))head(mu)
### sex grp.mean
### 1 F 54.70
### 2 M 65.36
draw the plot
4d) Change fill colors
ggplot(df2, aes(x=weight, fill=sex))+geom_density(alpha=0.7)+geom_vline(data=mu, aes(xintercept=grp.mean, color=sex), linetype="dashed")+labs(title="Weight density curve",x="Weight(kg)", y ="Density")+scale_color_brewer(palette="Paired")+scale_fill_brewer(palette="Blues")+theme_classic()
4e) Change line colors
ggplot(df2, aes(x=weight, color=sex))+geom_density()+geom_vline(data=mu, aes(xintercept=grp.mean, color=sex), linetype="dashed")+labs(title="Weight density curve",x="Weight(kg)", y ="Density")+scale_color_brewer(palette="Paired")+theme_classic()
4f) Combine histogram and density plots
ggplot(df2, aes(x=weight, color=sex, fill=sex))+geom_histogram(binwidth=1, aes(y=..density..), alpha=0.5, position="identity")+geom_density(alpha=.2)+labs(title="Weight density curve",x="Weight(kg)", y ="Density")+scale_color_brewer(palette="Paired")+scale_fill_brewer(palette="Blues")+theme_classic()
ggplot(df, aes(x=cyl, y=mpg, fill=cyl, shape=cyl))+geom_dotplot(binaxis='y', stackdir='center', binwidth=1, dotsize=0.8)+labs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+#stat_summary(fun.data="mean_sdl", fun.args = list(mult=1), geom="crossbar", width=0.5) +scale_fill_brewer(palette="Blues")+#scale_color_brewer(palette="Blues") +theme_classic()
5d) Change dot colors, shapes and align types
ggplot(df, aes(x=cyl, y=mpg, color=cyl, shape=cyl))+geom_jitter(position=position_jitter(0.1), cex=2)+labs(title="Plot of mpg per cyl",x="Cyl", y ="Mpg")+scale_color_brewer(palette="Blues")+theme_classic()
##to draw high expression value in red, we use colorRampPalette instead of redblue in heatmap.2##colorRampPalette is a function in the RColorBrewer packagecr <-colorRampPalette(c("blue","white","red"))heatmap.2(dm, scale="row", #scale the rows, scale each gene's expression value key=T, keysize=1.1, cexCol=0.9,cexRow=0.8, col=cr(1000), ColSideColors=c(rep(c("blue","red"),5)), density.info="none",trace="none",#dendrogram='none', #if you want to remove dendrogram Colv = T,Rowv = T) #clusters by both row and col
9b) pheatmap package: pheatmap()
# pheatmap的annotation_col和annotation_row可以传入数据框,用于行和列的注释# annotation_col行数和矩阵列数相同,annotation_row行数和矩阵行数相同,它们都可以包含多列,用于标记不同的注释信息annotation_col =data.frame(CellType =factor(rep(c("Control", "Tumor"), 5)), Time =1:5)# annotation_col(annotation_row)的行名应与矩阵的列(行)名一致rownames(annotation_col) = colnames(dm)annotation_row =data.frame(GeneClass =factor(rep(c("Path1", "Path2", "Path3"), c(10, 4, 6))))rownames(annotation_row) = paste("Gene", 1:20, sep ="")# pheatmap接受一个列表用于设置annotation_col和annotation_row的颜色ann_colors =list(Time =c("white", "springgreen4"), CellType =c(Control ="#7FBC41", Tumor ="#DE77AE"), GeneClass =c(Path1 ="#807DBA", Path2 ="#9E9AC8", Path3 ="#BCBDDC"))# draw the heatmappheatmap(dm, cutree_col =2, cutree_row =3, #break up the heatmap by clusters you define cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE, #by default, pheatmap clusters by both row and col annotation_col = annotation_col, annotation_row = annotation_row,annotation_colors = ann_colors)# pheatmap默认会对行和列聚类,如果不想聚类,可以把cluster_rows和cluster_cols设成False# 其他常用参数包括颜色的设置等,例如color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdBu")))(100)# 具体可参考https://www.rdocumentation.org/packages/pheatmap/versions/1.0.12/topics/pheatmap
9c) ggplot2 package
##9.3.1.cluster by row and col##cluster and re-order rowsrowclust =hclust(dist(dm))reordered = dm[rowclust$order,]##cluster and re-order columnscolclust =hclust(dist(t(dm)))##9.3.2.scale each row value in [0,1]dm.reordered = reordered[, colclust$order]dm.reordered=apply(dm.reordered,1,rescale) #rescale is a function in the scales packagedm.reordered=t(dm.reordered) #transposed matrix##9.3.3.save col and row names before changing the matrix formatcol_name=colnames(dm.reordered) row_name=rownames(dm.reordered) ##9.3.4.change data format for geom_title colnames(dm.reordered)=1:ncol(dm.reordered)rownames(dm.reordered)=1:nrow(dm.reordered)dm.reordered=melt(dm.reordered)#melt is a function in the reshape2 packagehead(dm.reordered)##9.3.5.draw the heatmapggplot(dm.reordered, aes(Var2, Var1))+geom_tile(aes(fill = value), color ="white")+scale_fill_gradient(low ="white", high ="steelblue")+theme_grey(base_size =10)+labs(x ="", y ="")+scale_x_continuous(expand =c(0, 0),labels=col_name,breaks=1:length(col_name)) +scale_y_continuous(expand =c(0, 0),labels=row_name,breaks=1:length(row_name))
10) Ballon plots
10a) basic ballon plots
head(df6)
### Biological.process Fold.enrichment X.log10.Pvalue. col
### 1 Small molecule metabolic process 1.0 16 1
### 2 Single-organism catabolic process 1.5 12 1
### 3 Oxoacid metabolic process 2.0 23 1
### 4 Small molecule biosynthetic process 2.5 6 1
### 5 Carboxylic acid metabolic process 2.7 24 1
### 6 Organic acid metabolic process 2.7 25 1