Using ggplot functions to draw gene and sub-features such motifs and domains.
I have some hits from the motif search tool XSTREME and I would like to visualisation the locations of discovered motifs, which also have significant match to known TFBSs in Jaspar.
Previously I used the R package Sushi to visualise gene features, but it requires a bed file and specific feature names such as CDS, exon, UTR etc. But ggplot2 is on the other side more flexible.
I also have the matched TFBS names for some of the motif_alt_id, so I can merge the two data frames.
1
2
3
4
5
6
7
8
top10tfbs<-merge(top10tfbs0,mejsp,all.x=TRUE,by.x="motif_alt_id",by.y="V1")# also reverse the motif position because they are upstream TSStop10tfbs$loc<--1*(top10tfbs$start)motif_alt_idgenestartstopstrandp.valuematched_sequenceV2V3loc1MEME-22Smp_161160288302+5.48e-05TTTGGTCCATAATTTMA0543.1eor-1-2882MEME-8Smp_179650240254-3.90e-05AGCAACAGCCTCTGCMA0260.1che-1-2403MEME-8Smp_337590906920+3.24e-06AATGAATGCTTCATTMA0260.1che-1-906
Now I selected some of the genes to visualise their motif locations.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
top10markers<-top10tfbs%>%filter(grepl('Smp_0868|Smp_0465|Smp_2450|Smp_0824|Smp_0743|Smp_1762|Smp_1796|Smp_2106|Smp_0583|Smp_0973',gene))# and combine column names for the legendtop10markers$jaspar<-paste0(top10markers$motif_alt_id," (",top10markers$V3,")")library(ggplot2)# first draw some bars as the chromosome regions, we used 1000bp p.st1<-ggplot(data=top10markers,aes(gene,-1000))+geom_bar(stat="identity",fill="grey",width=0.05,height=1000,position=position_stack(reverse=TRUE))+coord_flip()+labs(x="Genes",y="Position")+ylim(-1000,0)# now add segments as the motifsp.st1<-p.st1+geom_segment(data=top10markers,aes(x=gene,xend=gene,y=start*(-1),yend=stop*(-1),color=jaspar),size=5)# change legend title and remove y-axis p.st1<-p.st1+theme_classic()+labs(color="Motifs")+theme(axis.ticks.y=element_blank(),,axis.line.y=element_blank())p.st1
The above example is to visualise all different sub-features on the same gene. Another situation is to visualise different genes with the same feature (such as protein domain).
For instance I have a list of clusters within which all genes have the same Pfam, and another file with information about the positions of matched Pfam, as well as length and coordinates of the peptides.
1
2
3
4
5
6
7
8
9
10
11
12
13
# information of the clusterssample_n(allClusters,2)chrPfampf_namedescriptiongene_counts1SM_V7_1PF01221Dynein_lightDynein_light_chain_type_1_72SM_V7_ZWPF17039Glyco_tran_10_NFucosyltransferase,_N-terminal3# information on peptides and pfam domainssample_n(pfamInfo,2)idchrstartstrandpep_lenPfampf_startpf_end1Smp_162630.1SM_V7_320366872-1058PF161912933612Smp_159600.2SM_V7_19907913-1762PF01391720773
So the idea is to visualise each cluster, where all peptides with their matched pfam features will be shown.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# select a clustercluster1<-allClusters[49,]# get all transcripts and pfam domains in that clusterscluster1.range<-pfamInfo[which(as.character(cluster1$chr)==as.character(pfamInfo$chr)&pfamInfo$start>=cluster1$first_start&pfamInfo$start<=cluster1$last_start&as.character(cluster1$Pfam)==as.character(pfamInfo$Pfam)),]cluster1.rangeidchrstartstrandpep_lenPfampf_startpf_end14417Smp_344710.1SM_V7_520379409+498PF097283433414418Smp_332120.1SM_V7_520465519+299PF0972858314419Smp_330720.1SM_V7_520484109+361PF097284515914420Smp_332130.1SM_V7_520519729+778PF0972832944314421Smp_332130.1SM_V7_520519729+778PF0972849561714422Smp_332130.1SM_V7_520519729+778PF097285124
Now to draw the features
1
2
3
4
5
# limit the range of genomic barps<-ggplot(data=cluster1.range,aes(chr,10000))+geom_bar(stat="identity",fill="grey",width=0.05,position=position_stack(reverse=TRUE))+coord_flip()+labs(x="",y="Position (Coord/50)")+ylim(min(cluster1.range$start)/50,max(cluster1.range$start)/50+800)# add Pfam segmentsps1<-ps+geom_segment(data=cluster1.range,aes(x=chr,xend=chr,y=start/50+pf_start,yend=start/50+pf_end,color=id,alpha=0.5),size=4)+theme_classic()+theme(axis.ticks.y=element_blank(),axis.line.y=element_blank())+theme(legend.position="top")
1
2
3
4
5
6
7
8
9
# add peptide segmentsps2<-ps1+geom_segment(data=cluster1.range,aes(x=chr,xend=chr,y=start/50,yend=start/50+pep_len,colour=id,alpha=0.5),size=1.5)library(grid)# Create a textgrob<-grobTree(textGrob(paste(cluster1$Pfam,cluster1$pf_name,cluster1$description,sep=" "),x=0,y=0.95,hjust=0,gp=gpar(col="red",fontsize=10,fontface="italic")))# add text to the plotps2<-ps2+annotation_custom(grob)