Visualising gene features using ggplot2

I have some hits from the motif search tool XSTREME and I would like to visualisation the locations of discovered motifs, which also have significant match to known TFBSs in Jaspar.

Previously I used the R package Sushi to visualise gene features, but it requires a bed file and specific feature names such as CDS, exon, UTR etc. But ggplot2 is on the other side more flexible.

1
2
3
4
5
6
7


library(dplyr)
sample_n(top10tfbs0, 5)

  motif_alt_id       gene start stop strand  p.value matched_sequence
1      MEME-22 Smp_151030   411  425      + 6.05e-05  TTCCATTCCTTTATT
2      MEME-14 Smp_083180   571  585      + 2.97e-07  GAAGTCCTTCAATTC
3       MEME-8 Smp_083680   957  971      + 3.05e-07  AACGCATGCTTCTTC

I also have the matched TFBS names for some of the motif_alt_id, so I can merge the two data frames.

1
2
3
4
5
6
7
8


top10tfbs<-merge(top10tfbs0, mejsp, all.x=TRUE, by.x="motif_alt_id", by.y="V1")
# also reverse the motif position because they are upstream TSS
top10tfbs$loc<--1*(top10tfbs$start)

  motif_alt_id       gene start stop strand  p.value matched_sequence       V2    V3  loc
1      MEME-22 Smp_161160   288  302      + 5.48e-05  TTTGGTCCATAATTT MA0543.1 eor-1 -288
2       MEME-8 Smp_179650   240  254      - 3.90e-05  AGCAACAGCCTCTGC MA0260.1 che-1 -240
3       MEME-8 Smp_337590   906  920      + 3.24e-06  AATGAATGCTTCATT MA0260.1 che-1 -906

Now I selected some of the genes to visualise their motif locations.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


top10markers<-top10tfbs %>% filter(grepl('Smp_0868|Smp_0465|Smp_2450|Smp_0824|Smp_0743|Smp_1762|Smp_1796|Smp_2106|Smp_0583|Smp_0973', gene))
# and combine column names for the legend
top10markers$jaspar<-paste0(top10markers$motif_alt_id, " (", top10markers$V3, ")")

library(ggplot2)
# first draw some bars as the chromosome regions, we used 1000bp 
p.st1<-ggplot(data = top10markers, aes(gene, -1000)) + geom_bar(stat="identity", fill="grey", width = 0.05, height=1000, position = position_stack(reverse = TRUE)) +coord_flip()+labs(x="Genes", y="Position")+ylim(-1000,0)

# now add segments as the motifs
p.st1<-p.st1+geom_segment(data=top10markers, aes(x=gene, xend=gene, y=start*(-1), yend=stop*(-1), color=jaspar), size=5)

# change legend title and remove y-axis 
p.st1<-p.st1+theme_classic()+labs(color="Motifs")+theme(axis.ticks.y = element_blank(),, axis.line.y = element_blank())
p.st1

The above example is to visualise all different sub-features on the same gene. Another situation is to visualise different genes with the same feature (such as protein domain).

For instance I have a list of clusters within which all genes have the same Pfam, and another file with information about the positions of matched Pfam, as well as length and coordinates of the peptides.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


# information of the clusters
sample_n(allClusters, 2)

       chr    Pfam         pf_name                    description gene_counts
1  SM_V7_1 PF01221    Dynein_light     Dynein_light_chain_type_1_           7
2 SM_V7_ZW PF17039 Glyco_tran_10_N Fucosyltransferase,_N-terminal           3

# information on peptides and pfam domains
sample_n(pfamInfo, 2)

            id     chr    start strand pep_len    Pfam pf_start pf_end
1 Smp_162630.1 SM_V7_3 20366872      -    1058 PF16191      293    361
2 Smp_159600.2 SM_V7_1  9907913      -    1762 PF01391      720    773

So the idea is to visualise each cluster, where all peptides with their matched pfam features will be shown.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


# select a cluster
cluster1<-allClusters[49,]

# get all transcripts and pfam domains in that clusters
cluster1.range<-pfamInfo[which(as.character(cluster1$chr)==as.character(pfamInfo$chr) & pfamInfo$start >= cluster1$first_start & pfamInfo$start <= cluster1$last_start & as.character(cluster1$Pfam)==as.character(pfamInfo$Pfam)),]

cluster1.range

                id     chr    start strand pep_len    Pfam pf_start pf_end
14417 Smp_344710.1 SM_V7_5 20379409      +     498 PF09728       34    334
14418 Smp_332120.1 SM_V7_5 20465519      +     299 PF09728        5     83
14419 Smp_330720.1 SM_V7_5 20484109      +     361 PF09728       45    159
14420 Smp_332130.1 SM_V7_5 20519729      +     778 PF09728      329    443
14421 Smp_332130.1 SM_V7_5 20519729      +     778 PF09728      495    617
14422 Smp_332130.1 SM_V7_5 20519729      +     778 PF09728        5    124

Now to draw the features

1
2
3
4
5


# limit the range of genomic bar
ps<-ggplot(data=cluster1.range, aes(chr, 10000)) +  geom_bar(stat="identity", fill="grey", width = 0.05, position = position_stack(reverse = TRUE)) +coord_flip()+labs(x="", y="Position (Coord/50)")+ylim(min(cluster1.range$start)/50, max(cluster1.range$start)/50+800)

# add Pfam segments
ps1<-ps+geom_segment(data=cluster1.range, aes(x=chr, xend=chr, y=start/50+pf_start, yend=start/50+pf_end, color=id, alpha = 0.5), size=4)+theme_classic()+theme(axis.ticks.y = element_blank(),axis.line.y = element_blank())+theme(legend.position = "top")

1
2
3
4
5
6
7
8
9


# add peptide segments
ps2<-ps1+geom_segment(data=cluster1.range, aes(x=chr, xend=chr, y=start/50, yend=start/50+pep_len, colour=id, alpha=0.5), size=1.5)

library(grid)
# Create a text
grob <- grobTree(textGrob(paste(cluster1$Pfam, cluster1$pf_name, cluster1$description, sep="  "), x=0,  y=0.95, hjust=0, gp=gpar(col="red", fontsize=10, fontface="italic")))

# add text to the plot
ps2<-ps2 +annotation_custom(grob)