library(dplyr)

# Load the tab-separated gene coordinate table. The columns used below are
# chr, id, start, end, strand and pfam.
pfamCoord <- read.delim("cluster_geneCoord.txt", sep = "\t", header = TRUE)

# Within each pfam group, compute the distance from the end of the previous
# row to the start of the current row. lag(end) is NA for the first row of a
# group, so Dist is NA there.
# The result is only printed here; it is not assigned back to pfamCoord.
# NOTE(review): rows are used in file order and grouping is by pfam only, so
# the "previous row" is not guaranteed to be on the same chr - confirm that
# the input is sorted as intended.
pfamCoord %>%
  group_by(pfam) %>%
  mutate(Dist = start - lag(end))
# # A tibble: 388 x 7
# # Groups:   pfam [57]
#    chr     id            start      end strand pfam            Dist
#    <fct>   <fct>         <int>    <int> <fct>  <fct>          <int>
#  1 SM_V7_1 Smp_178420 48981354 48982520 -      PF00001-7tm_1     NA
#  2 SM_V7_1 Smp_319310 48986134 48988127 +      PF00001-7tm_1   3614
#  3 SM_V7_1 Smp_137300 48989538 48995030 -      PF00001-7tm_1   1411
#  4 SM_V7_1 Smp_137310 49003945 49005340 -      PF00001-7tm_1   8915
#  5 SM_V7_1 Smp_027940 49008050 49010380 -      PF00001-7tm_1   2710
#  6 SM_V7_1 Smp_137320 49025011 49029650 -      PF00001-7tm_1  14631
#  7 SM_V7_1 Smp_048050 65566436 65626137 +      PF00011-HSP20     NA
#  8 SM_V7_1 Smp_302270 65574596 65576347 +      PF00011-HSP20 -51541
#  9 SM_V7_1 Smp_049230 65585301 65587122 -      PF00011-HSP20   8954
# 10 SM_V7_1 Smp_302280 65594848 65596602 -      PF00011-HSP20   7726
# # … with 378 more rows
Alternatively, the same calculation can be done with data.table, using its shift() function:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
library(data.table)

# Convert pfamCoord to a data.table by reference (no copy is made).
setDT(pfamCoord)

# Add Dist in place: start of the current row minus end of the previous row
# within the same pfam group. shift(end) is NA for the first row of a group.
pfamCoord[, Dist := start - shift(end), by = pfam]

head(pfamCoord, 10)
#         chr         id    start      end strand          pfam   Dist
#  1: SM_V7_1 Smp_178420 48981354 48982520      - PF00001-7tm_1     NA
#  2: SM_V7_1 Smp_319310 48986134 48988127      + PF00001-7tm_1   3614
#  3: SM_V7_1 Smp_137300 48989538 48995030      - PF00001-7tm_1   1411
#  4: SM_V7_1 Smp_137310 49003945 49005340      - PF00001-7tm_1   8915
#  5: SM_V7_1 Smp_027940 49008050 49010380      - PF00001-7tm_1   2710
#  6: SM_V7_1 Smp_137320 49025011 49029650      - PF00001-7tm_1  14631
#  7: SM_V7_1 Smp_048050 65566436 65626137      + PF00011-HSP20     NA
#  8: SM_V7_1 Smp_302270 65574596 65576347      + PF00011-HSP20 -51541
#  9: SM_V7_1 Smp_049230 65585301 65587122      - PF00011-HSP20   8954
# 10: SM_V7_1 Smp_302280 65594848 65596602      - PF00011-HSP20   7726

# setDF(pfamCoord) # convert back to old data.frame syntax
Finally, we can summarise the distances (Dist) within each group (pfam):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Smallest and largest Dist per pfam. na.rm = TRUE drops the NA that the
# first row of each group carries. Expects pfamCoord to already have a Dist
# column.
pfamCoord %>%
  group_by(pfam) %>%
  summarise(
    minDist = min(Dist, na.rm = TRUE),
    maxDist = max(Dist, na.rm = TRUE)
  )
# `summarise()` ungrouping output (override with `.groups` argument)
# # A tibble: 57 x 3
#
# NOTE(review): the listing this output was taken from had lost its column
# spacing. Where the boundary between minDist and maxDist could not be
# recovered, the digits are left joined exactly as found and marked "(?)".
# Re-run the code to obtain the exact values.
#
#    pfam                  minDist  maxDist
#    <fct>                   <int>    <int>
#  1 PF00001-7tm_1       -14237049 37313397
#  2 PF00011-HSP20          -51541    54921
#  3 PF00012-HSP70        48284842 (?)
#  4 PF00014-Kunitz_BPTI  370346064 (?)
#  5 PF00026-Asp          1139228849 (?)
#  6 PF00041-fn3          12121327 (?)
#  7 PF00079-Serpin       2047145424 (?)
#  8 PF00089-Trypsin      -3348130138508 (?)
#  9 PF00096-zf-C2H2      577015632 (?)
# 10 PF00106-adh_short    -4349715653670503 (?)
# # … with 47 more rows
Alternatively, summarise by pfam on each chromosome (chr), because some pfam clusters exist on multiple chromosomes:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Smallest and largest Dist for every (pfam, chr) combination. na.rm = TRUE
# drops NA values of Dist. Expects pfamCoord to already have a Dist column;
# this step only regroups the existing values and does not recompute them.
# The result is still grouped by pfam (see the "Groups" line below).
pfamCoord %>%
  group_by(pfam, chr) %>%
  summarise(
    minDist = min(Dist, na.rm = TRUE),
    maxDist = max(Dist, na.rm = TRUE)
  )
# # A tibble: 67 x 4
# # Groups:   pfam [57]
#
# NOTE(review): the listing this output was taken from had lost its column
# spacing. Where the boundary between minDist and maxDist could not be
# recovered, the digits are left joined exactly as found and marked "(?)".
# Re-run the code to obtain the exact values.
#
#    pfam                chr        minDist  maxDist
#    <fct>               <fct>        <int>    <int>
#  1 PF00001-7tm_1       SM_V7_1       1411    14631
#  2 PF00001-7tm_1       SM_V7_3   -95394721870737 (?)
#  3 PF00001-7tm_1       SM_V7_4  -14237049   204544
#  4 PF00001-7tm_1       SM_V7_ZW    -17590 37313397
#  5 PF00011-HSP20       SM_V7_1     -51541    54921
#  6 PF00012-HSP70       SM_V7_1   48284842 (?)
#  7 PF00014-Kunitz_BPTI SM_V7_2   370346064 (?)
#  8 PF00026-Asp         SM_V7_3   1139228849 (?)
#  9 PF00041-fn3         SM_V7_ZW  12121327 (?)
# 10 PF00079-Serpin      SM_V7_6   2047145424 (?)
# # … with 57 more rows
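Some distances are negative because the genes overlap: for example, Smp_302270 starts 51,541 bp before the end of the preceding gene, Smp_048050. Note also that Dist was computed within each pfam without regard to chr, so a negative value can also arise where two consecutive rows of the same pfam lie on different chromosomes.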