Reverse complement sequence
# Reverse the sequence, then complement it. Lowercase (soft-masked) bases are
# complemented too; any other character (e.g. N) passes through unchanged.
echo <SEQUENCE> | perl -nle '$_ = reverse $_; tr/ACGTacgt/TGCAtgca/; print'
Combine consecutive rows that share the same first-column value
# Rows are grouped on consecutive equal values of column 1. The key is removed
# from continuation rows with index()/substr(), i.e. as a literal string rather
# than a regex, and NR (not the key's truthiness) marks the first row so that a
# group whose key is "0" is still printed.
awk 'NR == 1 || $1 != p {if (NR > 1) print s; p = $1; s = $0; next} {s = s substr($0, index($0, $1) + length($1))} END {if (NR) print s}'
Split multiple columns into two columns with common first column
awk -v OFS='\t' '{for (i=2;i<=NF;i++) print $1,$i}'
Get a range of text
sed -n '/START/,/END/p' FILE
awk '/START/,/END/' FILE
Get a specific sequence from a fasta file
sed '/>Smp_000020.1/,/>/!d' <fasta file>| awk '/>/{i++}i==1'
sed -n '/Smp_000020/,/>/p' <fasta file> | sed '$d'
Convert pdf to png
mogrify -verbose -density 300 -resize 800 -format png *.pdf
Remove blank lines
grep -v -e '^$'
sed '/^$/d'
Call Artemis
bsub -q yesterday -n2 -o o -e e -R "span[hosts=1]" -M 2000 -R 'select[mem>2000] rusage[mem=2000]' -I art -Dshow_cov_plot <FILE>
Sum and Mean of column 1
awk '{sum+=$1} END {print sum}'
awk '{x+=$1}END{print x/NR}'
Filter lines on the 5th field (regex non-match, string inequality, fixed-length prefix equality)
# The six digits are spelled out: "[0-9\{6\}]" is a single bracket expression
# (one digit, brace or "6"), and older awk builds do not honor {6} intervals.
awk '$5 !~ /^Smp_[0-9][0-9][0-9][0-9][0-9][0-9]/'
awk '$5 != "STRING"'
awk 'substr($5,1,10) == "STRING"'
Trim leading and trailing spaces and tabs
# [[:blank:]] is the POSIX class for space and tab. "\t" inside a bracket
# expression is a GNU extension; other seds read it as a backslash or a "t".
sed 's/^[[:blank:]]*//;s/[[:blank:]]*$//'
Output sequence name and length
# One "name<TAB>length" row per record. The header text is emitted with print,
# never used as a printf format, so a "%" in a description is safe. Rows are
# written only once a header has been seen, so there is no leading blank line,
# and the name is no longer cut off at 100 characters.
awk '/^>/ {if (seen) print name "\t" len; seen = 1; name = substr($0, 2); len = 0; next} {len += length($0)} END {if (seen) print name "\t" len}' seq.fa
Convert a FASTQ file to FASTA
sed -n '1~4s/^@/>/p;2~4p' file.fq > file.fa
Sort gff file for tabix
sort -k1,1 -k4,4 -V
Number each line
cat -n FILE
less -N FILE
Print file name and content
grep . *.txt
Extract all gene IDs from a GFF3 file
cat <GFF3 FILE> | grep $'\tgene\t' | perl -ne '/ID=([^;]+)/ and printf("%s\n", $1)'
Print line 20 content
sed -n 20p FILE
awk '(NR==20){print $0}' FILE
Omit 1st line and print lines with 4 columns
awk 'NR > 1 && NF == 4' FILE
NCBI Blast+
blastp -query <> -db <> -num_alignments 5 -num_descriptions 10 -out <>
Search for any of several strings in all txt files
grep -r 'STRING1\|STRING2\|STRING3' ./*.txt
Print lines where any of columns 2-8 is less than -0.2
awk '{for (i=2; i<=8; i++) if ($i<-0.2){print $0; next}}'