Bioops

Bioinformatics=(ACGAAG->AK)+(#!/bin/sh)+(P(A|B)=P(B|A)*P(A)/P(B))

perl脚本提取BLAST结果中的信息【以tblastn为例】

| Comments

用blast中 -m 8参数可以得到类似的信息,但满足不了所有的需求。用bioperl可以得到任意想要的信息,但我用的cluster上没有bioperl,只好自己写了一个。只是用了一些正则表达式,不适用于其他blast(比如blastn、blastp),但只需要稍微改一下就可以了,暂时将就着用吧。欢迎留言讨论。

parse_blast.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/perl
# Parse alignments from tBLASTn output file
use strict;
use warnings;
print "query\tquery_length\thit\tscore\texpect\tidentity\tquery_strand\tsubject_strand\tcoverage_query\tcoverage_subject\tquery_start\tquery_end\tsubject_start\tsubject_end\n";
my $input=$ARGV[0];
my ($eachline,$query_name,$query_length,$hit_name,$hsp_num);
my ($query_start_temp,$query_end_temp,$subject_start_temp,$subject_end_temp);
my ($query_start,$query_end,$subject_start,$subject_end,$query_seq,$subject_seq,$query_seq_temp,$subject_seq_temp);
my ($score,$expect,$coverage_query,$coverage_subject,$identity_query,$identity_subject,$identity,$query_strand,$subject_strand);
$hsp_num=0;
open(IN,"<$input") or die ("no file $input !");
while (defined($eachline=<IN>)){
  $eachline=~s/[\r\n]//g;
  if($eachline=~/^Query= /){
    ($query_name)=($eachline=~/^Query= (\S+)/);
    $eachline=<IN>;
    ($query_length)=($eachline=~/\((\S+) letters\)/);
    $query_length=~s/,//g;
  }
  if($eachline=~/^>/){
    ($hit_name)=($eachline=~/^>(\S+)/);
  }
  if($eachline=~/^ Score/){
    ($score,$expect)=($eachline =~ /Score = (.*) bits.*Expect.* = (\S+),/);
    $hsp_num++;
    if($hsp_num>1){
      $coverage_query=$query_end-$query_start+1;
      $coverage_subject=$subject_end-$subject_start+1;
      print "$coverage_query\t$coverage_subject\t$query_start\t$query_end\t$subject_start\t$subject_end\t$query_seq\t$subject_seq\n";
    }
    $query_start=1000000000;$query_end=0;
    $subject_start=1000000000;$subject_end=0;
    $query_seq='';$subject_seq='';
    print "$query_name\t$query_length\t$hit_name\t";
    print "$score\t$expect\t";
  }
  if($eachline=~/^ Identities/){
    ($identity_query,$identity_subject,$identity)=($eachline =~ /Identities = (\d+)\/(\d+) \((\d+)%\)/);
    print "$identity\t";
  }
  if($eachline=~/^ Frame = /){
    if($eachline =~ / Frame = -/){$subject_strand='Minus';}elsif($eachline =~ / Frame = +/){$subject_strand='Plus';}
    $query_strand='Plus';
    print "$query_strand\t$subject_strand\t";
  }
  if($eachline=~/^Query: (\d+)/){
    if($query_strand eq 'Plus'){($query_start_temp,$query_seq_temp,$query_end_temp)=($eachline =~ /(\d+)(\D+)(\d+)/);}
    if($query_strand eq 'Minus'){($query_end_temp,$query_seq_temp,$query_start_temp)=($eachline =~ /(\d+)(\D+)(\d+)/);}
    $query_seq_temp=~s/[ -]//g;$query_seq.=$query_seq_temp;
    if($query_start>$query_start_temp){$query_start=$query_start_temp;}
    if($query_end<$query_end_temp){$query_end=$query_end_temp;}
  }
  if($eachline=~/^Sbjct: (\d+)/){
    if($subject_strand eq 'Plus'){($subject_start_temp,$subject_seq_temp,$subject_end_temp)=($eachline =~ /(\d+)(\D+)(\d+)/);}
    if($subject_strand eq 'Minus'){($subject_end_temp,$subject_seq_temp,$subject_start_temp)=($eachline =~ /(\d+)(\D+)(\d+)/);}
    $subject_seq_temp=~s/[ -]//g;$subject_seq.=$subject_seq_temp;
    if($subject_start>$subject_start_temp){$subject_start=$subject_start_temp;}
    if($subject_end<$subject_end_temp){$subject_end=$subject_end_temp;}
  }
  if($eachline=~/^Matrix: /){
    $coverage_query=$query_end-$query_start+1;
    $coverage_subject=$subject_end-$subject_start+1;
    print "$coverage_query\t$coverage_subject\t$query_start\t$query_end\t$subject_start\t$subject_end\t$query_seq\t$subject_seq\n";
  }
}
close IN;

Comments