1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
|
PS: I rank the exon positions before calculating the intron size to avoid the +/- strand issues
“` perl get_intron_size.pl
!/usr/bin/perl
use strict; use warnings;
usage: perl get_intron_size.pl gff3_file >output
my $input=$ARGV[0];
my ($eachline,@exons);
my $first=0;
open (IN, “<$input”) or die (“no such file!”);
while(defined($eachline=
sub print_intron{ my (@exons)=@_; if(scalar(@exons)>2){ my @ordered_exons=sort {$a<=>$b} @exons; for (my $i=1;$i<=scalar(@ordered_exons)-3;$i=$i+2){ my $each_intron_size=$ordered_exons[$i+1]-$ordered_exons[$i]-1; print “$each_intron_size\t”; } }else{print “0”;} print “\n”; }
“`
An example of gff3 file from soybean genome annotation:
>>>>>>> d80cd8fa3e1fb5461144707ba04f7385ec6726a7:category/perl/atom.xml##gff-version 3 Gm01 phytozome8_0 gene 51481 61502 . - . ID=Glyma01g00270;Name=Glyma01g00270 Gm01 phytozome8_0 mRNA 51481 61502 . - . ID=PAC:16242891;Name=Glyma01g00270.1;pacid=16242891;longest=1;Parent=Glyma01g00270 Gm01 phytozome8_0 CDS 61437 61502 . - 0 ID=PAC:16242891.CDS.1;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 61167 61305 . - 0 ID=PAC:16242891.CDS.2;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 60722 60780 . - 2 ID=PAC:16242891.CDS.3;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 60339 60533 . - 0 ID=PAC:16242891.CDS.4;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 59699 59814 . - 0 ID=PAC:16242891.CDS.5;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 59420 59547 . - 1 ID=PAC:16242891.CDS.6;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 59176 59284 . - 2 ID=PAC:16242891.CDS.7;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 56878 56879 . - 1 ID=PAC:16242891.CDS.8;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 55013 55069 . - 2 ID=PAC:16242891.CDS.9;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 54390 54500 . - 2 ID=PAC:16242891.CDS.10;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 54218 54320 . - 2 ID=PAC:16242891.CDS.11;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 53902 53966 . - 1 ID=PAC:16242891.CDS.12;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 52608 52639 . - 2 ID=PAC:16242891.CDS.13;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 52517 52571 . - 0 ID=PAC:16242891.CDS.14;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 52362 52465 . - 2 ID=PAC:16242891.CDS.15;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 51828 51925 . - 0 ID=PAC:16242891.CDS.16;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 CDS 51481 51625 . - 1 ID=PAC:16242891.CDS.17;Parent=PAC:16242891;pacid=16242891 Gm01 phytozome8_0 gene 90693 95580 . - . ID=Glyma01g00300;Name=Glyma01g00300 Gm01 phytozome8_0 mRNA 90693 94401 . - . ID=PAC:16242894;Name=Glyma01g00300.1;pacid=16242894;longest=1;Parent=Glyma01g00300 Gm01 phytozome8_0 CDS 92970 94401 . - 0 ID=PAC:16242894.CDS.1;Parent=PAC:16242894;pacid=16242894 Gm01 phytozome8_0 CDS 92083 92084 . - 2 ID=PAC:16242894.CDS.2;Parent=PAC:16242894;pacid=16242894 Gm01 phytozome8_0 CDS 90693 90860 . - 0 ID=PAC:16242894.CDS.3;Parent=PAC:16242894;pacid=16242894 Gm01 phytozome8_0 mRNA 94388 95580 . - . ID=PAC:16242895;Name=Glyma01g00300.2;pacid=16242895;longest=0;Parent=Glyma01g00300 Gm01 phytozome8_0 CDS 95467 95469 . - 0 ID=PAC:16242895.CDS.1;Parent=PAC:16242895;pacid=16242895 Gm01 phytozome8_0 five_prime_UTR 95470 95580 . - . ID=PAC:16242895.five_prime_UTR.1;Parent=PAC:16242895;pacid=16242895 Gm01 phytozome8_0 CDS 95277 95356 . - 0 ID=PAC:16242895.CDS.2;Parent=PAC:16242895;pacid=16242895 Gm01 phytozome8_0 CDS 94388 94475 . - 1 ID=PAC:16242895.CDS.3;Parent=PAC:16242895;pacid=16242895 Gm01 phytozome8_0 gene 116300 127990 . + . ID=Glyma01g00320;Name=Glyma01g00320 Gm01 phytozome8_0 mRNA 116300 127990 . + . ID=PAC:16242897;Name=Glyma01g00320.1;pacid=16242897;longest=1;Parent=Glyma01g00320 Gm01 phytozome8_0 five_prime_UTR 116300 116467 . + . ID=PAC:16242897.five_prime_UTR.1;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 CDS 116468 117077 . + 0 ID=PAC:16242897.CDS.1;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 CDS 117156 118627 . + 2 ID=PAC:16242897.CDS.2;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 CDS 125732 125982 . + 0 ID=PAC:16242897.CDS.3;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 CDS 127537 127567 . + 1 ID=PAC:16242897.CDS.4;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 three_prime_UTR 127568 127990 . + . ID=PAC:16242897.three_prime_UTR.1;Parent=PAC:16242897;pacid=16242897 Gm01 phytozome8_0 mRNA 116300 127990 . + . ID=PAC:16242898;Name=Glyma01g00320.2;pacid=16242898;longest=0;Parent=Glyma01g00320 Gm01 phytozome8_0 five_prime_UTR 116300 116467 . + . ID=PAC:16242898.five_prime_UTR.1;Parent=PAC:16242898;pacid=16242898 Gm01 phytozome8_0 CDS 116468 117077 . + 0 ID=PAC:16242898.CDS.1;Parent=PAC:16242898;pacid=16242898 Gm01 phytozome8_0 CDS 117156 118627 . + 2 ID=PAC:16242898.CDS.2;Parent=PAC:16242898;pacid=16242898 Gm01 phytozome8_0 CDS 127537 127707 . + 0 ID=PAC:16242898.CDS.3;Parent=PAC:16242898;pacid=16242898 Gm01 phytozome8_0 three_prime_UTR 127708 127990 . + . ID=PAC:16242898.three_prime_UTR.1;Parent=PAC:16242898;pacid=16242898 Gm01 phytozome8_0 mRNA 127557 127707 . + . ID=PAC:16242899;Name=Glyma01g00320.4;pacid=16242899;longest=0;Parent=Glyma01g00320 Gm01 phytozome8_0 CDS 127557 127707 . + 1 ID=PAC:16242899.CDS.1;Parent=PAC:16242899;pacid=16242899 <<<<<<< HEAD:source/_posts/2012-11-28-intron-size-gff3-perl.html Gm01 phytozome8_0 gene 170877 193446 . + . ID=Glyma01g00380;Name=Glyma01g00380======= Gm01 phytozome8_0 gene 170877 193446 . + . ID=Glyma01g00380;Name=Glyma01g00380
</p>
]]>旧文回收
#当时写的代码有点傻傻的
!/usr/bin/perl
#————————————————————————————— #Author: me #Date: 2009-05-13 19:00 #Version: 1.0 #Project: random genetic drift #模拟随机漂变对某一群体等位基因频率的影响,只是简单写了一下核心的程序。 #————————————————————————————— use strict; use warnings; srand(time | $$); #为随机数发生器设定种子 my $a1percent; #声明变量: my $i;my $j; my $a1sum; my $randomnum; my $generation; my $avegeneration;my $sumgeneration; my $fix1; my $fix2; for($i=0; $i<10000; ++$i){ #10000次重复 $a1percent=0.7; #设定等位基因a1初始频率为0.7 for($generation=0; 1;++$generation){ #模拟世代更替,记录a1或a2固定时经历的世代数($generation) if ($a1percent==0 or $a1percent==1){ last; #当a1固定或丢失的时候跳出世代更替的循环 } $a1sum=0; for ($j=0; $j<20; ++$j){ #随机挑选20个配子作为下一代个体的组成 $randomnum=rand(1); if ($randomnum<=$a1percent){ ++$a1sum; #记录挑选到a1基因的总数 } } $a1percent=$a1sum/20; #记录挑选到a1基因的频率 } $sumgeneration+=$generation; #将每次a1或a2固定时经历世代数求和 if ($a1percent==1){ #a1固定 ++$fix1; #记录a1固定的次数 }else{ #a1丢失,a2固定 ++$fix2; #记录a2固定的次数 } } $avegeneration=$sumgeneration/10000; $fix1=$fix1/10000; $fix2=$fix2/10000; print “a1或a2固定的平均世代数为: $avegeneration\n”; print “a1固定的概率为:$fix1\n”; print “a2固定的概率为:$fix2\n”; exit;
“`
]]>旧文回收
#将数据导入excel,再用excel处理基本是多此一举 #且excel可以直接导入按一定规律排列的数据(如用tab分隔的数据) #留给有需要的人用吧
use Spreadsheet::WriteExcel; #需要安装此模块 use strict;
新建一个工作簿,gethomo.xls
my $workbook = Spreadsheet::WriteExcel->new(“gethomo.xls”);
新建一个工作表
my $worksheet = $workbook->add_worksheet();
需要导入的数据
my $datafile=”gethomo.txt”;
打开数据文件
unless( open(GET_FILE_DATA, $datafile) ) {
print STDERR “Cannot open file “$filename”\n\n”;
exit;
}
my @input=
写入excel
for (my $i=0;$i<scalar @input;$i++){
#按照输入文件内容的排列格式修改此行
my @input_row=split (/\t/, $input[$i]);
for (my $j=0;$j<scalar @input_row;$j++){
my $input_cell=$input_row[$j];
#在$i行$j列的单元格中写入数据.(注: 行列初始值为0)
unless ($input_cell=~/=/){$worksheet->write($i, $j, $input_cell);}
} } #打完收工
若要从excel中提取数据,
#则需要使用Spreadsheet::ParseExcel模块 #常用的几条语句如下: my $parser=Spreadsheet::ParseExcel->new(); my $workbook = $parser->Parse(‘RNA1.xls’); for my $worksheet ( $workbook->worksheets() ) { my ( $row_min, $row_max ) = $worksheet->row_range(); my ( $col_min, $col_max ) = $worksheet->col_range(); for my $col ( $col_min .. $col_max ) { #…… } } #关于更多用perl操作excel的方式和代码可以参考CPAN
“`
]]>需要安装相应模块
use GD::Graph::histogram;
创建一个数组的reference
$data = [1,5,7,8,9,10,11,3,3,5,5,5,7,2,2];
创建图片对象
my $graph = new GD::Graph::histogram(400,600);
设置格式
$graph->set( x_label => ‘X Label’, y_label => ‘Count’, title => ‘A Simple Count Histogram Chart’, x_labels_vertical => 1, bar_spacing => 0, shadow_depth => 1, shadowclr => ‘dred’, transparent => 0, ) or warn $graph->error;
输出频率分布图到图片对象上
my $gd = $graph->plot($data) or die $graph->error;
打印到文件
open(IMG, ‘>histogram.png’) or die $!; binmode IMG; print IMG $gd->png; close IMG;
“`
Perl做出来的图真是难看,跟R差太远了。
我是用了大量perl script提取出了一些数据,只是想先看看数据的分布情况,不做拟合和后续统计分析,懒得写R script了。
]]>直接拿来主义,用已有的模块,List::Compare。
既然发在CPAN上,估计是优化过的,而且反正perl的速度本来就不快,又只是自己用,简单易用省心,最好。
“` perl listcomp.pl
use List::Compare; #需要安装 @Llist = qw(abel abel baker camera delta edward fargo golfer); @Rlist = qw(baker camera delta delta edward fargo golfer hilton); $lc = List::Compare->new(\@Llist, \@Rlist); #使用数组的reference
@intersection = $lc->get_intersection; #子集 @union = $lc->get_union; #并集
print “@intersection\n@union\n”;
“`
List::Compare还有很多其他用法。
比如,get_unique(),返回第一个数组特有的那些元素;
而get_complement(),返回第二个数组特有的那些元素;
以及get_symmetric_difference(),返回属于某一数组但不同时属于两个数组的那些元素。
]]>