eval 'exec /usr/dist/exe/perl -S $0 ${1+"$@"}' if 0; #genbank2mif: Ken Shirriff #Usage: genbank2mif < genbankfile > file.mif #This generates a MIF file for FrameMaker that shows the structure of #the sequence. $proteinonly = 1; # Only display CDS records $textabove = 1; # Print text above the box $xscale = 1500.; # How many bases per inch $yp = 0; # Starting y position $yheight = 0.1; $yspc = 0.15; # Spacing between lines $color[1] = "Red"; $color[2] = "Green"; $color[0] = "Blue"; &ginit(); while (<>) { chop; if (/^SOURCE.*/) { &parsesource(); } if (/^FEATURES.*/) { &parsefeatures(); } if (/^ORIGIN.*/) { &parseorigin(); last; } } # seq = 0 ... length($seq)-1 $len = length($seq); print STDERR "length =$len\n"; &findorf(); # &checkcg(); # experimental function &gend(); # ------------ #Draw the ORF's by looking for stop codons. # findorf( sub findorf { &ggroup(); $yp += 0.5; $yheight = 0.2; $yspc = 0.3; $yspc = 0.05; >ext(0./$xscale, $yp-$yheight/2, "0"); >ext($len/$xscale,$yp-$yheight/2,$len); &gbox(1./$xscale,$yp,$len/$xscale,$yp+$yheight,$color[1%3]); &findseq(1,3,"taa",$yp,$yp+$yheight); &findseq(1,3,"tag",$yp,$yp+$yheight); &findseq(1,3,"tga",$yp,$yp+$yheight); &gendgroup(); $yp += $yheight + $yspc; &ggroup(); &gbox(1./$xscale,$yp,$len/$xscale,$yp+$yheight,$color[2%3]); &findseq(2,3,"taa",$yp,$yp+$yheight); &findseq(2,3,"tag",$yp,$yp+$yheight); &findseq(2,3,"tga",$yp,$yp+$yheight); &gendgroup(); $yp += $yheight + $yspc; &ggroup(); &gbox(1./$xscale,$yp,$len/$xscale,$yp+$yheight,$color[3%3]); &findseq(3,3,"taa",$yp,$yp+$yheight); &findseq(3,3,"tag",$yp,$yp+$yheight); &findseq(3,3,"tga",$yp,$yp+$yheight); &gendgroup(); } # Print the SOURCE record at the top of the page. sub parsesource { ($junk,$txt) = split(' ',$_,2); >ext(0,$yp,$txt); $yp += 3*$yspc; } # Parse the FEATURES entries, printing a box for note, gene, product sub parsefeatures { local($y, $tag, $pos, $note); while (<>) { if (/^\w/) { if ($pos) { &printfeature($text,$pos); } if (/^ORIGIN.*/) { &parseorigin(); } last; } if ( /^ {9}/ ) { # ttype is a priority field; the highest # priority gets printed. if ( /note/ && $ttype<3) { ($text) = $_ =~ /^ *.note="([^"]*)"?\n?/; $ttype = 3; } elsif ( /gene/ && $ttype < 6) { ($text) = $_ =~ /^ *.gene="([^"]*)"?\n?/; $ttype = 6; } elsif ( /product/ && $ttype < 8) { ($text) = $_ =~ /^ *.product="([^"]*)"?\n?/; $ttype = 8; } else { } } else { if ($pos) { &printfeature($text,$pos); } $pos = 0; ($text, $pos1) = split(' '); if ($proteinonly) { if ($text =~ /CDS/) { } else { next; } } #Skip all these fields next if $text =~ /exon/; next if $text =~ /prim_transcript/; next if $text =~ /intron/; next if $text =~ /virion/; next if $text =~ /provirus/; next if $text =~ /mRNA/; $pos = $pos1; $ttype = 1; } } } # Print a box for the feature. Parse the 123..456 field. sub printfeature { local($txt, $pos) = @_; local($offset, $stride, $tst, $y0,$y1) = @_; if (($a,$b) = $pos =~ /^?(\d+)$/) { &ggroup(); &gbox($a/$xscale, $yp, $b/$xscale, $yp+$yheight, $color[$a%3]); if ($textabove) { >ext($a/$xscale+0.03, $yp+$2.1*yheight, $txt); } else { >ext($b/$xscale+0.03, $yp+$yheight, $txt); } &gendgroup($gc); $yp += $yheight+$yspc; } elsif (($a) = $pos =~ /^) { chop; if ($_ eq "//" ) { last; } s/\s\d+\w//; s/\s//g; $c = substr($_,0,1); substr($seq,-1,0) = $_; } $seq = substr($seq,1); chop $seq; } # ------------ # Look for occurrences of $tst in the sequence, starting at $offset and # moving through at $y. Draw lines from y0 to y1 whereever found. # This is used, for instance, to find stop codons. sub findseq { local($offset, $stride, $tst, $y0,$y1) = @_; local($slen); $slen = length($tst); for ($i=$offset-1; $i<$len; $i += $stride) { if (substr($seq,$i,$slen) eq $tst) { &gline($i/$xscale,$y0,$i/$xscale,$y1); } } } sub checkcg { $yp += 1; &gbox(1./$xscale,$yp,$len/$xscale,$yp+$yheight); &findseq(1,1,"cg",$yp, $yp+$yheight); $oldtotal = 0; $oldx = 0; $total = 0; $histw = 100; for ($i=0;$i<$len;$i++) { $bases{substr($seq,$i,1)}++; } $ct = $bases{"c"}; $gt = $bases{"g"}; $at = $bases{"a"}; $tt = $bases{"t"}; $bscale = 4; #inches per fraction $exp = $ct*$gt/$len/$len; &gline(0/$xscale, $yp-$exp*$bscale, $len/$xscale,$yp-$exp*$bscale); $p = int($exp*100); >ext($len/$xscale,$yp-$exp*$bscale,$p . "%"); for ($i=0;$i<$len;$i++) { if (0) { $cnt++; if (substr($seq,$i,2) eq "cg") { $total++; } if ($i-$histw>0 && substr($seq,$i-$histw,2) eq "cg") { $total--; } if (1 || $cnt==$histw || $i==$len-1) { $cnt = $histw; &gline($oldx/$xscale, $yp-$oldtotal*$bscale/$cnt, ($i-$histw/2.)/$xscale,$yp-$total*$bscale/$cnt); $oldtotal = $total; $oldx = $i-$histw/2.; $cnt = 0; } } else { $cnt++; if (substr($seq,$i,2) eq "cg") { $total++; } if ($cnt==$histw) { &gline($oldx/$xscale, $yp-$oldtotal*$bscale/$cnt, ($i-$histw/2.)/$xscale,$yp-$total*$bscale/$cnt); $oldtotal = $total; $oldx = $i-$histw/2.; $cnt = 0; $total = 0; } } } } # ------------ # Graphics functions. These functions generate MIF code. # ginit sub ginit { print " #Generated by genbank2mif\n"; } # gend sub gend { print "# End of MIFFile\n"; } # gline(x0,y0,x1,y1) sub gline { local($x0,$y0,$x1,$y1) = @_; print " 0) { print " \n"; } print " \n"; print " \n"; print "> #end of PolyLine\n"; } # gbox(x0,y0,x1,y1,fill) sub gbox { local($x0,$y0,$x1,$y1,$fill) = @_; print " 0) { print " \n"; } if ($fill) { print " \n"; print " >\n"; } else { print " \n"; } $w = $x1-$x0; $h = $y1-$y0; print " \n"; print " \n"; print "> #end of Rectangle\n"; } # ggroup: Group together all elements until gendgroup sub ggroup { $gc++; $group = $gc; } # gendgroup sub gendgroup { # Should pop group print "\n"; if (0) { print " \n"; } print "> #end of Group\n"; $group = 0; } #gtext (x,y,text) sub gtext { local($x,$y,$text) = @_; print " 0) { print " \n"; } print " \n"; print " \n"; print " \n"; print "> #end of TextLine\n"; }