#!/usr/local/bin/perl -w
# msf2alpro.pl
# Usage: msf2alpro.pl file.msf > protseq
# Convert MSF multiple alignment format to alpro format
# to generate sequence logos.
# Mike Sauder, msauder@stromix.com 11-29-00
# Replacing > with * is not necessary in newest version of 'alpro'
# Append . to end of sequences; Replace . with - and remove spaces.

use strict;
use warnings;

# Number of sequence characters printed per output line.
my $width = 60;

# Only this many whitespace-separated fields after the sequence name are
# taken from each alignment row; any further fields on the row are ignored.
# NOTE(review): presumably MSF rows never carry more than five residue
# blocks -- confirm before raising or removing this cap.
my $max_blocks = 5;

# Run the conversion only when executed as a script, so the subroutines
# below can be loaded (e.g. by a test) without touching @ARGV.
main(@ARGV) unless caller;

# main(@args)
#   Command-line driver.  Prints the help text (and dies) when no file is
#   given or the first argument is -h / -help; otherwise reads the MSF
#   alignment named on the command line and writes it to STDOUT in alpro
#   format.  Dies with an explanation on STDERR when the input contains no
#   '//' line.
sub main {
    my @args = @_;

    if (!defined $args[0] || $args[0] eq "-h" || $args[0] eq "-help") {
        printhelp();
    }
    my $filename = $args[0];

    # ARGV is the handle behind <>; it walks the files named in @ARGV.
    my ($seq_of, $found_separator) = read_msf(\*ARGV);

    if (!$found_separator) {
        print STDERR "* Not an MSF alignment file; no // line.\n";
        print STDERR "Please specify an MSF alignment file on the command line.\n";
        die "\n";
    }

    print "* ${filename}; alpro format for use with Sequence Logos (seqlogo).\n";

    for my $name (sort keys %{$seq_of}) {
        print ">$name \n";
        (my $seq = $seq_of->{$name}) =~ tr/./-/;    # Convert . to -
        printseq($seq . ".");                       # alpro sequences end in .
    }
    return;
}

# read_msf($fh)
#   Reads alignment rows from the filehandle $fh.  Everything up to and
#   including the first line starting with '//' is skipped; after it, lines
#   that are empty or start with a space are skipped as well.  On each
#   remaining line the first field is the sequence name and the following
#   fields (at most $max_blocks of them) are appended to that name's
#   sequence.
#   Returns a list: a reference to a hash mapping name => concatenated
#   sequence, and a flag that is true when a '//' line was seen.
sub read_msf {
    my ($fh) = @_;

    my %seq_of;
    my $found_separator = 0;

    while (my $line = <$fh>) {
        chomp $line;

        if ($line =~ m{^//}) {
            $found_separator = 1;
            next;
        }
        next if !$found_separator;
        next if $line =~ /^ /;
        next if $line eq "";

        my ($name, @blocks) = split ' ', $line;

        # A line with no name or no residue blocks contributes nothing
        # (and must not create an empty entry for the name).
        next if !defined $name || !@blocks;

        splice @blocks, $max_blocks if @blocks > $max_blocks;
        $seq_of{$name} .= join '', @blocks;
    }

    return (\%seq_of, $found_separator);
}

# printseq($seq)
#   Format sequence at $width columns: prints $seq to the currently selected
#   output handle, $width characters per line, each line terminated by a
#   newline.  No trailing empty line is printed when the length of $seq is
#   an exact multiple of $width; an empty $seq prints nothing.
sub printseq {
    my ($seq) = @_;

    my $length = length $seq;
    for (my $offset = 0; $offset < $length; $offset += $width) {
        print substr($seq, $offset, $width), "\n";
    }
    return;
}

# printhelp()
#   Prints the usage text to STDOUT and then dies (message is a bare
#   newline), so it never returns to the caller.
sub printhelp {
    print <<'EOF';

NAME
    msf2alpro.pl - Convert MSF alignment format to alpro format

USAGE
    msf2alpro.pl file.msf > protseq

DESCRIPTION
    msf2alpro.pl converts a MSF format multiple alignment file to
    alpro format for generating Sequence Logos based on the 'alpro'
    and 'makelogo' programs by Tom Schneider.

EOF
    die "\n";
}

=pod

# protseq.demo - sample alpro format:

* protseq.globin: aligned globin sequences
>Name1 description
---------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK-
---LHVDPANFRLLGNVFITVLARHF-QH-EFTPELQ-.
>Name2 description
---------LTAAEKTQVTNLWGKV--NV---KELGGEALSRLLVVYPWTRRFFEHFGDL
STAEAVL-----HNA---KVLAHG-EKVLTSFGEGLKHLDNLKGTF--ADL-SELHCDK-
---LHVDPENFRLLGNVLVIVLARHF-GK-EFTPDVQ-.
>Name3 description
---------LTAEEKAAVTGFWGKV--KV---DEVGAEALGRLLVVYPWTQRFFEHFGDL
SSADAVM-----NNA---KVKAHG-KKVLDSFSNGMKHLDDLKGTF--AQL-SELHCDK-
---LHVDPENFKLLGNVLVVVLARHH-GS-EFTPLLQ-A.

=cut