#!/usr/local/bin/perl -w
# clustalw2alpro.pl
# Usage: clustalw2alpro.pl file.aln > protseq
# Convert ClustalW multiple alignment format to
# alpro format to generate sequence logos.
# Mike Sauder, msauder@stromix.com 11-29-00
# Replacing > with * is not necessary in newest version of 'alpro'
# Append . to end of sequences

use strict;
use warnings;

my %seqs  = ();    # sequence name => concatenated aligned residues
my $ok    = 0;     # set to 1 once a CLUSTAL header line has been seen
my $width = 60;    # number of columns per output line (read by printseq)

# Show the help text when no file is given or help is requested.
# printhelp() never returns: it dies after printing.
if (   !defined $ARGV[0]
    || $ARGV[0] eq '-h'
    || $ARGV[0] eq '-help'
    || $ARGV[0] eq '--help' )
{
    printhelp();
}

# Remember the name now: reading through <> shifts it off @ARGV.
my $filename = $ARGV[0];

while ( my $line = <> ) {

    # Strip LF as well as CRLF line endings so that "blank" lines in
    # DOS-formatted files really are empty.
    $line =~ s/[\r\n]+\z//;

    if ( $line =~ /^CLUSTAL/ ) {
        $ok = 1;
        next;
    }

    # Ignore everything before the header.
    next unless $ok;

    # Skip empty lines and lines that begin with whitespace; sequence
    # rows start with the sequence name in column one.
    next if $line eq '' || $line =~ /^\s/;

    # A sequence row is "name residues [anything else]"; only the first
    # two fields are used.
    my ( $name, $residues ) = split ' ', $line;
    next unless defined $residues;

    # The same name recurs in every alignment block; join the pieces.
    $seqs{$name} .= $residues;
}

if ( !$ok ) {
    print STDERR "* Not a ClustalW alignment file; no CLUSTAL header line.\n";
    print STDERR "Please specify a ClustalW alignment file on the command line.\n";
    die "\n";
}

print "* ${filename}; alpro format for use with Sequence Logos (seqlogo).\n";

foreach my $name ( sort keys %seqs ) {
    print ">$name \n";

    # Each sequence is written with a trailing "." appended.
    printseq( $seqs{$name} . '.' );
}

# printseq($seq)
# Print $seq to STDOUT wrapped at $width columns, one chunk per line.
# Stepping by offset (rather than counting int(length/width)+1 chunks)
# avoids an extra empty line when the length is an exact multiple of
# $width. An empty $seq prints nothing.
sub printseq {
    my ($seq) = @_;
    my $length = length $seq;

    for ( my $offset = 0 ; $offset < $length ; $offset += $width ) {
        print substr( $seq, $offset, $width ), "\n";
    }

    return;
}

# printhelp()
# Print the help text to STDOUT, then terminate the program via die.
sub printhelp {
    print <<'EOF';

NAME
     clustalw2alpro.pl - Convert ClustalW alignment to alpro format

USAGE
     clustalw2alpro.pl file.aln > protseq

DESCRIPTION
     clustalw2alpro.pl converts a ClustalW multiple alignment file to
     'alpro' format to create Sequence Logos based on the 'alpro' and
     'makelogo' programs by Tom Schneider.

EOF
    die "\n";
}

=pod

# protseq.demo - sample alpro format:

* protseq.globin: aligned globin sequences
*Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG
---------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK-
---LHVDPANFRLLGNVFITVLARHF-QH-EFTPELQ-.
*African Elephant HEMOGLOBIN BETA CHAIN - AFRICAN ELEPHANT
---------LTAAEKTQVTNLWGKV--NV---KELGGEALSRLLVVYPWTRRFFEHFGDL
STAEAVL-----HNA---KVLAHG-EKVLTSFGEGLKHLDNLKGTF--ADL-SELHCDK-
---LHVDPENFRLLGNVLVIVLARHF-GK-EFTPDVQ-.
*Goat HEMOGLOBIN BETA-A CHAIN - GOAT
---------LTAEEKAAVTGFWGKV--KV---DEVGAEALGRLLVVYPWTQRFFEHFGDL
SSADAVM-----NNA---KVKAHG-KKVLDSFSNGMKHLDDLKGTF--AQL-SELHCDK-
---LHVDPENFKLLGNVLVVVLARHH-GS-EFTPLLQ-A.

=cut