#!/usr/bin/perl -w =pod =head1 NAME bootfam - Create families based on their phylogeny and species tree reconciliation =head1 SYNOPSIS bootfam [] =head1 OPTIONS =over 4 =item B<-h, -u> Help text. =item B<< -a >> This option needs a makeover! Right now it means: "Don't look for kalign". In the future, we will use it to allow opening alignments in different formats. Or to say: "Do align!" =item B<< -c >> Compute consensus tree and put it in . This consensus tree is rooted. =item B<< -d >> B decides how to estimate protein distance, it is either 'lapd' or 'protdist'. =item B<< -D >> This option has two meanings: either create or read from an intermediate result file. Since bootstrapped distances are computationally expensive, one should if possible avoid this step. If any experimentation with parameters for steps after bootstrapping is to be expected, this option should be used. If the file does I<< not >> exist, then bootfam creates this file, proceeds with computing distances for all bootstrap replicates, I<< and >> saves the distances in the named file. If the file exists, bootfam tries to read a set of distances from that file and continues to the next step, hence avoiding the costly distance step. Note that you cannot, yet, bypass bootfam's distance estimation and let bootfam read your own bootstrapped distances. This is because bootfam renames sequences internally to avoid issues with Phylip's requirements on sequence names. =item B<< -m >> Decide how to handle families that fall under the min bootstrap threshold (see -b). Default is 'strict', which means group everything unsupported together and label it as "Unclear". The 'extended' mode reports non-conflicting families in a greedy fashion (highest support first), while 'merge' simply merge all unsupported and overlapping family candidates. This differst from 'strict' in that we may have several groups that are non-overlapping and unsupported and 'strict' would put them all in the same family. =item B<< -r >> Specify the number of replicates in the bootstrap process. Default: 100 =item B<< -b >> Experimental: Minimum bootstrap value to report families for. Everything below this is called "Unclear". =item B<-H> Let the output be HTML formatted for easy viewing in a web browser. =item B<-q> Quiet operation, no verbose messages. =head1 DESCRIPTION More later. If the sequences are all the same length, it is assumed that they are aligned. Otherwise, kalign is invoked to create an alignment. The seq-to-species file should map each sequence accession to a _short_ string, preferrably two or three character, that can be used as a prefix to sequence names. =head1 TO DO Feature: Output ortholog predictions a' la RIO. =head1 Authors =item Lars Arvestad =item Petra Leimlehner =cut use strict; use Pod::Usage; use Pod::Text; use File::Temp qw/ tempfile /; use Getopt::Std; #use lib '/afs/pdc.kth.se/home/l/leiml/space/Programs/bioperl-1.5.0'; use Bio::SeqIO; use Bio::AlignIO; use Graph::UnionFind; use Data::Dumper; # Globals, for options etc my $verbose = 1; my $html_output = undef; my $replicates = 100; my $dont_align = undef; # or the format of the input alignment, or 'distancematrices'... my $must_save_distances= undef; # user requested distmatrices to be written to file $inout_distfile my $inout_distfile = undef; # If defined, contains distance matrices my $wastemode = 'strict'; my $consensustree = undef; my $kalign = 'kalign'; my $java = 'java'; my $notung = 'notung'; my $reconcile = 'reconcile'; my $chainsaw = 'chainsaw'; my $neighbor = 'neighbor'; my $consense = 'consense'; my $destimator = 'protdist'; my $lapd = 'bootstrap_lapd'; my $protdist = 'protdist'; my $seqboot = 'seqboot'; my $min_boot_value = 0.5; # Experimental: Only theoretically valid value is 0.5. my $min_merge_value = 0.0; # Don't merge families not reaching this threshold my $commandline = join(' ', @ARGV); ### Get input and options ############################################ my %opts = (); getopts('a:hub:c:m:r:qd:D:vH', \%opts); if (defined $opts{'h'}) { pod2usage(-verbose => 2, -exitval => 0); } if (defined $opts{'u'}) { pod2usage(-exitval => 0); } if (defined $opts{'H'}) { $html_output = 1; } if (defined $opts{'b'}) { my $tmp = $opts{'b'}; if ($tmp =~ m/^0\.\d+$/) { $min_boot_value = $tmp; } else { print STDERR "Illegal value for option -b: $tmp\n"; exit 8; } } if (defined $opts{'a'}) { $dont_align = $opts{'a'}; } if (defined $opts{'c'}) { $consensustree = $opts{'c'}; } if (defined $opts{'m'}) { $wastemode = $opts{'m'}; if (! ($wastemode eq 'strict' || $wastemode eq 'extended' || $wastemode eq 'merge')) { exit 9; } } if (defined $opts{'D'}) { # Read or save distance data $inout_distfile = $opts{'D'}; if (stat($inout_distfile)) { # Distance matrices are already available in $inputfile $dont_align = 'distancematrices'; } else { $must_save_distances = 1; } } if (defined $opts{'d'}) { if ($opts{'d'} eq 'lapd') { $destimator = 'lapd'; } elsif ($opts{'d'} eq 'protdist') { $destimator = 'protdist'; } else { print STDERR "Unknown program: ", $opts{'d'}, " please reconsider lapd or protdist."; exit 7; } } if (defined $opts{'r'}) { my $tmp = $opts{'r'}; if ($tmp =~ m/^\d+$/) { $replicates = $tmp; } else { print STDERR "Option '-r' (bootstrap replicates, default $replicates) requires an integer argument."; exit 6; } } if (defined $opts{'q'}) { $verbose = 0; } elsif (defined $opts{'v'}) { $verbose = 2; } if (scalar(@ARGV) != 3) { pod2usage(-message => 'Wrong number of arguments', -exitval => 1); } ### Main, more or less ########################################################### my $seqfile = shift; my $speciestreefile = shift; my $gene2speciesfile = shift; my @program_list = ($java, $notung, $reconcile, $chainsaw, $seqboot, $neighbor); if (!$dont_align) { push @program_list, $kalign; } if (!defined $inout_distfile) { if ($destimator eq 'lapd') { push @program_list, $lapd; } else { push @program_list, $protdist; } } if (defined $consensustree) { push @program_list, $consense; } check_for_programs(\@program_list); my @obstructions; foreach my $fname ('infile', 'outfile', 'outtree', 'treefile') { if (check_for_file($fname)) { push @obstructions, $fname; } } if (@obstructions == 1) { print STDERR "Error: The file ", join(', ', @obstructions), " in the current directory will obstruct\nthe Phylip programs that BootFam will run. Please remove it!\n"; exit 2; } elsif (@obstructions > 0) { print STDERR "Error: The files ", join(', ', @obstructions), " in the current directory, and its presence\nwill obstruct the Phylip programs that BootFam will run. Please remove them!\n"; exit 2; } ### Read input ######################################################## # We want to replace the real sequence ids with our own temporary identifiers # since the stupid phylip programs cut them off. # First two values are hashes, the second is the name of the sequence # file name with short accessions. my ($shortened, $seq, $sfile, $sameLength) = readData($seqfile, $gene2speciesfile); if ($sameLength) { $dont_align = 1; } ### Align the sequences and put the result in a new file my $phylipfile=''; if ($dont_align) { # OK, aligned input or distance already computed if (!$inout_distfile) { # Aha, distances not computed, but a phylip file for input $phylipfile = fasta2phylip($sfile, undef); } } else { my ($ah, $afile) = my_tempfile(); close($ah); $phylipfile = align($sfile, $afile); } ### Time for distances my $dfile = undef; if (defined $dont_align && $dont_align eq 'distancematrices') { $dfile = $inout_distfile; } else { # Bootstrap the aligned files my $bfile = bootstrap($phylipfile, $replicates); # Call distance estimation $dfile= distance_estimation($bfile, $replicates); # Save this intermediate result if asked to if ($must_save_distances) { `cp $dfile $inout_distfile`; } } ### Call Neighbor my $njfile = neighbor_joining($dfile, $replicates); ### Reroot all trees to minimize duplications my $rerootedfile = reroot_all_trees($njfile, $speciestreefile); ### Possibly produce a consensus tree here! if (defined $consensustree) { make_consensus_tree($rerootedfile, $consensustree); } ### Get reconciliations and cut the trees up! my $bootfams = reconcile_and_cut($rerootedfile, $speciestreefile); my $fam_h = consensus_families($bootfams, $replicates); if ($html_output) { html_consensus_families($fam_h); } else { output_consensus_families($fam_h); } ############################## subroutines ######################### # available options: # -d name database name sub checkOptions { } # end checkOptions # # Read sequence data # # Returnvalues: # - Name hash, short name -> long name # - Sequence hash, accession -> seq # - File name for Fasta file of sequences. # - Flag: 1 if same length seqs, 0 if not. # sub readData { my ($seqfile, $gene2speciesfile) = @_; my %shortened = (); my %seq = (); my $in = Bio::SeqIO->new(-file => $seqfile, -format => 'Fasta'); # Get species annotation. my $gs = get_gs($gene2speciesfile); # Store sequences here: my $i=0; my %lenStore=(); # Record the different sequence lenghts while ( my $s = $in->next_seq() ) { my $id = $s->primary_id(); if (!defined $gs->{$id}) { print STDERR "Error: No species information for $id!\n"; exit 3; } my $prefix = $gs->{$id}; my $short = "$prefix$i"; if (length $short > 10) { print STDERR "Error: The species tag is too long. Phylip cannot handle\na long name such as '$short'\n"; exit 4; } $seq{$short} = $s->seq(); $shortened{$short} = $s->primary_id(); $lenStore{length($s->seq())} = undef; # print $short, "\t", $shortened{$short}, "\n"; $i++; } # Are the sequences of same length? my $samelength; if (scalar(keys(%lenStore)) == 1) { $samelength = 1; } else { $samelength = 0; } # Write a fasta file with short accessions my ($sh, $sfile) = my_tempfile(); foreach my $s (keys %seq) { print $sh ">$s\n" . $seq{$s} . "\n"; } close($sh); return (\%shortened, \%seq, $sfile, $samelength); } # # Convert alignment files # sub fasta2phylip { my ($fastafile, $phylipfile) = @_; if (!defined $phylipfile) { $phylipfile = $fastafile . ".phylip"; } my $in = Bio::AlignIO->new(-file => $fastafile, -format => 'fasta'); my $phylip = Bio::AlignIO->new(-file => ">$phylipfile", -format => 'phylip'); while (my $aln = $in->next_aln()) { report("Alignment #sequences: " . $aln->no_sequences()); report(" length: " . $aln->length()); report(" residue density: " . sprintf("%.2f", (100 * $aln->no_residues() / ($aln->no_sequences() * $aln->length()))) . " %"); report(" average identity: " . sprintf("%.2f", $aln->average_percentage_identity()) . " %"); $phylip->write_aln($aln); } return $phylipfile; } # # Align sequences with kalign and reformat for phylip # sub align { my $infile = shift @_; my $outfile = shift @_; # call kalign # todo set path report("Aligning..."); my $signo = system("$kalign -i $infile -o $outfile 2&> /dev/null"); if ($signo &= 127) { exit 1; } elsif ($signo) { die "Could not run '$kalign' (error code: $?)"; } my $newfilename = undef; $newfilename = fasta2phylip($outfile, $newfilename); return $newfilename; } # end kalign # # # sub bootstrap { my ($fromfile, $replicates) = @_; report("Bootstrapping $replicates replicates..."); return call_phylip_prog($seqboot, "$fromfile\nr\n$replicates\ny\n17\n"); } # sub distance_estimation { my ($infile, $replicates) = @_; report("Distances..."); if ($destimator eq 'protdist') { return call_phylip_prog($protdist, "$infile\nm\nd\n$replicates\ny\n" ); } elsif ($destimator eq 'lapd') { return call_lapd($infile, $replicates); } else { die "Ohh, bad bug in distance_estimation, don't know about $destimator!"; } } # # Call Phylip's neighbor and return a file with a copy of 'outtree'. # sub neighbor_joining { my ($infile, $replicates) = @_; report("Building trees..."); my $instructions = "$infile\nm\n$replicates\n17\ny\n"; my $res = call_phylip_prog($neighbor, $instructions, 'outtree'); unlink 'outfile'; return $res; } # # Ask Phylip for a consensus tree. Tree is put in requested file. # sub make_consensus_tree { my ($infile, $outfile) = @_; report("Consensus tree being put in file '$outfile'"); my @lines = (); open(F, "<$infile") or die "Cannot open $infile for reading"; open(O, ">$infile.cons") or die "Cannot open $infile for writing."; while () { s/\[[^\[\]]+\]//g; print O; } close(F); close(O); my $instructions = "$infile.cons\nr\ny\n"; my $res = call_phylip_prog($consense, $instructions, 'outtree'); unlink 'outfile'; my $ctree=''; open(C, "<$res") or die "Bug-a-loo!"; while () { chomp; $ctree .= $_; } close(C); my $newtree = transform_string($ctree); open(O, ">$outfile") or die "Bug: Could not write to '$outfile'"; print O "\[Consensus tree from BootFam. Command line: $commandline\]\n"; print O $newtree, "\n"; close(O); # system("cp $res $outfile"); unlink($res); } # # Generic way of calling Phylip programs. # sub call_phylip_prog { my ($prog, $instructions, $resfile) = @_; # Where to store instructions my ($ih, $ifile) = my_tempfile(); # Where the result should go my ($rh, $rfile) = my_tempfile(); close($rh); # write inputfiles for phylip programs print $ih $instructions; close $ih; # report("Instructions in $ifile"); `rm -f outfile`; my $cmd = "$prog < $ifile > /dev/null"; if ($verbose > 1) { $cmd = "$prog < $ifile"; } my $res = system($cmd); if ($res != 0) { unlink 'outfile'; unlink 'outtree'; my $exitval = $res >> 8; my $signalnum = $res & 127; # my $dumpedcore = $res & 128; print STDERR "Phylip instructions:\n$instructions\n"; die "Error running '$prog' (exit code $exitval, signal $signalnum)"; } if (!defined $resfile) { $resfile = 'outfile'; } `cp $resfile $rfile`; unlink $resfile; # Could simply us `mv ...` here, but file # when issues may occur over file system boundaries. # Yeah, I am an AFS user. return $rfile; } # # Call lapd. Duh. # sub call_lapd { my ($infile, $replicates) = @_; # Where the result should go my ($rh, $rfile) = my_tempfile(); close($rh); my $lapd_verbosity = ''; if ($verbose > 1) { $lapd_verbosity = '-v'; } # Note that is 'bootstrap_lapd', not 'lapd', that we are calling here. my $signo = system($lapd, $replicates, $infile, $rfile, $lapd_verbosity); if ($signo &= 127) { exit 1; } elsif ($signo) { die "Error running 'lapd' (or 'bootstrap_lapd', error code is " . ($? >> 8) . ")"; } return $rfile; } # # Reroot # # For each tree (bootstrapped) in input file, # call Notung and request a re-rooting so that the number of duplications # is minimized. sub reroot_all_trees { my ($infile, $speciestreefile) = @_; report("Rerooting..."); my ($resh, $resfile) = my_tempfile(); close($resh); open(I, "<$infile") || die "Could not open $infile for reading."; # Cannot assume one tree per line... $/ = ';'; # Let semi colon separate inputs while () { # Write each gene tree to a file and let Notung work on it. # Notung can only handle one tree at the time. if (m/\(.+\)/) { my $gtree = $_; my ($th, $tfile) = my_tempfile(); print $th $gtree; close($th); run_notung($tfile, $speciestreefile, $resfile); } } close(I); $/ = "\n"; return $resfile; } # # Run notung # # Result is appended to $resfile # sub run_notung { my ($gtreefile, $streefile, $resfile) = @_; # Assemble the options my $options = "-g $gtreefile -s $streefile"; $options .= " --root"; # Ask for rerooting $options .= " --speciestag prefix"; # Species is given by seq acc prefix $options .= " --treeoutput nhx"; # Reconcile can read NHX format! No need for a GS file... $options .= " --outputdir /tmp"; # Don't want output in current dir! $options .= " --nolosses"; # Do not indicat gene losses in output tree $options .= " --maxtrees 1"; # One optimal solution suffices $options .= " --edgeweights nhx"; # This is to avoid that Notung reads edge lenghts at all. # The problem with Notung reading the edge weights is that it re-prints "0.0001" as # "1e-4", which the Phylip program consense cannot read... if ($verbose < 2) { $options .= " --silent"; # No verbose messages } my $signo = system($notung, $options); system("cat $gtreefile.rooting.0 >> $resfile"); if ($signo &= 127) { exit 1; } elsif ($signo) { die "Error running 'notung' (error code is " . ($? >> 8) . ")"; } elsif (! check_for_file($resfile)) { print STDERR "\nDo you have all the necessary species in the species tree?\n"; die "Could not reconcile tree"; } } # sub get_gs { my ($gsfile) = @_; my %gs; open(GS, "<$gsfile") || die "Cannot open $gsfile."; while () { if (m/(\S+)\s+(\S+)/) { $gs{$1} = $2; } } return \%gs; } # sub get_speciestree { my ($spfile) = @_; my $tree; open(GS, "<$spfile") || die "Cannot open $spfile."; while () { if (m/\S/) { $tree .= $_; } } return $tree; } # # Produce reconciliations for each tree and run chainsaw on it # sub reconcile_and_cut { my ($infile, $spfile) = @_; my ($resh, $resfile) = my_tempfile(); close($resh); open(I, "<$infile") || die "Could not open $infile for reading."; report("Reconciling and cutting..."); # Assume one tree per line while () { # Write each gene tree to a file my $gtree = $_; my ($th, $tfile) = my_tempfile(); print $th $gtree; close($th); my $r = run_reconcile_and_chainsaw($tfile, $speciestreefile); system("cat $r >> $resfile"); } close(I); return $resfile; } # sub run_reconcile_and_chainsaw { my ($tfile, $spfile) = @_; my ($resh1, $resfile1) = my_tempfile(); close($resh1); my ($resh2, $resfile2) = my_tempfile(); close($resh2); system("$reconcile $tfile $spfile >> $resfile1"); system("$chainsaw $resfile1 $spfile >> $resfile2"); return $resfile2; } # # There are some files that can obstruct Phylip and we want to ask the user # to remove them. # sub check_for_file { my ($fname) = @_; if (stat $fname) { return $fname; } else { return 0; } } # # Make sure we can run the programs needed # sub check_for_programs { my ($prog_list_ref) = @_; my @inaccessibles = (); foreach my $p (@$prog_list_ref) { if (!perl_which($p)) { push @inaccessibles, $p; } } if (@inaccessibles > 0) { print STDERR "BootFam: The following programs could not be run:\n\t", join("\n\t", @inaccessibles), "\n"; exit 5; } } # # From sbc-staff@pdc.kth.se, avoid using 'which' or 'type' # # Returns the path to a program if it is in the path, otherwise return undef sub perl_which { my $arg = shift; my $path = $ENV{'PATH'}; my @p = split (':', $path); foreach my $i (@p) { # print STDERR "$i\n"; my $exe = "$i/$arg"; if (-x $exe) { return $exe; } } return undef; } # # Read bootstrapped cuts from file # # Returns: A pair of hashrefs to a cut hash table and # a hash keeping track of what ids are present in a cut. sub read_cuts { my ($cutfile) = @_; my %cuts=(); my %cut_presence=(); # Keep track of which ids are present in a cut. We want to # report those genes that do not end up in a consensus family! open (F, "<$cutfile") || die "Bug: Could not open file $cutfile"; my $current_cut=''; while () { chomp; if (m/^Cut\s+(\S+):\s*$/) { $current_cut = $1; } elsif (m/EndCut/) { # Do nothing I guess. } else { # Must have found a potential subfamily # Extract members, sort them alphabetically, and index them my @l = sort {$a cmp $b} split(/,\s*/, $_); foreach my $id (@l) { $cut_presence{$current_cut}{$id}=undef; # "Yes, $id belongs to this cut, will it appear in a family later? } my $famname = join(',', @l); $cuts{$current_cut}{$famname}++; } } close(F); return (\%cuts, \%cut_presence); } # # Consense families # # Output all cuts that have a bootstrap of at least 50% # sub output_consensus_families { my ($cuts) = @_; # Extract potential subfamilies my @namelist = keys %$cuts; my $namelen = name_width(\@namelist); # Walk through cuts and see what consensus families they define report(""); # Get an empty line in case we have had diagnostic output. print "# BootFam: $commandline\n"; print "#\n"; print "# Group", ' 'x ($namelen - 4), " # Support Size Accessions\n"; foreach my $cut (keys %$cuts) { my $cut_results = $cuts->{$cut}; my $cut_name_str = $cut . (' ' x ($namelen - length($cut))); my @cut_list = sort keys %{$cuts->{$cut}}; my $num = 'A'; foreach my $f (@cut_list) { my $f_boot = $cuts->{$cut}{$f}{support}; my $count = $cuts->{$cut}{$f}{count}; my $contents = $cuts->{$cut}{$f}{contents}; if ($f_boot =~ m/Unclear/) { printf("%s %3s Unclear %3d %s\n", $cut_name_str, $num, $count, $contents); } else { printf("%s %3s %.2f %3d %s\n", $cut_name_str, $num, $f_boot, $count, $contents); } $num++; } print "\n"; } } # # Consense families in HTML # # Output HTML for subfamilies # sub html_consensus_families { my ($cuts) = @_; # Walk through cuts and see what consensus families they define report(""); # Get an empty line in case we have had diagnostic output. print "\nBootFam output\n"; print "\n\n"; print "\n

BootFam output

\nCommandline: $commandline\n"; print "\n\n"; foreach my $cut (sort keys %$cuts) { my $cut_results = $cuts->{$cut}; my $new_group = 1; my @cut_list = sort keys %{$cuts->{$cut}}; my $num = 'A'; foreach my $f (@cut_list) { my $f_boot = $cuts->{$cut}{$f}{support}; my $count = $cuts->{$cut}{$f}{count}; my $contents = $cuts->{$cut}{$f}{contents}; my $formatstring; if ($new_group) { $new_group = 0; # Don't print cut name next time, and no line until new group. my $cutname; # Let's shorten hierarchical names! a/b/c/d => c/d if ($cut =~ m/([^\/]+\/[^\/]+)$/) { $cutname = $1; } else { $cutname = $cut; } $formatstring = "\n"; printf($formatstring, $cutname, $num, $f_boot, $count, $contents); } else { $formatstring = "\n"; printf($formatstring, $num, $f_boot, $count, $contents); } $num++; } } print "
Group#SupportSizeAccessions
%s %s %s %s %s
%s %s %s %s
\n\n"; } # # Find consensus families # # This function expects cuts in the gene tree to be summarized # in the following format: # Cut X: # # EndCut # .... # where X is a (clade) name or an integer repesenting a node in # the species tree. # sub consensus_families { my ($cutfile, $replicates) = @_; my ($cuts, $cut_presence) = read_cuts($cutfile); # Keep track of which ids are present in a cut using cut_presence. # We want to make sure that we don't forget genes in badly defined families. # Store families in this hash table: $family{$cutname}{$integer} is a string? my %family = (); # Walk through cuts and see what consensus families they define # $cut is typically a node name or id foreach my $cut (keys %$cuts) { my $cutnum = 0; my $cut_results = $cuts->{$cut}; # Study each potential subfamily: How often do we see it? # First compute bootstrap support so we can sort it in the output my %booting=(); foreach my $f (keys %$cut_results) { my $f_boot = $cut_results->{$f} / $replicates; $booting{$f} = $f_boot; } foreach my $f (sort {$booting{$b} <=> $booting{$a}} keys %booting) { my $f_boot = $booting{$f}; if ($f_boot > $min_boot_value) { add_family($cut, \%family, $cutnum++, $f, $f_boot, $cut_presence); delete $booting{$f}; } else { last; } } # # "Extended majority" means to continue collect subfamilies, even though # their bootstrap support is less than 0.5 (or whatever threshold we have chosen). # if ($wastemode eq 'extended') { foreach my $f (sort {$booting{$b} <=> $booting{$a}} keys %booting) { my $f_boot = $booting{$f}; if (!incompatible_family($f, $cut_presence, $cut)) { add_family($cut, \%family, $cutnum++, $f, $f_boot, $cut_presence); } delete $booting{$f}; } } elsif ($wastemode eq 'merge') { # "Merging" means to join families containing uncertain genes. $cutnum = merge_unstable($cut, $cutnum, \%family, \%booting, $cut_presence); } if ($wastemode eq 'strict') { # Do we need to do anything special here? Don't think so. } # If "strict majority", we should end up here right away. # Regardless of value on $wastemode, we should end up here and # pick up the genes that has not been put anywhere. if (scalar(keys %{$cut_presence->{$cut}}) > 0) { my $orphan_ids = join(', ', keys %{$cut_presence->{$cut}}); add_family($cut, \%family, $cutnum++, $orphan_ids, 'Unclear', $cut_presence); } } return \%family; } sub add_family { my ($cut, $famh, $cutnum, $f, $f_boot, $cut_presence) = @_; my ($prettyfam, $count) = transform_id_list($f); $famh->{$cut}{$cutnum}{contents} = $prettyfam; # Ex: "BAG5_HUMAN, GAB17_DOG" $famh->{$cut}{$cutnum}{support} = $f_boot; # Ex: "0.78" $famh->{$cut}{$cutnum++}{count} = $count; # Ex: "11" (as in 11 genes) remove_from_focus($cut, $f, $cut_presence); } # # Check if potential family $f contains a gene/protein which has already been # declared as being contained in another family. # # "cut_presence" is a hash table with cut names and gene names as hash keys. # sub incompatible_family { my ($f, $cut_presence, $cut) = @_; foreach my $id (split(/,\s*/, $f)) { if (exists $cut_presence->{$cut}{$id}) { next; } else { return 1; # Oh, one of our genes is already taken } } return 0; } # # merge_unstable # # Overlapping, badly supported, families are to be merged. # Use the Union-Find datastructure for this. sub merge_unstable { my ($cut, $cutnum, $families, $booting, $cut_presence) = @_; my $uf = Graph::UnionFind->new(); # Put genes in "graph" and make a union-edge between those in # the same suggested subfamily my %tmp_presence = (); foreach my $f (sort {$booting->{$b} <=> $booting->{$a}} keys %$booting) { my $f_boot = $booting->{$f}; if (incompatible_family($f, $cut_presence, $cut)) { if ($cut eq 'Euteleostomi') { my ($prettyfam, $count) = transform_id_list($f); # print STDERR "$f_boot\t$f\t F = $prettyfam\n"; # print STDERR "$cut\t$f_boot\t F = $prettyfam\n"; } next; } else { if ($f_boot >= $min_merge_value) { my @genelist = split(/,\s*/, $f); if (scalar(@genelist) > 1) { # Don't bother with the trivial sets my $head = pop @genelist; $tmp_presence{$head} = undef; foreach my $gene (@genelist) { $uf->union($head, $gene); $tmp_presence{$gene} = undef; } } } } } # See which family each gene has ended up in my %tmpfamilies = (); foreach my $g (keys %tmp_presence) { my $key = $uf->find($g); # print STDERR "Gene: $g, \tKey: $key\n"; push @{$tmpfamilies{$key}}, $g; } # Register those families foreach my $f (keys %tmpfamilies) { my $family = join(', ', @{$tmpfamilies{$f}}); # print STDERR "f: $f \tfamilj: $family\n"; add_family($cut, $families, $cutnum++, $family, 'Merged', $cut_presence); delete $booting->{$f}; } # Return the last used cut number so we can continue to issue unique numbers return $cutnum; } # # The original function that I am trying to factorize: # sub original_consensus_families { # my ($cutfile, $replicates) = @_; # my ($cuts, $cut_presence) = read_cuts($cutfile); # # Keep track of which ids are present in a cut using cut_presence. # # We want to make sure that we don't forget genes in badly defined families. # # Extract potential subfamilies # my @namelist = keys %$cuts; # my $namelen = name_width(\@namelist); # # Walk through cuts and see what consensus families they define # report(""); # Get an empty line in case we have had diagnostic output. # print "# BootFam: $commandline\n"; # print "#\n"; # print "# Group", ' 'x ($namelen - 4), "Support Size Accessions\n"; # foreach my $cut (keys %$cuts) { # my $cut_results = $cuts->{$cut}; # my $cut_name_str = $cut . (' ' x ($namelen - length($cut))); # # Study each potential subfamily: How often do we see it? # # First compute bootstrap support so we can sort it in the output # my %booting=(); # foreach my $f (keys %$cut_results) { # my $f_boot = $cut_results->{$f} / $replicates; # $booting{$f} = $f_boot; # } # foreach my $f (sort {$booting{$b} <=> $booting{$a}} keys %booting) { # my $f_boot = $booting{$f}; # if ($f_boot > $min_boot_value) { # my ($prettyfam, $count) = transform_id_list($f); # remove_from_focus($cut, $f, $cut_presence); # # print "$cut_name_str $f_boot\t$prettyfam\n"; # printf("%s %.2f %3d %s\n", $cut_name_str, $f_boot, $count, $prettyfam); # } # } # if (scalar(keys %{$cut_presence->{$cut}}) > 0) { # my $orphan_ids = join(', ', keys %{$cut_presence->{$cut}}); # my ($real_orphans, $count) = transform_id_list($orphan_ids); # my $count_str = sprintf("%3d", $count); # print "$cut_name_str Unclear $count_str $real_orphans\n"; # } # } # } # If we have found a decent subfamily, we remove the ids from # our little hash table so that we can stop worry about them. sub remove_from_focus { my ($cut, $ids, $h) = @_; my @id_list = split(/,\s*/, $ids); foreach my $id (@id_list) { delete $h->{$cut}{$id}; } } # Find the length of the longest name in a list of strings sub name_width { my ($lr) = @_; my $longest = 0; foreach my $str (@$lr) { if (length($str) > $longest) { $longest = length($str); } } return $longest; } sub transform_id_list { my ($s) = @_; my @l = split(/,\s*/, $s); my $count = 0; my @new_l = (); foreach my $id (@l) { push @new_l , $shortened->{$id}; $count++; } return (join(', ', sort {$a cmp $b} @new_l), $count); } sub transform_string { my ($s) = @_; for my $id (keys %$shortened) { my $longid=$shortened->{$id}; $s =~ s/$id/$longid/; } return $s; } ### A safe temporary file generator ####################################### # # I don't want file to be removed prematurely due to garbage collection! # #my @fh_list = (); sub my_tempfile { my ($fh, $f) = tempfile(TEMPLATE => "bfXXXXXXXXX", UNLINK => 1); # push @fh_list, $fh return ($fh, $f); } ### Diagnostics support ################################################### sub report { my ($str) = @_; if ($verbose) { print STDERR "BootFam: $str\n"; } }