use strict; use SeedEnv; use Data::Dumper; use ANNOserver; my $anno = ANNOserver->new(); if (@ARGV != 3) { die "Usage: frameshift-finder dna-fasta-file protein-tbl-file protein-functions\n"; } my $fasta_file = shift @ARGV; my $tbl_file = shift @ARGV; my $function_file = shift @ARGV; my $edge_extension = 200; # # Load functions # open(FUN, "<", $function_file) or die "Cannot open function file $function_file: $!"; my %functions; while () { chomp; my($id, $function) = split(/\t/); $functions{$id} = $function; } close(FUN); # # Load location information # open(TBL, "<", $tbl_file) or die "Cannot open $tbl_file: $!"; my %locs; while () { chomp; my($fid, $loc) = split(/\t/); my ($contig, $begin, $end, $strand) = parse_location($loc); my $left = SeedUtils::min($begin, $end); my $right = SeedUtils::max($begin, $end); $locs{$fid} = [$contig, $begin, $end, $strand, $left, $right]; } close(TBL); open(FASTA, "<", $fasta_file) or die "Cannot open $fasta_file: $!"; my $resultHandle = $anno->assign_functions_to_dna(-input => \*FASTA, -kmer => 8, -minHits => 15, -maxGap => 600, ); while (my $result = $resultHandle->get_next()) { # # Each hit region is a 5-tuple consisting of the number of matches to the function, the start location, the stop location, the proposed function, and the name of the Genome Set from which the gene is likely to have originated. # my ($dna_id, $hit) = @$result; my($hits, $start, $stop, $function, $otu) = @$hit; next if $function eq 'hypothetical protein'; my $reg_left = SeedUtils::min($start, $stop) - $edge_extension; my $reg_right = SeedUtils::max($start, $stop) + $edge_extension;; print "$dna_id\t$hits\t$start\t$stop\t$function\n"; my @matching_proteins; for my $protein_id (keys %locs) { my($prot_contig, $prot_begin, $prot_end, $prot_strand, $prot_left, $prot_right) = @{$locs{$protein_id}}; if ($prot_contig ne $dna_id) { next; } if (($prot_left <= $reg_right) && ($prot_right >= $reg_left)) { push(@matching_proteins, $protein_id); } } if (@matching_proteins > 1) { @matching_proteins = sort { $locs{$a}->[2] <=> $locs{$b}->[2] } @matching_proteins; my $fn_left = $functions{$matching_proteins[0]}; my $fn_right = $functions{$matching_proteins[$#matching_proteins]}; if (($fn_left eq $fn_right) && ($fn_left eq $function)) { print "Possible FS:\t$dna_id\t$start\t$stop\t$function\n"; foreach my $prot (@matching_proteins) { my($prot_contig, $prot_begin, $prot_end, $prot_strand, $prot_left, $prot_right) = @{$locs{$prot}}; print "\t$prot\t$prot_begin\t$prot_end\t$functions{$prot}\n"; } } } } exit 0;