| Filename | /Users/ap13/pathogens/Roary/lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm |
| Statements | Executed 741331 statements in 1.51s |
| Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
|---|---|---|---|---|---|
| 8 | 1 | 1 | 877ms | 1.40s | Bio::Roary::ContigsToGeneIDsFromGFF::_build_contig_to_ids |
| 8 | 1 | 1 | 314ms | 470ms | Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids |
| 197 | 1 | 1 | 1.03ms | 1.03ms | Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap |
| 8 | 1 | 1 | 66µs | 117µs | Bio::Roary::ContigsToGeneIDsFromGFF::_build__awk_filter |
| 8 | 1 | 1 | 55µs | 55µs | Bio::Roary::ContigsToGeneIDsFromGFF::__ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] |
| 1 | 1 | 1 | 33µs | 4.18ms | Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17 |
| 1 | 1 | 1 | 10µs | 10µs | Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 |
| 1 | 1 | 1 | 10µs | 100µs | Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142 |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | package Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
| 2 | |||||
| 3 | # ABSTRACT: Parse a GFF and efficiently extract ordered gene ids on each contig | ||||
| 4 | |||||
| 5 | =head1 SYNOPSIS | ||||
| 6 | |||||
| 7 | Parse a GFF and efficiently extract ordered gene ids on each contig | ||||
| 8 | use Bio::Roary::ContigsToGeneIDsFromGFF; | ||||
| 9 | |||||
| 10 | my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( | ||||
| 11 | gff_file => 'abc.gff' | ||||
| 12 | ); | ||||
| 13 | $obj->contig_to_ids; | ||||
| 14 | |||||
| 15 | =cut | ||||
| 16 | |||||
| 17 | 2 | 48µs | 2 | 8.33ms | # spent 4.18ms (33µs+4.15) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17 which was called:
# once (33µs+4.15ms) by Bio::Roary::OrderGenes::BEGIN@21 at line 17 # spent 4.18ms making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@17
# spent 4.15ms making 1 call to Moose::import |
| 18 | 2 | 596µs | 1 | 10µs | # spent 10µs within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 which was called:
# once (10µs+0s) by Bio::Roary::OrderGenes::BEGIN@21 at line 18 # spent 10µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@18 |
| 19 | 1 | 3µs | 1 | 9.12ms | with 'Bio::Roary::ParseGFFAnnotationRole'; # spent 9.12ms making 1 call to Moose::with |
| 20 | |||||
| 21 | 1 | 3µs | 1 | 2.13ms | has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids'); # spent 2.13ms making 1 call to Moose::has |
| 22 | |||||
| 23 | 1 | 2µs | 1 | 1.86ms | has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids'); # spent 1.86ms making 1 call to Moose::has |
| 24 | 9 | 47µs | 1 | 1.79ms | # spent 55µs within Bio::Roary::ContigsToGeneIDsFromGFF::__ANON__[lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm:24] which was called 8 times, avg 7µs/call:
# 8 times (55µs+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::new at line 52 of (eval 25)[Eval/Closure.pm:125], avg 7µs/call # spent 1.79ms making 1 call to Moose::has |
| 25 | |||||
| 26 | 1 | 2µs | 1 | 1.62ms | has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10); # spent 1.62ms making 1 call to Moose::has |
| 27 | |||||
| 28 | # Manually parse the GFF file because the BioPerl module is too slow | ||||
| 29 | sub _build_contig_to_ids | ||||
| 30 | # spent 1.40s (877ms+522ms) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_contig_to_ids which was called 8 times, avg 175ms/call:
# 8 times (877ms+522ms) by Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids at line 15 of (eval 25)[Eval/Closure.pm:125], avg 175ms/call | ||||
| 31 | 8 | 8µs | my ($self) = @_; | ||
| 32 | 8 | 3µs | my %contigs_to_ids; | ||
| 33 | 8 | 4µs | my @genes_annotation; | ||
| 34 | |||||
| 35 | 8 | 16.6ms | 16 | 16.1ms | open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file"; # spent 15.5ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:open, avg 1.94ms/call
# spent 589µs making 8 calls to Bio::Roary::ParseGFFAnnotationRole::_gff_fh_input_string, avg 74µs/call |
| 36 | 8 | 63.3ms | 8 | 63.1ms | while(<$fh>) # spent 63.1ms making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 7.88ms/call |
| 37 | { | ||||
| 38 | 40008 | 8.13ms | chomp; | ||
| 39 | 40008 | 12.5ms | my $line = $_; | ||
| 40 | 40008 | 2.31ms | my $id_name; | ||
| 41 | 40008 | 194ms | 40008 | 94.7ms | if($line =~/ID=["']?([^;"']+)["']?;?/i) # spent 94.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 2µs/call |
| 42 | { | ||||
| 43 | $id_name= $1; | ||||
| 44 | } | ||||
| 45 | else | ||||
| 46 | { | ||||
| 47 | next; | ||||
| 48 | } | ||||
| 49 | |||||
| 50 | 40008 | 122ms | my @annotation_elements = split(/\t/,$line); | ||
| 51 | # Map gene IDs to the contig | ||||
| 52 | 40008 | 43.0ms | push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name); | ||
| 53 | |||||
| 54 | 40008 | 525ms | 80016 | 314ms | if($line =~/product=["']?([^;,"']+)[,"']?;?/i) # spent 192ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:readline, avg 5µs/call
# spent 123ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 3µs/call |
| 55 | { | ||||
| 56 | 40008 | 4.18ms | my %gene_data; | ||
| 57 | 40008 | 63.5ms | $gene_data{product} = $1; | ||
| 58 | 40008 | 25.3ms | $gene_data{id_name} = $id_name; | ||
| 59 | 40008 | 144ms | 59516 | 32.9ms | if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/) # spent 32.9ms making 59516 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 553ns/call |
| 60 | { | ||||
| 61 | $gene_data{database_annotation_exists} = 1; | ||||
| 62 | } | ||||
| 63 | else | ||||
| 64 | { | ||||
| 65 | 4684 | 2.60ms | $gene_data{database_annotation_exists} = 0; | ||
| 66 | } | ||||
| 67 | |||||
| 68 | 40008 | 28.4ms | $gene_data{contig} = $annotation_elements[0]; | ||
| 69 | 40008 | 17.7ms | $gene_data{start} = $annotation_elements[1]; | ||
| 70 | 40008 | 17.6ms | $gene_data{end} = $annotation_elements[2]; | ||
| 71 | 40008 | 39.7ms | push(@genes_annotation,\%gene_data); | ||
| 72 | } | ||||
| 73 | |||||
| 74 | } | ||||
| 75 | 8 | 332µs | 8 | 286µs | close($fh); # spent 286µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:close, avg 36µs/call |
| 76 | |||||
| 77 | 8 | 127µs | 8 | 170µs | $self->_genes_annotation(\@genes_annotation); # spent 170µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 21µs/call |
| 78 | 8 | 189µs | return \%contigs_to_ids; | ||
| 79 | } | ||||
| 80 | |||||
| 81 | sub _build_overlapping_hypothetical_protein_ids | ||||
| 82 | # spent 470ms (314+156) within Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids which was called 8 times, avg 58.7ms/call:
# 8 times (314ms+156ms) by Bio::Roary::ContigsToGeneIDsFromGFF::overlapping_hypothetical_protein_ids at line 12 of (eval 25)[Eval/Closure.pm:125], avg 58.7ms/call | ||||
| 83 | 8 | 7µs | my ($self) = @_; | ||
| 84 | 8 | 13µs | 8 | 18µs | $self->contig_to_ids; # spent 18µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::contig_to_ids, avg 2µs/call |
| 85 | |||||
| 86 | 8 | 2µs | my %overlapping_protein_ids; | ||
| 87 | |||||
| 88 | # Check whether the current feature is hypothetical and the next one has database annotation | ||||
| 89 | 8 | 45.1ms | 40008 | 51.7ms | for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ ) # spent 51.7ms making 40008 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
| 90 | { | ||||
| 91 | 40000 | 41.6ms | 40000 | 49.5ms | my $current_feature = $self->_genes_annotation->[$i]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
| 92 | 40000 | 48.4ms | 40000 | 49.5ms | my $next_feature = $self->_genes_annotation->[$i+1]; # spent 49.5ms making 40000 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_genes_annotation, avg 1µs/call |
| 93 | |||||
| 94 | 40000 | 24.6ms | next if($current_feature->{database_annotation_exists} == 1); | ||
| 95 | 4677 | 12.2ms | 4677 | 3.74ms | next unless($current_feature->{product} =~ /hypothetical/i); # spent 3.74ms making 4677 calls to Bio::Roary::ContigsToGeneIDsFromGFF::CORE:match, avg 800ns/call |
| 96 | 2596 | 1.26ms | next unless($next_feature->{database_annotation_exists} == 1); | ||
| 97 | |||||
| 98 | 1468 | 637µs | my $start_coord = $current_feature->{start} ; | ||
| 99 | 1468 | 336µs | my $end_coord = $current_feature->{end} ; | ||
| 100 | 1468 | 393µs | my $comparison_start_coord =$next_feature->{start} ; | ||
| 101 | 1468 | 288µs | my $comparison_end_coord =$next_feature->{end} ; | ||
| 102 | 1468 | 1.15ms | if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord ) | ||
| 103 | { | ||||
| 104 | 197 | 401µs | 197 | 1.03ms | my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord); # spent 1.03ms making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap, avg 5µs/call |
| 105 | 197 | 1.85ms | 197 | 517µs | if($percent_overlap >= $self->_min_nucleotide_overlap_percentage) # spent 517µs making 197 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_min_nucleotide_overlap_percentage, avg 3µs/call |
| 106 | { | ||||
| 107 | $overlapping_protein_ids{$current_feature->{id_name}}++; | ||||
| 108 | } | ||||
| 109 | } | ||||
| 110 | } | ||||
| 111 | |||||
| 112 | 8 | 110µs | return \%overlapping_protein_ids; | ||
| 113 | } | ||||
| 114 | |||||
| 115 | sub _percent_overlap | ||||
| 116 | # spent 1.03ms within Bio::Roary::ContigsToGeneIDsFromGFF::_percent_overlap which was called 197 times, avg 5µs/call:
# 197 times (1.03ms+0s) by Bio::Roary::ContigsToGeneIDsFromGFF::_build_overlapping_hypothetical_protein_ids at line 104, avg 5µs/call | ||||
| 117 | 197 | 192µs | my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_; | ||
| 118 | 197 | 54µs | my $size_of_hypothetical_gene = $end_coord - $start_coord; | ||
| 119 | |||||
| 120 | 197 | 36µs | my $lower_bound = $start_coord; | ||
| 121 | 197 | 59µs | if($comparison_start_coord > $start_coord) | ||
| 122 | { | ||||
| 123 | $lower_bound = $comparison_start_coord; | ||||
| 124 | } | ||||
| 125 | 197 | 39µs | my $upper_bound = $end_coord; | ||
| 126 | 197 | 31µs | if($comparison_end_coord < $end_coord ) | ||
| 127 | { | ||||
| 128 | $upper_bound = $comparison_end_coord; | ||||
| 129 | } | ||||
| 130 | 197 | 616µs | return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene; | ||
| 131 | } | ||||
| 132 | |||||
| 133 | |||||
| 134 | # spent 117µs (66+51) within Bio::Roary::ContigsToGeneIDsFromGFF::_build__awk_filter which was called 8 times, avg 15µs/call:
# 8 times (66µs+51µs) by Bio::Roary::ContigsToGeneIDsFromGFF::_awk_filter at line 12 of (eval 25)[Eval/Closure.pm:125], avg 15µs/call | ||||
| 135 | 8 | 4µs | my ($self) = @_; | ||
| 136 | return | ||||
| 137 | 8 | 63µs | 8 | 50µs | 'awk \'BEGIN {FS="\t"};{ if ($3 ~/' # spent 50µs making 8 calls to Bio::Roary::ContigsToGeneIDsFromGFF::_tags_to_filter, avg 6µs/call |
| 138 | . $self->_tags_to_filter | ||||
| 139 | . '/) print $1"\t"$4"\t"$5"\t"$9;}\' '; | ||||
| 140 | } | ||||
| 141 | |||||
| 142 | 2 | 47µs | 2 | 190µs | # spent 100µs (10+90) within Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142 which was called:
# once (10µs+90µs) by Bio::Roary::OrderGenes::BEGIN@21 at line 142 # spent 100µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::BEGIN@142
# spent 90µs making 1 call to Moose::unimport |
| 143 | 1 | 6µs | 2 | 6.68ms | __PACKAGE__->meta->make_immutable; # spent 6.67ms making 1 call to Class::MOP::Class::make_immutable
# spent 15µs making 1 call to Bio::Roary::ContigsToGeneIDsFromGFF::meta |
| 144 | |||||
| 145 | 1 | 36µs | 1; |