| Filename | /Users/ap13/perl5/lib/perl5/Bio/SeqUtils.pm |
| Statements | Executed 105 statements in 5.48ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 896µs | 1.12ms | Bio::SeqUtils::BEGIN@585 |
| 2 | 2 | 1 | 115µs | 132µs | Bio::SeqUtils::valid_aa |
| 1 | 1 | 1 | 18µs | 37µs | Bio::SeqUtils::BEGIN@189 |
| 1 | 1 | 1 | 14µs | 63µs | Bio::SeqUtils::BEGIN@191 |
| 27 | 1 | 1 | 12µs | 12µs | Bio::SeqUtils::CORE:match (opcode) |
| 1 | 1 | 1 | 11µs | 19µs | Bio::SeqUtils::BEGIN@190 |
| 1 | 1 | 1 | 10µs | 45µs | Bio::SeqUtils::BEGIN@192 |
| 1 | 1 | 1 | 6µs | 6µs | Bio::SeqUtils::CORE:sort (opcode) |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_coord_adjust |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_coord_adjust_deletion |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_coord_adjust_insertion |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_coord_revcom |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_feature_revcom |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_get_similarity |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_location_objects_from_coordinate_list |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_new_seq_from_old |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_new_seq_via_clone |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::_single_loc_object_from_collection |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::cat |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::delete |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::evolve |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::insert |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::ligate |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::mutate |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::revcom_with_features |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::seq3 |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::seq3in |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::translate_3frames |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::translate_6frames |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqUtils::trunc_with_features |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | # BioPerl module for Bio::SeqUtils | ||||
| 2 | # | ||||
| 3 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
| 4 | # | ||||
| 5 | # Cared for by Heikki Lehvaslaiho <heikki-at-bioperl-dot-org> | ||||
| 6 | # | ||||
| 7 | # Copyright Heikki Lehvaslaiho | ||||
| 8 | # | ||||
| 9 | # You may distribute this module under the same terms as perl itself | ||||
| 10 | |||||
| 11 | # POD documentation - main docs before the code | ||||
| 12 | |||||
| 13 | =head1 NAME | ||||
| 14 | |||||
| 15 | Bio::SeqUtils - Additional methods for PrimarySeq objects | ||||
| 16 | |||||
| 17 | =head1 SYNOPSIS | ||||
| 18 | |||||
| 19 | use Bio::SeqUtils; | ||||
| 20 | # get a Bio::PrimarySeqI compliant object, $seq, somehow | ||||
| 21 | $util = Bio::SeqUtils->new(); | ||||
| 22 | $polypeptide_3char = $util->seq3($seq); | ||||
| 23 | # or | ||||
| 24 | $polypeptide_3char = Bio::SeqUtils->seq3($seq); | ||||
| 25 | |||||
| 26 | # set the sequence string (stored in one char code in the object) | ||||
| 27 | Bio::SeqUtils->seq3($seq, $polypeptide_3char); | ||||
| 28 | |||||
| 29 | # translate a sequence in all six frames | ||||
| 30 | @seqs = Bio::SeqUtils->translate_6frames($seq); | ||||
| 31 | |||||
| 32 | # inplace editing of the sequence | ||||
| 33 | Bio::SeqUtils->mutate($seq, | ||||
| 34 | Bio::LiveSeq::Mutation->new(-seq => 'c', | ||||
| 35 | -pos => 3 | ||||
| 36 | )); | ||||
| 37 | # mutate a sequence to desired similarity% | ||||
| 38 | $newseq = Bio::SeqUtils-> evolve | ||||
| 39 | ($seq, $similarity, $transition_transversion_rate); | ||||
| 40 | |||||
| 41 | # concatenate two or more sequences with annotations and features, | ||||
| 42 | # the first sequence will be modified | ||||
| 43 | Bio::SeqUtils->cat(@seqs); | ||||
| 44 | my $catseq=$seqs[0]; | ||||
| 45 | |||||
| 46 | # truncate a sequence, retaining features and adjusting their | ||||
| 47 | # coordinates if necessary | ||||
| 48 | my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200); | ||||
| 49 | |||||
| 50 | # reverse complement a sequence and its features | ||||
| 51 | my $revcomseq = Bio::SeqUtils->revcom_with_features($seq); | ||||
| 52 | |||||
| 53 | # simulate cloning of a fragment into a vector. Cut the vector at | ||||
| 54 | # positions 1000 and 1100 (deleting postions 1001 to 1099) and | ||||
| 55 | # "ligate" a fragment into the sites. The fragment is | ||||
| 56 | # reverse-complemented in this example (option "flip"). | ||||
| 57 | # All features of the vector and fragment are preserved and | ||||
| 58 | # features that are affected by the deletion/insertion are | ||||
| 59 | # modified accordingly. | ||||
| 60 | # $vector and $fragment must be Bio::SeqI compliant objects | ||||
| 61 | my $new_molecule = Bio::Sequtils->ligate( | ||||
| 62 | -vector => $vector, | ||||
| 63 | -fragment => $fragment, | ||||
| 64 | -left => 1000, | ||||
| 65 | -right => 1100, | ||||
| 66 | -flip => 1 | ||||
| 67 | ); | ||||
| 68 | |||||
| 69 | # delete a segment of a sequence (from pos 1000 to 1100, inclusive), | ||||
| 70 | # again preserving features and annotations | ||||
| 71 | my $new_molecule = Bio::SeqUtils->cut( $seq, 1000, 1100 ); | ||||
| 72 | |||||
| 73 | # insert a fragment into a recipient between positions 1000 and | ||||
| 74 | # 1001. $recipient is a Bio::SeqI compliant object | ||||
| 75 | my $new_molecule = Bio::SeqUtils::PbrTools->insert( | ||||
| 76 | $recipient_seq, | ||||
| 77 | $fragment_seq, | ||||
| 78 | 1000 | ||||
| 79 | ); | ||||
| 80 | |||||
| 81 | =head1 DESCRIPTION | ||||
| 82 | |||||
| 83 | This class is a holder of methods that work on Bio::PrimarySeqI- | ||||
| 84 | compliant sequence objects, e.g. Bio::PrimarySeq and | ||||
| 85 | Bio::Seq. These methods are not part of the Bio::PrimarySeqI | ||||
| 86 | interface and should in general not be essential to the primary function | ||||
| 87 | of sequence objects. If you are thinking of adding essential | ||||
| 88 | functions, it might be better to create your own sequence class. | ||||
| 89 | See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more. | ||||
| 90 | |||||
| 91 | The methods take as their first argument a sequence object. It is | ||||
| 92 | possible to use methods without first creating a SeqUtils object, | ||||
| 93 | i.e. use it as an anonymous hash. | ||||
| 94 | |||||
| 95 | The first two methods, seq3() and seq3in(), give out or read in protein | ||||
| 96 | sequences coded in three letter IUPAC amino acid codes. | ||||
| 97 | |||||
| 98 | The next two methods, translate_3frames() and translate_6frames(), wrap | ||||
| 99 | around the standard translate method to give back an array of three | ||||
| 100 | forward or all six frame translations. | ||||
| 101 | |||||
| 102 | The mutate() method mutates the sequence string with a mutation | ||||
| 103 | description object. | ||||
| 104 | |||||
| 105 | The cat() method concatenates two or more sequences. The first sequence | ||||
| 106 | is modified by addition of the remaining sequences. All annotations and | ||||
| 107 | sequence features will be transferred. | ||||
| 108 | |||||
| 109 | The revcom_with_features() and trunc_with_features() methods are similar | ||||
| 110 | to the revcom() and trunc() methods from Bio::Seq, but also adjust any | ||||
| 111 | features associated with the sequence as appropriate. | ||||
| 112 | |||||
| 113 | There are also methods that simulate molecular cloning with rich | ||||
| 114 | sequence objects. | ||||
| 115 | The delete() method cuts a segment out of a sequence and re-joins the | ||||
| 116 | left and right fragments (like splicing or digesting and re-ligating a | ||||
| 117 | molecule). Positions (and types) of sequence features are adjusted | ||||
| 118 | accordingly: | ||||
| 119 | Features that span the deleted segment are converted to split featuress | ||||
| 120 | to indicate the disruption. (Sub)Features that extend into the deleted | ||||
| 121 | segment are truncated. | ||||
| 122 | A new molecule is created and returned. | ||||
| 123 | |||||
| 124 | The insert() method inserts a fragment (which can be a rich Bio::Seq | ||||
| 125 | object) into another sequence object adding all annotations and | ||||
| 126 | features to the final product. | ||||
| 127 | Features that span the insertion site are converted to split features | ||||
| 128 | to indicate the disruption. | ||||
| 129 | A new feature is added to indicate the inserted fragment itself. | ||||
| 130 | A new molecule is created and returned. | ||||
| 131 | |||||
| 132 | The ligate() method simulates digesting a recipient (vector) and | ||||
| 133 | ligating a fragment into it, which can also be flipped if needed. It | ||||
| 134 | is simply a combination of a deletion and an insertion step and | ||||
| 135 | returns a new molecule. The rules for modifying feature locations | ||||
| 136 | outlined above are also used here, e.g. features that span the cut | ||||
| 137 | sites are converted to split features with truncated sub-locations. | ||||
| 138 | |||||
| 139 | |||||
| 140 | =head1 FEEDBACK | ||||
| 141 | |||||
| 142 | =head2 Mailing Lists | ||||
| 143 | |||||
| 144 | User feedback is an integral part of the evolution of this and other | ||||
| 145 | Bioperl modules. Send your comments and suggestions preferably to one | ||||
| 146 | of the Bioperl mailing lists. Your participation is much appreciated. | ||||
| 147 | |||||
| 148 | bioperl-l@bioperl.org - General discussion | ||||
| 149 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
| 150 | |||||
| 151 | =head2 Support | ||||
| 152 | |||||
| 153 | Please direct usage questions or support issues to the mailing list: | ||||
| 154 | |||||
| 155 | I<bioperl-l@bioperl.org> | ||||
| 156 | |||||
| 157 | rather than to the module maintainer directly. Many experienced and | ||||
| 158 | reponsive experts will be able look at the problem and quickly | ||||
| 159 | address it. Please include a thorough description of the problem | ||||
| 160 | with code and data examples if at all possible. | ||||
| 161 | |||||
| 162 | =head2 Reporting Bugs | ||||
| 163 | |||||
| 164 | Report bugs to the Bioperl bug tracking system to help us keep track | ||||
| 165 | the bugs and their resolution. Bug reports can be submitted via the | ||||
| 166 | web: | ||||
| 167 | |||||
| 168 | https://github.com/bioperl/bioperl-live/issues | ||||
| 169 | |||||
| 170 | =head1 AUTHOR - Heikki Lehvaslaiho | ||||
| 171 | |||||
| 172 | Email: heikki-at-bioperl-dot-org | ||||
| 173 | |||||
| 174 | =head1 CONTRIBUTORS | ||||
| 175 | |||||
| 176 | Roy R. Chaudhuri - roy.chaudhuri at gmail.com | ||||
| 177 | Frank Schwach - frank.schwach@sanger.ac.uk | ||||
| 178 | |||||
| 179 | =head1 APPENDIX | ||||
| 180 | |||||
| 181 | The rest of the documentation details each of the object | ||||
| 182 | methods. Internal methods are usually preceded with a _ | ||||
| 183 | |||||
| 184 | =cut | ||||
| 185 | |||||
| 186 | # Let the code begin... | ||||
| 187 | |||||
| 188 | package Bio::SeqUtils; | ||||
| 189 | 2 | 31µs | 2 | 55µs | # spent 37µs (18+19) within Bio::SeqUtils::BEGIN@189 which was called:
# once (18µs+19µs) by Bio::Tools::CodonTable::BEGIN@191 at line 189 # spent 37µs making 1 call to Bio::SeqUtils::BEGIN@189
# spent 19µs making 1 call to strict::import |
| 190 | 2 | 33µs | 2 | 27µs | # spent 19µs (11+8) within Bio::SeqUtils::BEGIN@190 which was called:
# once (11µs+8µs) by Bio::Tools::CodonTable::BEGIN@191 at line 190 # spent 19µs making 1 call to Bio::SeqUtils::BEGIN@190
# spent 8µs making 1 call to warnings::import |
| 191 | 2 | 31µs | 2 | 112µs | # spent 63µs (14+49) within Bio::SeqUtils::BEGIN@191 which was called:
# once (14µs+49µs) by Bio::Tools::CodonTable::BEGIN@191 at line 191 # spent 63µs making 1 call to Bio::SeqUtils::BEGIN@191
# spent 49µs making 1 call to Exporter::import |
| 192 | 2 | 1.25ms | 2 | 79µs | # spent 45µs (10+34) within Bio::SeqUtils::BEGIN@192 which was called:
# once (10µs+34µs) by Bio::Tools::CodonTable::BEGIN@191 at line 192 # spent 45µs making 1 call to Bio::SeqUtils::BEGIN@192
# spent 34µs making 1 call to parent::import |
| 193 | |||||
| 194 | # new inherited from RootI | ||||
| 195 | |||||
| 196 | 1 | 17µs | our %ONECODE = ( | ||
| 197 | 'Ala' => 'A', | ||||
| 198 | 'Asx' => 'B', | ||||
| 199 | 'Cys' => 'C', | ||||
| 200 | 'Asp' => 'D', | ||||
| 201 | 'Glu' => 'E', | ||||
| 202 | 'Phe' => 'F', | ||||
| 203 | 'Gly' => 'G', | ||||
| 204 | 'His' => 'H', | ||||
| 205 | 'Ile' => 'I', | ||||
| 206 | 'Lys' => 'K', | ||||
| 207 | 'Leu' => 'L', | ||||
| 208 | 'Met' => 'M', | ||||
| 209 | 'Asn' => 'N', | ||||
| 210 | 'Pro' => 'P', | ||||
| 211 | 'Gln' => 'Q', | ||||
| 212 | 'Arg' => 'R', | ||||
| 213 | 'Ser' => 'S', | ||||
| 214 | 'Thr' => 'T', | ||||
| 215 | 'Val' => 'V', | ||||
| 216 | 'Trp' => 'W', | ||||
| 217 | 'Xaa' => 'X', | ||||
| 218 | 'Tyr' => 'Y', | ||||
| 219 | 'Glx' => 'Z', | ||||
| 220 | 'Ter' => '*', | ||||
| 221 | 'Sec' => 'U', | ||||
| 222 | 'Pyl' => 'O', | ||||
| 223 | 'Xle' => 'J' | ||||
| 224 | ); | ||||
| 225 | |||||
| 226 | 1 | 10µs | our %THREECODE = ( | ||
| 227 | 'A' => 'Ala', | ||||
| 228 | 'B' => 'Asx', | ||||
| 229 | 'C' => 'Cys', | ||||
| 230 | 'D' => 'Asp', | ||||
| 231 | 'E' => 'Glu', | ||||
| 232 | 'F' => 'Phe', | ||||
| 233 | 'G' => 'Gly', | ||||
| 234 | 'H' => 'His', | ||||
| 235 | 'I' => 'Ile', | ||||
| 236 | 'K' => 'Lys', | ||||
| 237 | 'L' => 'Leu', | ||||
| 238 | 'M' => 'Met', | ||||
| 239 | 'N' => 'Asn', | ||||
| 240 | 'P' => 'Pro', | ||||
| 241 | 'Q' => 'Gln', | ||||
| 242 | 'R' => 'Arg', | ||||
| 243 | 'S' => 'Ser', | ||||
| 244 | 'T' => 'Thr', | ||||
| 245 | 'V' => 'Val', | ||||
| 246 | 'W' => 'Trp', | ||||
| 247 | 'Y' => 'Tyr', | ||||
| 248 | 'Z' => 'Glx', | ||||
| 249 | 'X' => 'Xaa', | ||||
| 250 | '*' => 'Ter', | ||||
| 251 | 'U' => 'Sec', | ||||
| 252 | 'O' => 'Pyl', | ||||
| 253 | 'J' => 'Xle' | ||||
| 254 | ); | ||||
| 255 | |||||
| 256 | =head2 seq3 | ||||
| 257 | |||||
| 258 | Title : seq3 | ||||
| 259 | Usage : $string = Bio::SeqUtils->seq3($seq) | ||||
| 260 | Function: Read only method that returns the amino acid sequence as a | ||||
| 261 | string of three letter codes. alphabet has to be | ||||
| 262 | 'protein'. Output follows the IUPAC standard plus 'Ter' for | ||||
| 263 | terminator. Any unknown character, including the default | ||||
| 264 | unknown character 'X', is changed into 'Xaa'. A noncoded | ||||
| 265 | aminoacid selenocystein is recognized (Sec, U). | ||||
| 266 | |||||
| 267 | Returns : A scalar | ||||
| 268 | Args : character used for stop in the protein sequence optional, | ||||
| 269 | defaults to '*' string used to separate the output amino | ||||
| 270 | acid codes, optional, defaults to '' | ||||
| 271 | |||||
| 272 | =cut | ||||
| 273 | |||||
| 274 | sub seq3 { | ||||
| 275 | my ( $self, $seq, $stop, $sep ) = @_; | ||||
| 276 | |||||
| 277 | $seq->isa('Bio::PrimarySeqI') | ||||
| 278 | || $self->throw('Not a Bio::PrimarySeqI object but [$self]'); | ||||
| 279 | $seq->alphabet eq 'protein' | ||||
| 280 | || $self->throw('Not a protein sequence'); | ||||
| 281 | |||||
| 282 | if ( defined $stop ) { | ||||
| 283 | length $stop != 1 | ||||
| 284 | and $self->throw('One character stop needed, not [$stop]'); | ||||
| 285 | $THREECODE{$stop} = "Ter"; | ||||
| 286 | } | ||||
| 287 | $sep ||= ''; | ||||
| 288 | |||||
| 289 | my $aa3s; | ||||
| 290 | foreach my $aa ( split //, uc $seq->seq ) { | ||||
| 291 | $THREECODE{$aa} and $aa3s .= $THREECODE{$aa} . $sep, next; | ||||
| 292 | $aa3s .= 'Xaa' . $sep; | ||||
| 293 | } | ||||
| 294 | $sep and substr( $aa3s, -( length $sep ), length $sep ) = ''; | ||||
| 295 | return $aa3s; | ||||
| 296 | } | ||||
| 297 | |||||
| 298 | =head2 seq3in | ||||
| 299 | |||||
| 300 | Title : seq3in | ||||
| 301 | Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer') | ||||
| 302 | Function: Method for changing of the sequence of a | ||||
| 303 | Bio::PrimarySeqI sequence object. The three letter amino | ||||
| 304 | acid input string is converted into one letter code. Any | ||||
| 305 | unknown character triplet, including the default 'Xaa', is | ||||
| 306 | converted into 'X'. | ||||
| 307 | |||||
| 308 | Returns : Bio::PrimarySeq object | ||||
| 309 | Args : sequence string | ||||
| 310 | optional character to be used for stop in the protein sequence, | ||||
| 311 | defaults to '*' | ||||
| 312 | optional character to be used for unknown in the protein sequence, | ||||
| 313 | defaults to 'X' | ||||
| 314 | |||||
| 315 | =cut | ||||
| 316 | |||||
| 317 | sub seq3in { | ||||
| 318 | my ( $self, $seq, $string, $stop, $unknown ) = @_; | ||||
| 319 | |||||
| 320 | $seq->isa('Bio::PrimarySeqI') | ||||
| 321 | || $self->throw("Not a Bio::PrimarySeqI object but [$self]"); | ||||
| 322 | $seq->alphabet eq 'protein' | ||||
| 323 | || $self->throw('Not a protein sequence'); | ||||
| 324 | |||||
| 325 | if ( defined $stop ) { | ||||
| 326 | length $stop != 1 | ||||
| 327 | and $self->throw("One character stop needed, not [$stop]"); | ||||
| 328 | $ONECODE{'Ter'} = $stop; | ||||
| 329 | } | ||||
| 330 | if ( defined $unknown ) { | ||||
| 331 | length $unknown != 1 | ||||
| 332 | and $self->throw("One character stop needed, not [$unknown]"); | ||||
| 333 | $ONECODE{'Xaa'} = $unknown; | ||||
| 334 | } | ||||
| 335 | |||||
| 336 | my ( $aas, $aa3 ); | ||||
| 337 | my $length = ( length $string ) - 2; | ||||
| 338 | for ( my $i = 0 ; $i < $length ; $i += 3 ) { | ||||
| 339 | $aa3 = substr( $string, $i, 3 ); | ||||
| 340 | $aa3 = ucfirst( lc($aa3) ); | ||||
| 341 | $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next; | ||||
| 342 | $aas .= $ONECODE{'Xaa'}; | ||||
| 343 | } | ||||
| 344 | $seq->seq($aas); | ||||
| 345 | return $seq; | ||||
| 346 | } | ||||
| 347 | |||||
| 348 | =head2 translate_3frames | ||||
| 349 | |||||
| 350 | Title : translate_3frames | ||||
| 351 | Usage : @prots = Bio::SeqUtils->translate_3frames($seq) | ||||
| 352 | Function: Translate a nucleotide sequence in three forward frames. | ||||
| 353 | The IDs of the sequences are appended with '-0F', '-1F', '-2F'. | ||||
| 354 | Returns : An array of seq objects | ||||
| 355 | Args : sequence object | ||||
| 356 | same arguments as to Bio::PrimarySeqI::translate | ||||
| 357 | |||||
| 358 | =cut | ||||
| 359 | |||||
| 360 | sub translate_3frames { | ||||
| 361 | my ( $self, $seq, @args ) = @_; | ||||
| 362 | |||||
| 363 | $self->throw( 'Object [$seq] ' | ||||
| 364 | . 'of class [' | ||||
| 365 | . ref($seq) | ||||
| 366 | . '] can not be translated.' ) | ||||
| 367 | unless $seq->can('translate'); | ||||
| 368 | |||||
| 369 | my ( $stop, $unknown, $frame, $tableid, $fullCDS, $throw ) = @args; | ||||
| 370 | my @seqs; | ||||
| 371 | my $f = 0; | ||||
| 372 | while ( $f != 3 ) { | ||||
| 373 | my $translation = | ||||
| 374 | $seq->translate( $stop, $unknown, $f, $tableid, $fullCDS, $throw ); | ||||
| 375 | $translation->id( $seq->id . "-" . $f . "F" ); | ||||
| 376 | push @seqs, $translation; | ||||
| 377 | $f++; | ||||
| 378 | } | ||||
| 379 | |||||
| 380 | return @seqs; | ||||
| 381 | } | ||||
| 382 | |||||
| 383 | =head2 translate_6frames | ||||
| 384 | |||||
| 385 | Title : translate_6frames | ||||
| 386 | Usage : @prots = Bio::SeqUtils->translate_6frames($seq) | ||||
| 387 | Function: translate a nucleotide sequence in all six frames | ||||
| 388 | The IDs of the sequences are appended with '-0F', '-1F', '-2F', | ||||
| 389 | '-0R', '-1R', '-2R'. | ||||
| 390 | Returns : An array of seq objects | ||||
| 391 | Args : sequence object | ||||
| 392 | same arguments as to Bio::PrimarySeqI::translate | ||||
| 393 | |||||
| 394 | =cut | ||||
| 395 | |||||
| 396 | sub translate_6frames { | ||||
| 397 | my ( $self, $seq, @args ) = @_; | ||||
| 398 | |||||
| 399 | my @seqs = $self->translate_3frames( $seq, @args ); | ||||
| 400 | my @seqs2 = $self->translate_3frames( $seq->revcom, @args ); | ||||
| 401 | foreach my $seq2 (@seqs2) { | ||||
| 402 | my ($tmp) = $seq2->id; | ||||
| 403 | $tmp =~ s/F$/R/g; | ||||
| 404 | $seq2->id($tmp); | ||||
| 405 | } | ||||
| 406 | return @seqs, @seqs2; | ||||
| 407 | } | ||||
| 408 | |||||
| 409 | =head2 valid_aa | ||||
| 410 | |||||
| 411 | Title : valid_aa | ||||
| 412 | Usage : my @aa = $table->valid_aa | ||||
| 413 | Function: Retrieves a list of the valid amino acid codes. | ||||
| 414 | The list is ordered so that first 21 codes are for unique | ||||
| 415 | amino acids. The rest are ['B', 'Z', 'X', '*']. | ||||
| 416 | Returns : array of all the valid amino acid codes | ||||
| 417 | Args : [optional] $code => [0 -> return list of 1 letter aa codes, | ||||
| 418 | 1 -> return list of 3 letter aa codes, | ||||
| 419 | 2 -> return associative array of both ] | ||||
| 420 | |||||
| 421 | =cut | ||||
| 422 | |||||
| 423 | # spent 132µs (115+17) within Bio::SeqUtils::valid_aa which was called 2 times, avg 66µs/call:
# once (64µs+18µs) by Bio::Tools::CodonTable::BEGIN@198 at line 296 of Bio/Tools/CodonTable.pm
# once (51µs+0s) by Bio::Tools::CodonTable::BEGIN@198 at line 295 of Bio/Tools/CodonTable.pm | ||||
| 424 | 2 | 2µs | my ( $self, $code ) = @_; | ||
| 425 | |||||
| 426 | 2 | 1µs | if ( !$code ) { | ||
| 427 | 1 | 200ns | my @codes; | ||
| 428 | 1 | 14µs | 1 | 6µs | foreach my $c ( sort values %ONECODE ) { # spent 6µs making 1 call to Bio::SeqUtils::CORE:sort |
| 429 | 27 | 54µs | 27 | 12µs | push @codes, $c unless ( $c =~ /[BZX\*]/ ); # spent 12µs making 27 calls to Bio::SeqUtils::CORE:match, avg 437ns/call |
| 430 | } | ||||
| 431 | 1 | 1µs | push @codes, qw(B Z X *); # so they are in correct order ? | ||
| 432 | 1 | 9µs | return @codes; | ||
| 433 | } | ||||
| 434 | elsif ( $code == 1 ) { | ||||
| 435 | my @codes; | ||||
| 436 | foreach my $c ( sort keys %ONECODE ) { | ||||
| 437 | push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ ); | ||||
| 438 | } | ||||
| 439 | push @codes, ( 'Asx', 'Glx', 'Xaa', 'Ter' ); | ||||
| 440 | return @codes; | ||||
| 441 | } | ||||
| 442 | elsif ( $code == 2 ) { | ||||
| 443 | 1 | 9µs | my %codes = %ONECODE; | ||
| 444 | 1 | 3µs | foreach my $c ( keys %ONECODE ) { | ||
| 445 | 27 | 3µs | my $aa = $ONECODE{$c}; | ||
| 446 | 27 | 6µs | $codes{$aa} = $c; | ||
| 447 | } | ||||
| 448 | 1 | 21µs | return %codes; | ||
| 449 | } | ||||
| 450 | else { | ||||
| 451 | $self->warn( | ||||
| 452 | "unrecognized code in " . ref($self) . " method valid_aa()" ); | ||||
| 453 | return (); | ||||
| 454 | } | ||||
| 455 | } | ||||
| 456 | |||||
| 457 | =head2 mutate | ||||
| 458 | |||||
| 459 | Title : mutate | ||||
| 460 | Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2); | ||||
| 461 | Function: Inplace editing of the sequence. | ||||
| 462 | |||||
| 463 | The second argument can be a Bio::LiveSeq::Mutation object | ||||
| 464 | or an array of them. The mutations are applied sequentially | ||||
| 465 | checking only that their position is within the current | ||||
| 466 | sequence. Insertions are inserted before the given | ||||
| 467 | position. | ||||
| 468 | |||||
| 469 | Returns : boolean | ||||
| 470 | Args : sequence object | ||||
| 471 | mutation, a Bio::LiveSeq::Mutation object, or an array of them | ||||
| 472 | |||||
| 473 | See L<Bio::LiveSeq::Mutation>. | ||||
| 474 | |||||
| 475 | =cut | ||||
| 476 | |||||
| 477 | sub mutate { | ||||
| 478 | my ( $self, $seq, @mutations ) = @_; | ||||
| 479 | |||||
| 480 | $self->throw( 'Object [$seq] ' | ||||
| 481 | . 'of class [' | ||||
| 482 | . ref($seq) | ||||
| 483 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 484 | unless $seq->isa('Bio::PrimarySeqI'); | ||||
| 485 | $self->throw( 'Object [$mutations[0]] ' | ||||
| 486 | . 'of class [' | ||||
| 487 | . ref( $mutations[0] ) | ||||
| 488 | . '] should be a Bio::LiveSeq::Mutation' ) | ||||
| 489 | unless $mutations[0]->isa('Bio::LiveSeq::Mutation'); | ||||
| 490 | |||||
| 491 | foreach my $mutation (@mutations) { | ||||
| 492 | $self->throw('Attempting to mutate sequence beyond its length') | ||||
| 493 | unless $mutation->pos - 1 <= $seq->length; | ||||
| 494 | |||||
| 495 | my $string = $seq->seq; | ||||
| 496 | substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq; | ||||
| 497 | $seq->seq($string); | ||||
| 498 | } | ||||
| 499 | 1; | ||||
| 500 | } | ||||
| 501 | |||||
| 502 | =head2 cat | ||||
| 503 | |||||
| 504 | Title : cat | ||||
| 505 | Usage : Bio::SeqUtils->cat(@seqs); | ||||
| 506 | my $catseq=$seqs[0]; | ||||
| 507 | Function: Concatenates a list of Bio::Seq objects, adding them all on to the | ||||
| 508 | end of the first sequence. Annotations and sequence features are | ||||
| 509 | copied over from any additional objects, and the coordinates of any | ||||
| 510 | copied features are adjusted appropriately. | ||||
| 511 | Returns : a boolean | ||||
| 512 | Args : array of sequence objects | ||||
| 513 | |||||
| 514 | Note that annotations have no sequence locations. If you concatenate | ||||
| 515 | sequences with the same annotations they will all be added. | ||||
| 516 | |||||
| 517 | =cut | ||||
| 518 | |||||
| 519 | sub cat { | ||||
| 520 | my ( $self, $seq, @seqs ) = @_; | ||||
| 521 | $self->throw( 'Object [$seq] ' | ||||
| 522 | . 'of class [' | ||||
| 523 | . ref($seq) | ||||
| 524 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 525 | unless $seq->isa('Bio::PrimarySeqI'); | ||||
| 526 | |||||
| 527 | for my $catseq (@seqs) { | ||||
| 528 | $self->throw( 'Object [$catseq] ' | ||||
| 529 | . 'of class [' | ||||
| 530 | . ref($catseq) | ||||
| 531 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 532 | unless $catseq->isa('Bio::PrimarySeqI'); | ||||
| 533 | |||||
| 534 | $self->throw( | ||||
| 535 | 'Trying to concatenate sequences with different alphabets: ' | ||||
| 536 | . $seq->display_id . '(' | ||||
| 537 | . $seq->alphabet | ||||
| 538 | . ') and ' | ||||
| 539 | . $catseq->display_id . '(' | ||||
| 540 | . $catseq->alphabet | ||||
| 541 | . ')' ) | ||||
| 542 | unless $catseq->alphabet eq $seq->alphabet; | ||||
| 543 | |||||
| 544 | my $length = $seq->length; | ||||
| 545 | $seq->seq( $seq->seq . $catseq->seq ); | ||||
| 546 | |||||
| 547 | # move annotations | ||||
| 548 | if ( $seq->isa("Bio::AnnotatableI") | ||||
| 549 | and $catseq->isa("Bio::AnnotatableI") ) | ||||
| 550 | { | ||||
| 551 | foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) { | ||||
| 552 | |||||
| 553 | foreach my $value ( $catseq->annotation->get_Annotations($key) ) | ||||
| 554 | { | ||||
| 555 | $seq->annotation->add_Annotation( $key, $value ); | ||||
| 556 | } | ||||
| 557 | } | ||||
| 558 | } | ||||
| 559 | |||||
| 560 | # move SeqFeatures | ||||
| 561 | if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI') ) { | ||||
| 562 | for my $feat ( $catseq->get_SeqFeatures ) { | ||||
| 563 | $seq->add_SeqFeature( $self->_coord_adjust( $feat, $length ) ); | ||||
| 564 | } | ||||
| 565 | } | ||||
| 566 | |||||
| 567 | } | ||||
| 568 | 1; | ||||
| 569 | } | ||||
| 570 | |||||
| 571 | =head2 trunc_with_features | ||||
| 572 | |||||
| 573 | Title : trunc_with_features | ||||
| 574 | Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end); | ||||
| 575 | Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates | ||||
| 576 | where necessary. Features that partially overlap the region have | ||||
| 577 | their location changed to a Bio::Location::Fuzzy. | ||||
| 578 | Returns : A new sequence object | ||||
| 579 | Args : A sequence object, start coordinate, end coordinate (inclusive) | ||||
| 580 | |||||
| 581 | |||||
| 582 | =cut | ||||
| 583 | |||||
| 584 | sub trunc_with_features { | ||||
| 585 | 2 | 3.96ms | 1 | 1.12ms | # spent 1.12ms (896µs+220µs) within Bio::SeqUtils::BEGIN@585 which was called:
# once (896µs+220µs) by Bio::Tools::CodonTable::BEGIN@191 at line 585 # spent 1.12ms making 1 call to Bio::SeqUtils::BEGIN@585 |
| 586 | my ( $self, $seq, $start, $end ) = @_; | ||||
| 587 | $self->throw( 'Object [$seq] ' | ||||
| 588 | . 'of class [' | ||||
| 589 | . ref($seq) | ||||
| 590 | . '] should be a Bio::SeqI ' ) | ||||
| 591 | unless $seq->isa('Bio::SeqI'); | ||||
| 592 | my $trunc = $seq->trunc( $start, $end ); | ||||
| 593 | my $truncrange = | ||||
| 594 | Bio::Range->new( -start => $start, -end => $end, -strand => 0 ); | ||||
| 595 | |||||
| 596 | # make sure that there is no annotation or features in $trunc | ||||
| 597 | # (->trunc() now clone objects except for Bio::Seq::LargePrimarySeq) | ||||
| 598 | $trunc->annotation->remove_Annotations; | ||||
| 599 | $trunc->remove_SeqFeatures; | ||||
| 600 | |||||
| 601 | # move annotations | ||||
| 602 | foreach my $key ( $seq->annotation->get_all_annotation_keys() ) { | ||||
| 603 | foreach my $value ( $seq->annotation->get_Annotations($key) ) { | ||||
| 604 | $trunc->annotation->add_Annotation( $key, $value ); | ||||
| 605 | } | ||||
| 606 | } | ||||
| 607 | |||||
| 608 | # move features | ||||
| 609 | foreach ( | ||||
| 610 | grep { | ||||
| 611 | $_ = $self->_coord_adjust( $_, 1 - $start, $end + 1 - $start ) | ||||
| 612 | if $_->overlaps($truncrange) | ||||
| 613 | } $seq->get_SeqFeatures | ||||
| 614 | ) | ||||
| 615 | { | ||||
| 616 | $trunc->add_SeqFeature($_); | ||||
| 617 | } | ||||
| 618 | return $trunc; | ||||
| 619 | } | ||||
| 620 | |||||
| 621 | =head2 delete | ||||
| 622 | |||||
| 623 | Title : delete | ||||
| 624 | Function: cuts a segment out of a sequence and re-joins the left and right fragments | ||||
| 625 | (like splicing or digesting and re-ligating a molecule). | ||||
| 626 | Positions (and types) of sequence features are adjusted accordingly: | ||||
| 627 | Features that span the cut site are converted to split featuress to | ||||
| 628 | indicate the disruption. | ||||
| 629 | Features that extend into the cut-out fragment are truncated. | ||||
| 630 | A new molecule is created and returned. | ||||
| 631 | Usage : my $cutseq = Bio::SeqUtils::PbrTools->cut( $seq, 1000, 1100 ); | ||||
| 632 | Args : a Bio::PrimarySeqI compliant object to cut, | ||||
| 633 | first nt of the segment to be deleted | ||||
| 634 | last nt of the segment to be deleted | ||||
| 635 | optional: | ||||
| 636 | hash-ref of options: | ||||
| 637 | clone_obj: if true, clone the input sequence object rather | ||||
| 638 | than calling "new" on the object's class | ||||
| 639 | |||||
| 640 | Returns : a new Bio::Seq object | ||||
| 641 | |||||
| 642 | =cut | ||||
| 643 | |||||
| 644 | sub delete { | ||||
| 645 | my $self = shift; | ||||
| 646 | my ( $seq, $left, $right, $opts_ref ) = @_; | ||||
| 647 | $self->throw( 'was expecting 3-4 paramters but got ' . @_ ) | ||||
| 648 | unless @_ == 3 || @_ == 4; | ||||
| 649 | |||||
| 650 | $self->throw( | ||||
| 651 | 'Object of class [' . ref($seq) . '] should be a Bio::PrimarySeqI ' ) | ||||
| 652 | unless blessed($seq) && $seq->isa('Bio::PrimarySeqI'); | ||||
| 653 | |||||
| 654 | $self->throw("Left coordinate ($left) must be >= 1") if $left < 1; | ||||
| 655 | if ( $right > $seq->length ) { | ||||
| 656 | $self->throw( "Right coordinate ($right) must be less than " | ||||
| 657 | . 'sequence length (' | ||||
| 658 | . $seq->length | ||||
| 659 | . ')' ); | ||||
| 660 | } | ||||
| 661 | |||||
| 662 | # piece together the sequence string of the remaining fragments | ||||
| 663 | my $left_seq = $seq->subseq( 1, $left - 1 ); | ||||
| 664 | my $right_seq = $seq->subseq( $right + 1, $seq->length ); | ||||
| 665 | if ( !$left_seq || !$right_seq ) { | ||||
| 666 | $self->throw( | ||||
| 667 | 'could not assemble sequences. At least one of the fragments is empty' | ||||
| 668 | ); | ||||
| 669 | } | ||||
| 670 | my $seq_str = $left_seq . $right_seq; | ||||
| 671 | |||||
| 672 | # create the new seq object with the same class as the recipient | ||||
| 673 | # or (if requested), make a clone of the existing object. In the | ||||
| 674 | # latter case we need to remove sequence features from the cloned | ||||
| 675 | # object instead of copying them | ||||
| 676 | my $product; | ||||
| 677 | if ( $opts_ref->{clone_obj} ) { | ||||
| 678 | $product = $self->_new_seq_via_clone( $seq, $seq_str ); | ||||
| 679 | } | ||||
| 680 | else { | ||||
| 681 | $product = $self->_new_seq_from_old( $seq, { seq => $seq_str } ); | ||||
| 682 | } | ||||
| 683 | |||||
| 684 | # move sequence features | ||||
| 685 | if ( $product->isa('Bio::SeqI') && $seq->isa('Bio::SeqI') ) { | ||||
| 686 | for my $feat ( $seq->get_SeqFeatures ) { | ||||
| 687 | my $adjfeat = $self->_coord_adjust_deletion( $feat, $left, $right ); | ||||
| 688 | $product->add_SeqFeature($adjfeat) if $adjfeat; | ||||
| 689 | } | ||||
| 690 | } | ||||
| 691 | |||||
| 692 | # add a feature to annotatde the deletion | ||||
| 693 | my $deletion_feature = Bio::SeqFeature::Generic->new( | ||||
| 694 | -primary_tag => 'misc_feature', | ||||
| 695 | -tag => { note => 'deletion of ' . ( $right - $left + 1 ) . 'bp' }, | ||||
| 696 | -location => Bio::Location::Simple->new( | ||||
| 697 | -start => $left - 1, | ||||
| 698 | -end => $left, | ||||
| 699 | -location_type => 'IN-BETWEEN' | ||||
| 700 | ) | ||||
| 701 | ); | ||||
| 702 | $product->add_SeqFeature($deletion_feature); | ||||
| 703 | return $product; | ||||
| 704 | } | ||||
| 705 | |||||
| 706 | =head2 insert | ||||
| 707 | |||||
| 708 | Title : insert | ||||
| 709 | Function: inserts a fragment (a Bio::Seq object) into a nother sequence object | ||||
| 710 | adding all annotations and features to the final product. | ||||
| 711 | Features that span the insertion site are converted to split | ||||
| 712 | features to indicate the disruption. | ||||
| 713 | A new feature is added to indicate the inserted fragment itself. | ||||
| 714 | A new molecule is created and returned. | ||||
| 715 | Usage : # insert a fragment after pos 1000 | ||||
| 716 | my $insert_seq = Bio::SeqUtils::PbrTools->insert( | ||||
| 717 | $recipient_seq, | ||||
| 718 | $fragment_seq, | ||||
| 719 | 1000 | ||||
| 720 | ); | ||||
| 721 | Args : recipient sequence (a Bio::PrimarySeqI compliant object), | ||||
| 722 | a fragmetn to insert (Bio::PrimarySeqI compliant object), | ||||
| 723 | insertion position (fragment is inserted to the right of this pos) | ||||
| 724 | pos=0 will prepend the fragment to the recipient | ||||
| 725 | optional: | ||||
| 726 | hash-ref of options: | ||||
| 727 | clone_obj: if true, clone the input sequence object rather | ||||
| 728 | than calling "new" on the object's class | ||||
| 729 | Returns : a new Bio::Seq object | ||||
| 730 | |||||
| 731 | =cut | ||||
| 732 | |||||
| 733 | sub insert { | ||||
| 734 | my $self = shift; | ||||
| 735 | my ( $recipient, $fragment, $insert_pos, $opts_ref ) = @_; | ||||
| 736 | $self->throw( 'was expecting 3-4 paramters but got ' . @_ ) | ||||
| 737 | unless @_ == 3 || @_ == 4; | ||||
| 738 | |||||
| 739 | $self->throw( 'Recipient object of class [' | ||||
| 740 | . ref($recipient) | ||||
| 741 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 742 | unless blessed($recipient) && $recipient->isa('Bio::PrimarySeqI'); | ||||
| 743 | |||||
| 744 | $self->throw( 'Fragment object of class [' | ||||
| 745 | . ref($fragment) | ||||
| 746 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 747 | unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI'); | ||||
| 748 | |||||
| 749 | $self->throw( 'Can\'t concatenate sequences with different alphabets: ' | ||||
| 750 | . 'recipient is ' | ||||
| 751 | . $recipient->alphabet | ||||
| 752 | . ' and fragment is ' | ||||
| 753 | . $fragment->alphabet ) | ||||
| 754 | unless $recipient->alphabet eq $fragment->alphabet; | ||||
| 755 | |||||
| 756 | if ( $insert_pos < 0 or $insert_pos > $recipient->length ) { | ||||
| 757 | $self->throw( "insertion position ($insert_pos) must be between 0 and " | ||||
| 758 | . 'recipient sequence length (' | ||||
| 759 | . $recipient->length | ||||
| 760 | . ')' ); | ||||
| 761 | } | ||||
| 762 | |||||
| 763 | if ( $fragment->can('is_circular') && $fragment->is_circular ) { | ||||
| 764 | $self->throw('Can\'t insert circular fragments'); | ||||
| 765 | } | ||||
| 766 | |||||
| 767 | if ( !$recipient->seq ) { | ||||
| 768 | $self->throw( | ||||
| 769 | 'Recipient has no sequence, can not insert into this object'); | ||||
| 770 | } | ||||
| 771 | |||||
| 772 | # construct raw sequence of the new molecule | ||||
| 773 | my $left_seq = | ||||
| 774 | $insert_pos > 0 | ||||
| 775 | ? $recipient->subseq( 1, $insert_pos ) | ||||
| 776 | : ''; | ||||
| 777 | my $mid_seq = $fragment->seq; | ||||
| 778 | my $right_seq = | ||||
| 779 | $insert_pos < $recipient->length | ||||
| 780 | ? $recipient->subseq( $insert_pos + 1, $recipient->length ) | ||||
| 781 | : ''; | ||||
| 782 | my $seq_str = $left_seq . $mid_seq . $right_seq; | ||||
| 783 | |||||
| 784 | # create the new seq object with the same class as the recipient | ||||
| 785 | # or (if requested), make a clone of the existing object. In the | ||||
| 786 | # latter case we need to remove sequence features from the cloned | ||||
| 787 | # object instead of copying them | ||||
| 788 | my $product; | ||||
| 789 | if ( $opts_ref->{clone_obj} ) { | ||||
| 790 | $product = $self->_new_seq_via_clone( $recipient, $seq_str ); | ||||
| 791 | } | ||||
| 792 | else { | ||||
| 793 | my @desc; | ||||
| 794 | push @desc, 'Inserted fragment: ' . $fragment->desc | ||||
| 795 | if defined $fragment->desc; | ||||
| 796 | push @desc, 'Recipient: ' . $recipient->desc | ||||
| 797 | if defined $recipient->desc; | ||||
| 798 | $product = $self->_new_seq_from_old( | ||||
| 799 | $recipient, | ||||
| 800 | { | ||||
| 801 | seq => $seq_str, | ||||
| 802 | display_id => $recipient->display_id, | ||||
| 803 | accession_number => $recipient->accession_number || '', | ||||
| 804 | alphabet => $recipient->alphabet, | ||||
| 805 | desc => join( '; ', @desc ), | ||||
| 806 | verbose => $recipient->verbose || $fragment->verbose, | ||||
| 807 | is_circular => $recipient->is_circular || 0, | ||||
| 808 | } | ||||
| 809 | ); | ||||
| 810 | |||||
| 811 | } # if clone_obj | ||||
| 812 | |||||
| 813 | # move annotations from fragment to product | ||||
| 814 | if ( $product->isa("Bio::AnnotatableI") | ||||
| 815 | && $fragment->isa("Bio::AnnotatableI") ) | ||||
| 816 | { | ||||
| 817 | foreach my $key ( $fragment->annotation->get_all_annotation_keys ) { | ||||
| 818 | foreach my $value ( $fragment->annotation->get_Annotations($key) ) { | ||||
| 819 | $product->annotation->add_Annotation( $key, $value ); | ||||
| 820 | } | ||||
| 821 | } | ||||
| 822 | } | ||||
| 823 | |||||
| 824 | # move sequence features to product with adjusted coordinates | ||||
| 825 | if ( $product->isa('Bio::SeqI') ) { | ||||
| 826 | |||||
| 827 | # for the fragment, just shift the features to new position | ||||
| 828 | if ( $fragment->isa('Bio::SeqI') ) { | ||||
| 829 | for my $feat ( $fragment->get_SeqFeatures ) { | ||||
| 830 | my $adjfeat = $self->_coord_adjust( $feat, $insert_pos ); | ||||
| 831 | $product->add_SeqFeature($adjfeat) if $adjfeat; | ||||
| 832 | } | ||||
| 833 | } | ||||
| 834 | |||||
| 835 | # for recipient, shift and modify features according to insertion. | ||||
| 836 | if ( $recipient->isa('Bio::SeqI') ) { | ||||
| 837 | for my $feat ( $recipient->get_SeqFeatures ) { | ||||
| 838 | my $adjfeat = | ||||
| 839 | $self->_coord_adjust_insertion( $feat, $insert_pos, | ||||
| 840 | $fragment->length ); | ||||
| 841 | $product->add_SeqFeature($adjfeat) if $adjfeat; | ||||
| 842 | } | ||||
| 843 | } | ||||
| 844 | } | ||||
| 845 | |||||
| 846 | # add a feature to annotate the insertion | ||||
| 847 | my $insertion_feature = Bio::SeqFeature::Generic->new( | ||||
| 848 | -start => $insert_pos + 1, | ||||
| 849 | -end => $insert_pos + $fragment->length, | ||||
| 850 | -primary_tag => 'misc_feature', | ||||
| 851 | -tag => { note => 'inserted fragment' }, | ||||
| 852 | ); | ||||
| 853 | $product->add_SeqFeature($insertion_feature); | ||||
| 854 | |||||
| 855 | return $product; | ||||
| 856 | } | ||||
| 857 | |||||
| 858 | =head2 ligate | ||||
| 859 | |||||
| 860 | title : ligate | ||||
| 861 | function: pastes a fragment (which can also have features) into a recipient | ||||
| 862 | sequence between two "cut" sites, preserving features and adjusting | ||||
| 863 | their locations. | ||||
| 864 | This is a shortcut for deleting a segment from a sequence object followed | ||||
| 865 | by an insertion of a fragmnet and is supposed to be used to simulate | ||||
| 866 | in-vitro cloning where a recipient (a vector) is digested and a fragment | ||||
| 867 | is then ligated into the recipient molecule. The fragment can be flipped | ||||
| 868 | (reverse-complemented with all its features). | ||||
| 869 | A new sequence object is returned to represent the product of the reaction. | ||||
| 870 | Features and annotations are transferred from the insert to the product | ||||
| 871 | and features on the recipient are adjusted according to the methods | ||||
| 872 | L</"delete"> amd L</"insert">: | ||||
| 873 | Features spanning the insertion site will be split up into two sub-locations. | ||||
| 874 | (Sub-)features in the deleted region are themselves deleted. | ||||
| 875 | (Sub-)features that extend into the deleted region are truncated. | ||||
| 876 | The class of the product object depends on the class of the recipient (vector) | ||||
| 877 | sequence object. if it is not possible to instantiate a new | ||||
| 878 | object of that class, a Bio::Primaryseq object is created instead. | ||||
| 879 | usage : # insert the flipped fragment between positions 1000 and 1100 of the | ||||
| 880 | # vector, i.e. everything between these two positions is deleted and | ||||
| 881 | # replaced by the fragment | ||||
| 882 | my $new_molecule = Bio::Sequtils::Pbrtools->ligate( | ||||
| 883 | -recipient => $vector, | ||||
| 884 | -fragment => $fragment, | ||||
| 885 | -left => 1000, | ||||
| 886 | -right => 1100, | ||||
| 887 | -flip => 1, | ||||
| 888 | -clone_obj => 1 | ||||
| 889 | ); | ||||
| 890 | args : recipient: the recipient/vector molecule | ||||
| 891 | fragment: molecule that is to be ligated into the vector | ||||
| 892 | left: left cut site (fragment will be inserted to the right of | ||||
| 893 | this position) | ||||
| 894 | optional: | ||||
| 895 | right: right cut site (fragment will be inseterted to the | ||||
| 896 | left of this position). defaults to left+1 | ||||
| 897 | flip: boolean, if true, the fragment is reverse-complemented | ||||
| 898 | (including features) before inserting | ||||
| 899 | clone_obj: if true, clone the recipient object to create the product | ||||
| 900 | instead of calling "new" on its class | ||||
| 901 | returns : a new Bio::Seq object of the ligated fragments | ||||
| 902 | |||||
| 903 | =cut | ||||
| 904 | |||||
| 905 | sub ligate { | ||||
| 906 | my $self = shift; | ||||
| 907 | my ( $recipient, $fragment, $left, $right, $flip, $clone_obj ) = | ||||
| 908 | $self->_rearrange( [qw(RECIPIENT FRAGMENT LEFT RIGHT FLIP CLONE_OBJ )], | ||||
| 909 | @_ ); | ||||
| 910 | $self->throw("missing required parameter 'recipient'") unless $recipient; | ||||
| 911 | $self->throw("missing required parameter 'fragment'") unless $fragment; | ||||
| 912 | $self->throw("missing required parameter 'left'") unless defined $left; | ||||
| 913 | |||||
| 914 | $right ||= $left + 1; | ||||
| 915 | |||||
| 916 | $self->throw( | ||||
| 917 | "Fragment must be a Bio::PrimarySeqI compliant object but it is a " | ||||
| 918 | . ref($fragment) ) | ||||
| 919 | unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI'); | ||||
| 920 | |||||
| 921 | $fragment = $self->revcom_with_features($fragment) if $flip; | ||||
| 922 | |||||
| 923 | my $opts_ref = {}; | ||||
| 924 | $opts_ref->{clone_obj} = 1 if $clone_obj; | ||||
| 925 | |||||
| 926 | # clone in two steps: first delete between the insertion sites, | ||||
| 927 | # then insert the fragment. Step 1 is skipped if insert positions | ||||
| 928 | # are adjacent (no deletion) | ||||
| 929 | my ( $product1, $product2 ); | ||||
| 930 | eval { | ||||
| 931 | if ( $right == $left + 1 ) { | ||||
| 932 | $product1 = $recipient; | ||||
| 933 | } | ||||
| 934 | else { | ||||
| 935 | $product1 = | ||||
| 936 | $self->delete( $recipient, $left + 1, $right - 1, $opts_ref ); | ||||
| 937 | } | ||||
| 938 | }; | ||||
| 939 | $self->throw( "Failed in step 1 (cut recipient): " . $@ ) if $@; | ||||
| 940 | eval { $product2 = $self->insert( $product1, $fragment, $left, $opts_ref ) }; | ||||
| 941 | $self->throw( "Failed in step 2 (insert fragment): " . $@ ) if $@; | ||||
| 942 | |||||
| 943 | return $product2; | ||||
| 944 | |||||
| 945 | } | ||||
| 946 | |||||
| 947 | =head2 _coord_adjust_deletion | ||||
| 948 | |||||
| 949 | title : _coord_adjust_deletion | ||||
| 950 | function: recursively adjusts coordinates of seqfeatures on a molecule | ||||
| 951 | where a segment has been deleted. | ||||
| 952 | (sub)features that span the deletion site become split features. | ||||
| 953 | (sub)features that extend into the deletion site are truncated. | ||||
| 954 | A note is added to the feature to inform about the size and | ||||
| 955 | position of the deletion. | ||||
| 956 | usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_deletion( | ||||
| 957 | $feature, | ||||
| 958 | $start, | ||||
| 959 | $end | ||||
| 960 | ); | ||||
| 961 | args : a Bio::SeqFeatureI compliant object, | ||||
| 962 | start (inclusive) position of the deletion site, | ||||
| 963 | end (inclusive) position of the deletion site | ||||
| 964 | returns : a Bio::SeqFeatureI compliant object | ||||
| 965 | |||||
| 966 | =cut | ||||
| 967 | |||||
| 968 | sub _coord_adjust_deletion { | ||||
| 969 | my ( $self, $feat, $left, $right ) = @_; | ||||
| 970 | |||||
| 971 | $self->throw( 'object [$feat] ' | ||||
| 972 | . 'of class [' | ||||
| 973 | . ref($feat) | ||||
| 974 | . '] should be a Bio::SeqFeatureI ' ) | ||||
| 975 | unless $feat->isa('Bio::SeqFeatureI'); | ||||
| 976 | $self->throw('missing coordinates: need a left and a right position') | ||||
| 977 | unless defined $left && defined $right; | ||||
| 978 | |||||
| 979 | if ( $left > $right ) { | ||||
| 980 | if ( $feat->can('is_circular') && $feat->is_circular ) { | ||||
| 981 | |||||
| 982 | # todo handle circular molecules | ||||
| 983 | $self->throw( | ||||
| 984 | 'can not yet handle deletions in circular molecules if deletion spans origin' | ||||
| 985 | ); | ||||
| 986 | } | ||||
| 987 | else { | ||||
| 988 | $self->throw( | ||||
| 989 | "left coordinate ($left) must be less than right ($right)" | ||||
| 990 | . " but it was greater" ); | ||||
| 991 | } | ||||
| 992 | } | ||||
| 993 | my $deletion = Bio::Location::Simple->new( | ||||
| 994 | -start => $left, | ||||
| 995 | -end => $right, | ||||
| 996 | ); | ||||
| 997 | my $del_length = $right - $left + 1; | ||||
| 998 | |||||
| 999 | my @adjsubfeat; | ||||
| 1000 | for my $subfeat ( $feat->get_SeqFeatures ) { | ||||
| 1001 | my $adjsubfeat = | ||||
| 1002 | $self->_coord_adjust_deletion( $subfeat, $left, $right ); | ||||
| 1003 | push @adjsubfeat, $adjsubfeat if $adjsubfeat; | ||||
| 1004 | } | ||||
| 1005 | |||||
| 1006 | my @loc; | ||||
| 1007 | my $note; | ||||
| 1008 | for ( $feat->location->each_Location ) { | ||||
| 1009 | next if $deletion->contains($_); # this location will be deleted; | ||||
| 1010 | my $strand = $_->strand; | ||||
| 1011 | my $type = $_->location_type; | ||||
| 1012 | my $start = $_->start; | ||||
| 1013 | my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef; | ||||
| 1014 | my $end = $_->end; | ||||
| 1015 | my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef; | ||||
| 1016 | my @newcoords = (); | ||||
| 1017 | if ( $start < $deletion->start && $end > $deletion->end ) | ||||
| 1018 | { # split the feature | ||||
| 1019 | @newcoords = ( | ||||
| 1020 | [ $start, ( $deletion->start - 1 ), $start_type, $end_type ], | ||||
| 1021 | [ | ||||
| 1022 | ( $deletion->start ), $end - $del_length, | ||||
| 1023 | $start_type, $end_type | ||||
| 1024 | ] | ||||
| 1025 | ); | ||||
| 1026 | $note = | ||||
| 1027 | $del_length | ||||
| 1028 | . 'bp internal deletion between pos ' | ||||
| 1029 | . ( $deletion->start - 1 ) . ' and ' | ||||
| 1030 | . $deletion->start; | ||||
| 1031 | } | ||||
| 1032 | elsif ( $_->start < $deletion->start && $_->end >= $deletion->start ) | ||||
| 1033 | { # truncate feature end | ||||
| 1034 | @newcoords = | ||||
| 1035 | ( [ $start, ( $deletion->start - 1 ), $start_type, $end_type ] ); | ||||
| 1036 | $note = | ||||
| 1037 | ( $end - $deletion->start + 1 ) . 'bp deleted from feature '; | ||||
| 1038 | if ( $feat->strand ) { | ||||
| 1039 | $note .= $feat->strand == 1 ? "3' " : "5' "; | ||||
| 1040 | } | ||||
| 1041 | $note .= 'end'; | ||||
| 1042 | } | ||||
| 1043 | elsif ( $_->start <= $deletion->end && $_->end > $deletion->end ) | ||||
| 1044 | { # truncate feature start and shift end | ||||
| 1045 | @newcoords = ( | ||||
| 1046 | [ | ||||
| 1047 | ( $deletion->start ), $end - $del_length, | ||||
| 1048 | $start_type, $end_type | ||||
| 1049 | ] | ||||
| 1050 | ); | ||||
| 1051 | $note = | ||||
| 1052 | ( $deletion->end - $start + 1 ) . 'bp deleted from feature '; | ||||
| 1053 | if ( $feat->strand ) { | ||||
| 1054 | $note .= $feat->strand == 1 ? "5' end" : "3' end"; | ||||
| 1055 | } | ||||
| 1056 | else { | ||||
| 1057 | $note .= 'start'; | ||||
| 1058 | } | ||||
| 1059 | } | ||||
| 1060 | elsif ( $start >= $deletion->end ) { # just shift entire location | ||||
| 1061 | @newcoords = ( | ||||
| 1062 | [ | ||||
| 1063 | $start - $del_length, $end - $del_length, | ||||
| 1064 | $start_type, $end_type | ||||
| 1065 | ] | ||||
| 1066 | ); | ||||
| 1067 | } | ||||
| 1068 | else { # not affected by deletion | ||||
| 1069 | @newcoords = ( [ $start, $end, $start_type, $end_type ] ); | ||||
| 1070 | } | ||||
| 1071 | |||||
| 1072 | # if we have no coordinates, we return nothing | ||||
| 1073 | # the feature is deleted | ||||
| 1074 | return unless @newcoords; | ||||
| 1075 | |||||
| 1076 | my @subloc = | ||||
| 1077 | $self->_location_objects_from_coordinate_list( \@newcoords, $strand, | ||||
| 1078 | $type ); | ||||
| 1079 | push @loc, $self->_single_loc_object_from_collection(@subloc); | ||||
| 1080 | } # each location | ||||
| 1081 | |||||
| 1082 | # create new feature based on original one and move annotation across | ||||
| 1083 | my $newfeat = | ||||
| 1084 | Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag ); | ||||
| 1085 | foreach my $key ( $feat->annotation->get_all_annotation_keys() ) { | ||||
| 1086 | foreach my $value ( $feat->annotation->get_Annotations($key) ) { | ||||
| 1087 | $newfeat->annotation->add_Annotation( $key, $value ); | ||||
| 1088 | } | ||||
| 1089 | } | ||||
| 1090 | foreach my $key ( $feat->get_all_tags() ) { | ||||
| 1091 | $newfeat->add_tag_value( $key, $feat->get_tag_values($key) ); | ||||
| 1092 | } | ||||
| 1093 | |||||
| 1094 | # If we have a note about the deleted bases, add it | ||||
| 1095 | if ($note) { | ||||
| 1096 | $newfeat->add_tag_value( 'note', $note ); | ||||
| 1097 | } | ||||
| 1098 | |||||
| 1099 | # set modified location(s) for the new feature and | ||||
| 1100 | # add its subfeatures if any | ||||
| 1101 | my $loc = $self->_single_loc_object_from_collection(@loc); | ||||
| 1102 | $loc ? $newfeat->location($loc) : return; | ||||
| 1103 | $newfeat->add_SeqFeature($_) for @adjsubfeat; | ||||
| 1104 | |||||
| 1105 | return $newfeat; | ||||
| 1106 | |||||
| 1107 | } | ||||
| 1108 | |||||
| 1109 | =head2 _coord_adjust_insertion | ||||
| 1110 | |||||
| 1111 | title : _coord_adjust_insertion | ||||
| 1112 | function: recursively adjusts coordinates of seqfeatures on a molecule | ||||
| 1113 | where another sequence has been inserted. | ||||
| 1114 | (sub)features that span the insertion site become split features | ||||
| 1115 | and a note is added about the size and positin of the insertion. | ||||
| 1116 | Features with an IN-BETWEEN location at the insertion site | ||||
| 1117 | are lost (such features can only exist between adjacent bases) | ||||
| 1118 | usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_insertion( | ||||
| 1119 | $feature, | ||||
| 1120 | $insert_pos, | ||||
| 1121 | $insert_length | ||||
| 1122 | ); | ||||
| 1123 | args : a Bio::SeqFeatureI compliant object, | ||||
| 1124 | insertion position (insert to the right of this position) | ||||
| 1125 | length of inserted fragment | ||||
| 1126 | returns : a Bio::SeqFeatureI compliant object | ||||
| 1127 | |||||
| 1128 | =cut | ||||
| 1129 | |||||
| 1130 | sub _coord_adjust_insertion { | ||||
| 1131 | my ( $self, $feat, $insert_pos, $insert_len ) = @_; | ||||
| 1132 | |||||
| 1133 | $self->throw( 'object [$feat] ' | ||||
| 1134 | . 'of class [' | ||||
| 1135 | . ref($feat) | ||||
| 1136 | . '] should be a Bio::SeqFeatureI ' ) | ||||
| 1137 | unless $feat->isa('Bio::SeqFeatureI'); | ||||
| 1138 | $self->throw('missing insert position') unless defined $insert_pos; | ||||
| 1139 | $self->throw('missing insert length') unless defined $insert_len; | ||||
| 1140 | |||||
| 1141 | my @adjsubfeat; | ||||
| 1142 | for my $subfeat ( $feat->get_SeqFeatures ) { | ||||
| 1143 | push @adjsubfeat, | ||||
| 1144 | $self->_coord_adjust_insertion( $subfeat, $insert_pos, $insert_len ); | ||||
| 1145 | } | ||||
| 1146 | |||||
| 1147 | my @loc; | ||||
| 1148 | my $note; | ||||
| 1149 | for ( $feat->location->each_Location ) { | ||||
| 1150 | |||||
| 1151 | # loose IN-BETWEEN features at the insertion site | ||||
| 1152 | next | ||||
| 1153 | if ( $_->location_type eq 'IN-BETWEEN' && $_->start == $insert_pos ); | ||||
| 1154 | my $strand = $_->strand; | ||||
| 1155 | my $type = $_->location_type; | ||||
| 1156 | my $start = $_->start; | ||||
| 1157 | my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef; | ||||
| 1158 | my $end = $_->end; | ||||
| 1159 | my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef; | ||||
| 1160 | my @newcoords = (); | ||||
| 1161 | if ( $start <= $insert_pos && $end > $insert_pos ) { # split the feature | ||||
| 1162 | @newcoords = ( | ||||
| 1163 | [ $start, $insert_pos, $start_type, $end_type ], | ||||
| 1164 | [ | ||||
| 1165 | ( $insert_pos + 1 + $insert_len ), $end + $insert_len, | ||||
| 1166 | $start_type, $end_type | ||||
| 1167 | ] | ||||
| 1168 | ); | ||||
| 1169 | $note = | ||||
| 1170 | $insert_len | ||||
| 1171 | . 'bp internal insertion between pos ' | ||||
| 1172 | . $insert_pos . ' and ' | ||||
| 1173 | . ( $insert_pos + $insert_len + 1 ); | ||||
| 1174 | |||||
| 1175 | } | ||||
| 1176 | elsif ( $start > $insert_pos ) { # just shift entire location | ||||
| 1177 | @newcoords = ( | ||||
| 1178 | [ | ||||
| 1179 | $start + $insert_len, $end + $insert_len, | ||||
| 1180 | $start_type, $end_type | ||||
| 1181 | ] | ||||
| 1182 | ); | ||||
| 1183 | } | ||||
| 1184 | else { # not affected | ||||
| 1185 | @newcoords = ( [ $start, $end, $start_type, $end_type ] ); | ||||
| 1186 | } | ||||
| 1187 | |||||
| 1188 | # if we have deleted all coordinates, return nothing | ||||
| 1189 | # (possible if all locations are IN-BETWEEN) | ||||
| 1190 | return unless @newcoords; | ||||
| 1191 | |||||
| 1192 | my @subloc = | ||||
| 1193 | $self->_location_objects_from_coordinate_list( \@newcoords, $strand, | ||||
| 1194 | $type ); | ||||
| 1195 | |||||
| 1196 | # put together final location which could be a split now | ||||
| 1197 | push @loc, $self->_single_loc_object_from_collection(@subloc); | ||||
| 1198 | } # each location | ||||
| 1199 | |||||
| 1200 | # create new feature based on original one and move annotation across | ||||
| 1201 | my $newfeat = | ||||
| 1202 | Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag ); | ||||
| 1203 | foreach my $key ( $feat->annotation->get_all_annotation_keys() ) { | ||||
| 1204 | foreach my $value ( $feat->annotation->get_Annotations($key) ) { | ||||
| 1205 | $newfeat->annotation->add_Annotation( $key, $value ); | ||||
| 1206 | } | ||||
| 1207 | } | ||||
| 1208 | foreach my $key ( $feat->get_all_tags() ) { | ||||
| 1209 | $newfeat->add_tag_value( $key, $feat->get_tag_values($key) ); | ||||
| 1210 | } | ||||
| 1211 | |||||
| 1212 | # If we have a note about the inserted bases, add it | ||||
| 1213 | if ($note) { | ||||
| 1214 | $newfeat->add_tag_value( 'note', $note ); | ||||
| 1215 | } | ||||
| 1216 | |||||
| 1217 | # set modified location(s) for the new feature and | ||||
| 1218 | # add its subfeatures if any | ||||
| 1219 | my $loc = $self->_single_loc_object_from_collection(@loc); | ||||
| 1220 | $loc ? $newfeat->location($loc) : return; | ||||
| 1221 | $newfeat->add_SeqFeature($_) for @adjsubfeat; | ||||
| 1222 | |||||
| 1223 | return $newfeat; | ||||
| 1224 | |||||
| 1225 | } | ||||
| 1226 | |||||
| 1227 | =head2 _single_loc_object_from_collection | ||||
| 1228 | |||||
| 1229 | Title : _single_loc_object_from_collection | ||||
| 1230 | Function: takes an array of location objects. Returns either a split | ||||
| 1231 | location object if there are more than one locations in the | ||||
| 1232 | array or returns the single location if there is only one | ||||
| 1233 | Usage : my $loc = _single_loc_object_from_collection( @sublocs ); | ||||
| 1234 | Args : array of Bio::Location objects | ||||
| 1235 | Returns : a single Bio:;Location object containing all locations | ||||
| 1236 | |||||
| 1237 | =cut | ||||
| 1238 | |||||
| 1239 | sub _single_loc_object_from_collection { | ||||
| 1240 | my ( $self, @locs ) = @_; | ||||
| 1241 | my $loc; | ||||
| 1242 | if ( @locs > 1 ) { | ||||
| 1243 | $loc = Bio::Location::Split->new; | ||||
| 1244 | $loc->add_sub_Location(@locs); | ||||
| 1245 | } | ||||
| 1246 | elsif ( @locs == 1 ) { | ||||
| 1247 | $loc = shift @locs; | ||||
| 1248 | } | ||||
| 1249 | return $loc; | ||||
| 1250 | } # _single_loc_object_from_collection | ||||
| 1251 | |||||
| 1252 | =head2 _location_objects_from_coordinate_list | ||||
| 1253 | |||||
| 1254 | Title : _location_objects_from_coordinate_list | ||||
| 1255 | Function: takes an array-ref of start/end coordinates, a strand and a | ||||
| 1256 | type and returns a list of Bio::Location objects (Fuzzy by | ||||
| 1257 | default, Simple in case of in-between coordinates). | ||||
| 1258 | If location type is not "IN-BETWEEN", individual types may be | ||||
| 1259 | passed in for start and end location as per Bio::Location::Fuzzy | ||||
| 1260 | documentation. | ||||
| 1261 | Usage : my @loc_objs = $self->_location_objects_from_coordinate_list( | ||||
| 1262 | \@coords, | ||||
| 1263 | $strand, | ||||
| 1264 | $type | ||||
| 1265 | ); | ||||
| 1266 | Args : array-ref of array-refs each containing: | ||||
| 1267 | start, end [, start-type, end-type] | ||||
| 1268 | where types are optional. If given, must be | ||||
| 1269 | a one of ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN') | ||||
| 1270 | strand (all locations must be on same strand) | ||||
| 1271 | location-type (EXACT, IN-BETWEEN etc) | ||||
| 1272 | Returns : list of Bio::Location objects | ||||
| 1273 | |||||
| 1274 | =cut | ||||
| 1275 | |||||
| 1276 | sub _location_objects_from_coordinate_list { | ||||
| 1277 | my $self = shift; | ||||
| 1278 | my ( $coords_ref, $strand, $type ) = @_; | ||||
| 1279 | $self->throw( 'expected 3 parameters but got ' . @_ ) unless @_ == 3; | ||||
| 1280 | $self->throw('first argument must be an ARRAY reference#') | ||||
| 1281 | unless ref($coords_ref) eq 'ARRAY'; | ||||
| 1282 | |||||
| 1283 | my @loc; | ||||
| 1284 | foreach my $coords_set (@$coords_ref) { | ||||
| 1285 | my ( $start, $end, $start_type, $end_type ) = @$coords_set; | ||||
| 1286 | |||||
| 1287 | # taken from Bio::SeqUtils::_coord_adjust | ||||
| 1288 | if ( $type ne 'IN-BETWEEN' ) { | ||||
| 1289 | my $loc = Bio::Location::Fuzzy->new( | ||||
| 1290 | -start => $start, | ||||
| 1291 | -end => $end, | ||||
| 1292 | -strand => $strand, | ||||
| 1293 | -location_type => $type | ||||
| 1294 | ); | ||||
| 1295 | $loc->start_pos_type($start_type) if $start_type; | ||||
| 1296 | $loc->end_pos_type($end_type) if $end_type; | ||||
| 1297 | push @loc, $loc; | ||||
| 1298 | } | ||||
| 1299 | else { | ||||
| 1300 | push @loc, | ||||
| 1301 | Bio::Location::Simple->new( | ||||
| 1302 | -start => $start, | ||||
| 1303 | -end => $end, | ||||
| 1304 | -strand => $strand, | ||||
| 1305 | -location_type => $type | ||||
| 1306 | ); | ||||
| 1307 | } | ||||
| 1308 | } # each coords_set | ||||
| 1309 | return @loc; | ||||
| 1310 | } # _location_objects_from_coordinate_list | ||||
| 1311 | |||||
| 1312 | =head2 _new_seq_via_clone | ||||
| 1313 | |||||
| 1314 | Title : _new_seq_via_clone | ||||
| 1315 | Function: clone a sequence object using Bio::Root::Root::clone and set the new sequence string | ||||
| 1316 | sequence features are removed. | ||||
| 1317 | Usage : my $new_seq = $self->_new_seq_via_clone( $seq_obj, $seq_str ); | ||||
| 1318 | Args : original seq object [, new sequence string] | ||||
| 1319 | Returns : a clone of the original sequence object, optionally with new sequence string | ||||
| 1320 | |||||
| 1321 | =cut | ||||
| 1322 | |||||
| 1323 | sub _new_seq_via_clone { | ||||
| 1324 | my ( $self, $in_seq_obj, $seq_str ) = @_; | ||||
| 1325 | my $out_seq_obj = $in_seq_obj->clone; | ||||
| 1326 | $out_seq_obj->remove_SeqFeatures if $out_seq_obj->can('remove_SeqFeatures'); | ||||
| 1327 | if ( blessed $out_seq_obj->seq | ||||
| 1328 | && $out_seq_obj->seq->isa('Bio::PrimarySeq') ) | ||||
| 1329 | { | ||||
| 1330 | $out_seq_obj->seq->seq($seq_str); | ||||
| 1331 | } | ||||
| 1332 | else { | ||||
| 1333 | $out_seq_obj->seq($seq_str); | ||||
| 1334 | } | ||||
| 1335 | return $out_seq_obj; | ||||
| 1336 | |||||
| 1337 | } # _new_seq_via_clone | ||||
| 1338 | |||||
| 1339 | =head2 _new_seq_from_old | ||||
| 1340 | |||||
| 1341 | Title : _new_seq_from_old | ||||
| 1342 | Function: creates a new sequence obejct, if possible of the same class as the old and adds | ||||
| 1343 | attributes to it. Also copies annotation across to the new object. | ||||
| 1344 | Usage : my $new_seq = $self->_new_seq_from_old( $seq_obj, { seq => $seq_str, display_id => 'some_ID'}); | ||||
| 1345 | Args : old sequence object | ||||
| 1346 | hashref of attributes for the new sequence (sequence string etc.) | ||||
| 1347 | Returns : a new Bio::Seq object | ||||
| 1348 | |||||
| 1349 | =cut | ||||
| 1350 | |||||
| 1351 | sub _new_seq_from_old { | ||||
| 1352 | my ( $self, $in_seq_obj, $attr ) = @_; | ||||
| 1353 | $self->throw('attributes must be a hashref') | ||||
| 1354 | if $attr && ref($attr) ne 'HASH'; | ||||
| 1355 | |||||
| 1356 | my $seqclass; | ||||
| 1357 | if ( $in_seq_obj->can_call_new ) { | ||||
| 1358 | $seqclass = ref($in_seq_obj); | ||||
| 1359 | } | ||||
| 1360 | else { | ||||
| 1361 | $seqclass = 'Bio::Primaryseq'; | ||||
| 1362 | $self->_attempt_to_load_seq; | ||||
| 1363 | } | ||||
| 1364 | |||||
| 1365 | my $out_seq_obj = $seqclass->new( | ||||
| 1366 | -seq => $attr->{seq} || $in_seq_obj->seq, | ||||
| 1367 | -display_id => $attr->{display_id} || $in_seq_obj->display_id, | ||||
| 1368 | -accession_number => $attr->{accession_number} | ||||
| 1369 | || $in_seq_obj->accession_number | ||||
| 1370 | || '', | ||||
| 1371 | -alphabet => $in_seq_obj->alphabet, | ||||
| 1372 | -desc => $attr->{desc} || $in_seq_obj->desc, | ||||
| 1373 | -verbose => $attr->{verbose} || $in_seq_obj->verbose, | ||||
| 1374 | -is_circular => $attr->{is_circular} || $in_seq_obj->is_circular || 0, | ||||
| 1375 | ); | ||||
| 1376 | |||||
| 1377 | # move the annotation across to the product | ||||
| 1378 | if ( $out_seq_obj->isa("Bio::AnnotatableI") | ||||
| 1379 | && $in_seq_obj->isa("Bio::AnnotatableI") ) | ||||
| 1380 | { | ||||
| 1381 | foreach my $key ( $in_seq_obj->annotation->get_all_annotation_keys ) { | ||||
| 1382 | foreach my $value ( $in_seq_obj->annotation->get_Annotations($key) ) | ||||
| 1383 | { | ||||
| 1384 | $out_seq_obj->annotation->add_Annotation( $key, $value ); | ||||
| 1385 | } | ||||
| 1386 | } | ||||
| 1387 | } | ||||
| 1388 | return $out_seq_obj; | ||||
| 1389 | } # _new_seq_from_old | ||||
| 1390 | |||||
| 1391 | =head2 _coord_adjust | ||||
| 1392 | |||||
| 1393 | Title : _coord_adjust | ||||
| 1394 | Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length); | ||||
| 1395 | Function: Recursive subroutine to adjust the coordinates of a feature | ||||
| 1396 | and all its subfeatures. If a sequence length is specified, then | ||||
| 1397 | any adjusted features that have locations beyond the boundaries | ||||
| 1398 | of the sequence are converted to Bio::Location::Fuzzy objects. | ||||
| 1399 | |||||
| 1400 | Returns : A Bio::SeqFeatureI compliant object. | ||||
| 1401 | Args : A Bio::SeqFeatureI compliant object, | ||||
| 1402 | the number of bases to add to the coordinates | ||||
| 1403 | (optional) the length of the parent sequence | ||||
| 1404 | |||||
| 1405 | |||||
| 1406 | =cut | ||||
| 1407 | |||||
| 1408 | sub _coord_adjust { | ||||
| 1409 | my ( $self, $feat, $add, $length ) = @_; | ||||
| 1410 | $self->throw( 'Object [$feat] ' | ||||
| 1411 | . 'of class [' | ||||
| 1412 | . ref($feat) | ||||
| 1413 | . '] should be a Bio::SeqFeatureI ' ) | ||||
| 1414 | unless $feat->isa('Bio::SeqFeatureI'); | ||||
| 1415 | my @adjsubfeat; | ||||
| 1416 | for my $subfeat ( $feat->get_SeqFeatures ) { | ||||
| 1417 | push @adjsubfeat, $self->_coord_adjust( $subfeat, $add, $length ); | ||||
| 1418 | } | ||||
| 1419 | my @loc; | ||||
| 1420 | for ( $feat->location->each_Location ) { | ||||
| 1421 | my @coords = ( $_->start, $_->end ); | ||||
| 1422 | my $strand = $_->strand; | ||||
| 1423 | my $type = $_->location_type; | ||||
| 1424 | foreach (@coords) { | ||||
| 1425 | $self->throw("can not handle negative feature positions (got: $_)") | ||||
| 1426 | if $_ < 0; | ||||
| 1427 | if ( $add + $_ < 1 ) { | ||||
| 1428 | $_ = '<1'; | ||||
| 1429 | } | ||||
| 1430 | elsif ( defined $length and $add + $_ > $length ) { | ||||
| 1431 | $_ = ">$length"; | ||||
| 1432 | } | ||||
| 1433 | else { | ||||
| 1434 | $_ = $add + $_; | ||||
| 1435 | } | ||||
| 1436 | } | ||||
| 1437 | push @loc, | ||||
| 1438 | $self->_location_objects_from_coordinate_list( [ \@coords ], | ||||
| 1439 | $strand, $type ); | ||||
| 1440 | } | ||||
| 1441 | my $newfeat = | ||||
| 1442 | Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag ); | ||||
| 1443 | foreach my $key ( $feat->annotation->get_all_annotation_keys() ) { | ||||
| 1444 | foreach my $value ( $feat->annotation->get_Annotations($key) ) { | ||||
| 1445 | $newfeat->annotation->add_Annotation( $key, $value ); | ||||
| 1446 | } | ||||
| 1447 | } | ||||
| 1448 | foreach my $key ( $feat->get_all_tags() ) { | ||||
| 1449 | $newfeat->add_tag_value( $key, $feat->get_tag_values($key) ); | ||||
| 1450 | } | ||||
| 1451 | my $loc = $self->_single_loc_object_from_collection(@loc); | ||||
| 1452 | $loc ? $newfeat->location($loc) : return; | ||||
| 1453 | $newfeat->add_SeqFeature($_) for @adjsubfeat; | ||||
| 1454 | return $newfeat; | ||||
| 1455 | } | ||||
| 1456 | |||||
| 1457 | =head2 revcom_with_features | ||||
| 1458 | |||||
| 1459 | Title : revcom_with_features | ||||
| 1460 | Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq); | ||||
| 1461 | Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates | ||||
| 1462 | as appropriate. | ||||
| 1463 | Returns : A new sequence object | ||||
| 1464 | Args : A sequence object | ||||
| 1465 | |||||
| 1466 | |||||
| 1467 | =cut | ||||
| 1468 | |||||
| 1469 | sub revcom_with_features { | ||||
| 1470 | my ( $self, $seq ) = @_; | ||||
| 1471 | $self->throw( 'Object [$seq] ' | ||||
| 1472 | . 'of class [' | ||||
| 1473 | . ref($seq) | ||||
| 1474 | . '] should be a Bio::SeqI ' ) | ||||
| 1475 | unless $seq->isa('Bio::SeqI'); | ||||
| 1476 | my $revcom = $seq->revcom; | ||||
| 1477 | |||||
| 1478 | # make sure that there is no annotation or features in $trunc | ||||
| 1479 | # (->revcom() now clone objects except for Bio::Seq::LargePrimarySeq) | ||||
| 1480 | $revcom->annotation->remove_Annotations; | ||||
| 1481 | $revcom->remove_SeqFeatures; | ||||
| 1482 | |||||
| 1483 | #move annotations | ||||
| 1484 | foreach my $key ( $seq->annotation->get_all_annotation_keys() ) { | ||||
| 1485 | foreach my $value ( $seq->annotation->get_Annotations($key) ) { | ||||
| 1486 | $revcom->annotation->add_Annotation( $key, $value ); | ||||
| 1487 | } | ||||
| 1488 | } | ||||
| 1489 | |||||
| 1490 | #move features | ||||
| 1491 | for ( map { $self->_feature_revcom( $_, $seq->length ) } | ||||
| 1492 | reverse $seq->get_SeqFeatures ) | ||||
| 1493 | { | ||||
| 1494 | $revcom->add_SeqFeature($_); | ||||
| 1495 | } | ||||
| 1496 | return $revcom; | ||||
| 1497 | } | ||||
| 1498 | |||||
| 1499 | =head2 _feature_revcom | ||||
| 1500 | |||||
| 1501 | Title : _feature_revcom | ||||
| 1502 | Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length); | ||||
| 1503 | Function: Recursive subroutine to reverse complement a feature and | ||||
| 1504 | all its subfeatures. The length of the parent sequence must be | ||||
| 1505 | specified. | ||||
| 1506 | |||||
| 1507 | Returns : A Bio::SeqFeatureI compliant object. | ||||
| 1508 | Args : A Bio::SeqFeatureI compliant object, | ||||
| 1509 | the length of the parent sequence | ||||
| 1510 | |||||
| 1511 | |||||
| 1512 | =cut | ||||
| 1513 | |||||
| 1514 | sub _feature_revcom { | ||||
| 1515 | my ( $self, $feat, $length ) = @_; | ||||
| 1516 | $self->throw( 'Object [$feat] ' | ||||
| 1517 | . 'of class [' | ||||
| 1518 | . ref($feat) | ||||
| 1519 | . '] should be a Bio::SeqFeatureI ' ) | ||||
| 1520 | unless $feat->isa('Bio::SeqFeatureI'); | ||||
| 1521 | my @adjsubfeat; | ||||
| 1522 | for my $subfeat ( $feat->get_SeqFeatures ) { | ||||
| 1523 | push @adjsubfeat, $self->_feature_revcom( $subfeat, $length ); | ||||
| 1524 | } | ||||
| 1525 | my @loc; | ||||
| 1526 | for ( $feat->location->each_Location ) { | ||||
| 1527 | my $type = $_->location_type; | ||||
| 1528 | my $strand; | ||||
| 1529 | if ( $_->strand == -1 ) { $strand = 1 } | ||||
| 1530 | elsif ( $_->strand == 1 ) { $strand = -1 } | ||||
| 1531 | else { $strand = $_->strand } | ||||
| 1532 | my $newend = | ||||
| 1533 | $self->_coord_revcom( $_->start, $_->start_pos_type, $length ); | ||||
| 1534 | my $newstart = | ||||
| 1535 | $self->_coord_revcom( $_->end, $_->end_pos_type, $length ); | ||||
| 1536 | my $newstart_type = $_->end_pos_type; | ||||
| 1537 | $newstart_type = 'BEFORE' if $_->end_pos_type eq 'AFTER'; | ||||
| 1538 | $newstart_type = 'AFTER' if $_->end_pos_type eq 'BEFORE'; | ||||
| 1539 | my $newend_type = $_->start_pos_type; | ||||
| 1540 | $newend_type = 'BEFORE' if $_->start_pos_type eq 'AFTER'; | ||||
| 1541 | $newend_type = 'AFTER' if $_->start_pos_type eq 'BEFORE'; | ||||
| 1542 | push @loc, | ||||
| 1543 | $self->_location_objects_from_coordinate_list( | ||||
| 1544 | [ [ $newstart, $newend, $newstart_type, $newend_type ] ], | ||||
| 1545 | $strand, $type ); | ||||
| 1546 | } | ||||
| 1547 | my $newfeat = | ||||
| 1548 | Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag ); | ||||
| 1549 | foreach my $key ( $feat->annotation->get_all_annotation_keys() ) { | ||||
| 1550 | foreach my $value ( $feat->annotation->get_Annotations($key) ) { | ||||
| 1551 | $newfeat->annotation->add_Annotation( $key, $value ); | ||||
| 1552 | } | ||||
| 1553 | } | ||||
| 1554 | foreach my $key ( $feat->get_all_tags() ) { | ||||
| 1555 | $newfeat->add_tag_value( $key, $feat->get_tag_values($key) ); | ||||
| 1556 | } | ||||
| 1557 | |||||
| 1558 | my $loc = $self->_single_loc_object_from_collection(@loc); | ||||
| 1559 | $loc ? $newfeat->location($loc) : return; | ||||
| 1560 | |||||
| 1561 | $newfeat->add_SeqFeature($_) for @adjsubfeat; | ||||
| 1562 | return $newfeat; | ||||
| 1563 | } | ||||
| 1564 | |||||
| 1565 | sub _coord_revcom { | ||||
| 1566 | my ( $self, $coord, $type, $length ) = @_; | ||||
| 1567 | if ( $type eq 'BETWEEN' or $type eq 'WITHIN' ) { | ||||
| 1568 | $coord =~ s/(\d+)(\D*)(\d+)/$length+1-$3.$2.$length+1-$1/ge; | ||||
| 1569 | } | ||||
| 1570 | else { | ||||
| 1571 | $coord =~ s/(\d+)/$length+1-$1/ge; | ||||
| 1572 | $coord =~ tr/<>/></; | ||||
| 1573 | $coord = '>' . $coord | ||||
| 1574 | if $type eq 'BEFORE' and substr( $coord, 0, 1 ) ne '>'; | ||||
| 1575 | $coord = '<' . $coord | ||||
| 1576 | if $type eq 'AFTER' and substr( $coord, 0, 1 ) ne '<'; | ||||
| 1577 | } | ||||
| 1578 | return $coord; | ||||
| 1579 | } | ||||
| 1580 | |||||
| 1581 | =head2 evolve | ||||
| 1582 | |||||
| 1583 | Title : evolve | ||||
| 1584 | Usage : my $newseq = Bio::SeqUtils-> | ||||
| 1585 | evolve($seq, $similarity, $transition_transversion_rate); | ||||
| 1586 | Function: Mutates the sequence by point mutations until the similarity of | ||||
| 1587 | the new sequence has decreased to the required level. | ||||
| 1588 | Transition/transversion rate is adjustable. | ||||
| 1589 | Returns : A new Bio::PrimarySeq object | ||||
| 1590 | Args : sequence object | ||||
| 1591 | percentage similarity (e.g. 80) | ||||
| 1592 | tr/tv rate, optional, defaults to 1 (= 1:1) | ||||
| 1593 | |||||
| 1594 | Set the verbosity of the Bio::SeqUtils object to positive integer to | ||||
| 1595 | see the mutations as they happen. | ||||
| 1596 | |||||
| 1597 | This method works only on nucleotide sequences. It prints a warning if | ||||
| 1598 | you set the target similarity to be less than 25%. | ||||
| 1599 | |||||
| 1600 | Transition/transversion ratio is an observed attribute of an sequence | ||||
| 1601 | comparison. We are dealing here with the transition/transversion rate | ||||
| 1602 | that we set for our model of sequence evolution. | ||||
| 1603 | |||||
| 1604 | =cut | ||||
| 1605 | |||||
| 1606 | sub evolve { | ||||
| 1607 | my ( $self, $seq, $sim, $rate ) = @_; | ||||
| 1608 | $rate ||= 1; | ||||
| 1609 | |||||
| 1610 | $self->throw( 'Object [$seq] ' | ||||
| 1611 | . 'of class [' | ||||
| 1612 | . ref($seq) | ||||
| 1613 | . '] should be a Bio::PrimarySeqI ' ) | ||||
| 1614 | unless $seq->isa('Bio::PrimarySeqI'); | ||||
| 1615 | |||||
| 1616 | $self->throw( | ||||
| 1617 | "[$sim] " . ' should be a positive integer or float under 100' ) | ||||
| 1618 | unless $sim =~ /^[+\d.]+$/ and $sim <= 100; | ||||
| 1619 | |||||
| 1620 | $self->warn( | ||||
| 1621 | "Nucleotide sequences are 25% similar by chance. | ||||
| 1622 | Do you really want to set similarity to [$sim]%?\n" | ||||
| 1623 | ) unless $sim > 25; | ||||
| 1624 | |||||
| 1625 | $self->throw('Only nucleotide sequences are supported') | ||||
| 1626 | if $seq->alphabet eq 'protein'; | ||||
| 1627 | |||||
| 1628 | # arrays of possible changes have transitions as first items | ||||
| 1629 | my %changes; | ||||
| 1630 | $changes{'a'} = [ 't', 'c', 'g' ]; | ||||
| 1631 | $changes{'t'} = [ 'a', 'c', 'g' ]; | ||||
| 1632 | $changes{'c'} = [ 'g', 'a', 't' ]; | ||||
| 1633 | $changes{'g'} = [ 'c', 'a', 't' ]; | ||||
| 1634 | |||||
| 1635 | # given the desired rate, find out where cut off points need to be | ||||
| 1636 | # when random numbers are generated from 0 to 100 | ||||
| 1637 | # we are ignoring identical mutations (e.g. A->A) to speed things up | ||||
| 1638 | my $bin_size = 100 / ( $rate + 2 ); | ||||
| 1639 | my $transition = 100 - ( 2 * $bin_size ); | ||||
| 1640 | my $first_transversion = $transition + $bin_size; | ||||
| 1641 | |||||
| 1642 | # unify the look of sequence strings | ||||
| 1643 | my $string = lc $seq->seq; # lower case | ||||
| 1644 | $string =~ | ||||
| 1645 | s/u/t/; # simplyfy our life; modules should deal with the change anyway | ||||
| 1646 | # store the original sequence string | ||||
| 1647 | my $oristring = $string; | ||||
| 1648 | my $length = $seq->length; | ||||
| 1649 | |||||
| 1650 | # stop evolving if the limit has been reached | ||||
| 1651 | until ( $self->_get_similarity( $oristring, $string ) <= $sim ) { | ||||
| 1652 | |||||
| 1653 | # find the location in the string to change | ||||
| 1654 | my $loc = int( rand $length ) + 1; | ||||
| 1655 | |||||
| 1656 | # nucleotide to change | ||||
| 1657 | my $oldnuc = substr $string, $loc - 1, 1; | ||||
| 1658 | my $newnuc; | ||||
| 1659 | |||||
| 1660 | # nucleotide it is changed to | ||||
| 1661 | my $choose = rand(100); | ||||
| 1662 | if ( $choose < $transition ) { | ||||
| 1663 | $newnuc = $changes{$oldnuc}[0]; | ||||
| 1664 | } | ||||
| 1665 | elsif ( $choose < $first_transversion ) { | ||||
| 1666 | $newnuc = $changes{$oldnuc}[1]; | ||||
| 1667 | } | ||||
| 1668 | else { | ||||
| 1669 | $newnuc = $changes{$oldnuc}[2]; | ||||
| 1670 | } | ||||
| 1671 | |||||
| 1672 | # do the change | ||||
| 1673 | substr $string, $loc - 1, 1, $newnuc; | ||||
| 1674 | |||||
| 1675 | $self->debug("$loc$oldnuc>$newnuc\n"); | ||||
| 1676 | } | ||||
| 1677 | |||||
| 1678 | return new Bio::PrimarySeq( | ||||
| 1679 | -id => $seq->id . "-$sim", | ||||
| 1680 | -description => $seq->description, | ||||
| 1681 | -seq => $string | ||||
| 1682 | ); | ||||
| 1683 | } | ||||
| 1684 | |||||
| 1685 | sub _get_similarity { | ||||
| 1686 | my ( $self, $oriseq, $seq ) = @_; | ||||
| 1687 | |||||
| 1688 | my $len = length($oriseq); | ||||
| 1689 | my $c; | ||||
| 1690 | |||||
| 1691 | for ( my $i = 0 ; $i < $len ; $i++ ) { | ||||
| 1692 | $c++ if substr( $oriseq, $i, 1 ) eq substr( $seq, $i, 1 ); | ||||
| 1693 | } | ||||
| 1694 | return 100 * $c / $len; | ||||
| 1695 | } | ||||
| 1696 | |||||
| 1697 | 1 | 23µs | 1; | ||
# spent 12µs within Bio::SeqUtils::CORE:match which was called 27 times, avg 437ns/call:
# 27 times (12µs+0s) by Bio::SeqUtils::valid_aa at line 429, avg 437ns/call | |||||
# spent 6µs within Bio::SeqUtils::CORE:sort which was called:
# once (6µs+0s) by Bio::SeqUtils::valid_aa at line 428 |