| Filename | /Users/ap13/perl5/lib/perl5/Bio/SeqIO.pm |
| Statements | Executed 17 statements in 1.94ms |
| Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 2.61ms | 2.72ms | Bio::SeqIO::BEGIN@332 |
| 1 | 1 | 1 | 1.20ms | 1.81ms | Bio::SeqIO::BEGIN@330 |
| 1 | 1 | 1 | 1.06ms | 1.59ms | Bio::SeqIO::BEGIN@331 |
| 1 | 1 | 1 | 15µs | 31µs | Bio::SeqIO::BEGIN@327 |
| 1 | 1 | 1 | 11µs | 491µs | Bio::SeqIO::BEGIN@335 |
| 1 | 1 | 1 | 11µs | 50µs | Bio::SeqIO::BEGIN@333 |
| 1 | 1 | 1 | 8µs | 14µs | Bio::SeqIO::BEGIN@328 |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::DESTROY |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::PRINT |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::READLINE |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::TIEHANDLE |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::_concatenate_lines |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::_filehandle |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::_guess_format |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::_initialize |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::_load_format_module |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::alphabet |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::fh |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::location_factory |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::new |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::newFh |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::next_seq |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::object_factory |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::sequence_builder |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::sequence_factory |
| 0 | 0 | 0 | 0s | 0s | Bio::SeqIO::write_seq |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | # BioPerl module for Bio::SeqIO | ||||
| 2 | # | ||||
| 3 | # Please direct questions and support issues to <bioperl-l@bioperl.org> | ||||
| 4 | # | ||||
| 5 | # Cared for by Ewan Birney <birney@ebi.ac.uk> | ||||
| 6 | # and Lincoln Stein <lstein@cshl.org> | ||||
| 7 | # | ||||
| 8 | # Copyright Ewan Birney | ||||
| 9 | # | ||||
| 10 | # You may distribute this module under the same terms as perl itself | ||||
| 11 | # | ||||
| 12 | # _history | ||||
| 13 | # October 18, 1999 Largely rewritten by Lincoln Stein | ||||
| 14 | |||||
| 15 | # POD documentation - main docs before the code | ||||
| 16 | |||||
| 17 | =head1 NAME | ||||
| 18 | |||||
| 19 | Bio::SeqIO - Handler for SeqIO Formats | ||||
| 20 | |||||
| 21 | =head1 SYNOPSIS | ||||
| 22 | |||||
| 23 | use Bio::SeqIO; | ||||
| 24 | |||||
| 25 | $in = Bio::SeqIO->new(-file => "inputfilename" , | ||||
| 26 | -format => 'Fasta'); | ||||
| 27 | $out = Bio::SeqIO->new(-file => ">outputfilename" , | ||||
| 28 | -format => 'EMBL'); | ||||
| 29 | |||||
| 30 | while ( my $seq = $in->next_seq() ) { | ||||
| 31 | $out->write_seq($seq); | ||||
| 32 | } | ||||
| 33 | |||||
| 34 | # Now, to actually get at the sequence object, use the standard Bio::Seq | ||||
| 35 | # methods (look at Bio::Seq if you don't know what they are) | ||||
| 36 | |||||
| 37 | use Bio::SeqIO; | ||||
| 38 | |||||
| 39 | $in = Bio::SeqIO->new(-file => "inputfilename" , | ||||
| 40 | -format => 'genbank'); | ||||
| 41 | |||||
| 42 | while ( my $seq = $in->next_seq() ) { | ||||
| 43 | print "Sequence ",$seq->id, " first 10 bases ", | ||||
| 44 | $seq->subseq(1,10), "\n"; | ||||
| 45 | } | ||||
| 46 | |||||
| 47 | |||||
| 48 | # The SeqIO system does have a filehandle binding. Most people find this | ||||
| 49 | # a little confusing, but it does mean you can write the world's | ||||
| 50 | # smallest reformatter | ||||
| 51 | |||||
| 52 | use Bio::SeqIO; | ||||
| 53 | |||||
| 54 | $in = Bio::SeqIO->newFh(-file => "inputfilename" , | ||||
| 55 | -format => 'Fasta'); | ||||
| 56 | $out = Bio::SeqIO->newFh(-format => 'EMBL'); | ||||
| 57 | |||||
| 58 | # World's shortest Fasta<->EMBL format converter: | ||||
| 59 | print $out $_ while <$in>; | ||||
| 60 | |||||
| 61 | |||||
| 62 | =head1 DESCRIPTION | ||||
| 63 | |||||
| 64 | Bio::SeqIO is a handler module for the formats in the SeqIO set (eg, | ||||
| 65 | Bio::SeqIO::fasta). It is the officially sanctioned way of getting at | ||||
| 66 | the format objects, which most people should use. | ||||
| 67 | |||||
| 68 | The Bio::SeqIO system can be thought of like biological file handles. | ||||
| 69 | They are attached to filehandles with smart formatting rules (eg, | ||||
| 70 | genbank format, or EMBL format, or binary trace file format) and | ||||
| 71 | can either read or write sequence objects (Bio::Seq objects, or | ||||
| 72 | more correctly, Bio::SeqI implementing objects, of which Bio::Seq is | ||||
| 73 | one such object). If you want to know what to do with a Bio::Seq | ||||
| 74 | object, read L<Bio::Seq>. | ||||
| 75 | |||||
| 76 | The idea is that you request a stream object for a particular format. | ||||
| 77 | All the stream objects have a notion of an internal file that is read | ||||
| 78 | from or written to. A particular SeqIO object instance is configured | ||||
| 79 | for either input or output. A specific example of a stream object is | ||||
| 80 | the Bio::SeqIO::fasta object. | ||||
| 81 | |||||
| 82 | Each stream object has functions | ||||
| 83 | |||||
| 84 | $stream->next_seq(); | ||||
| 85 | |||||
| 86 | and | ||||
| 87 | |||||
| 88 | $stream->write_seq($seq); | ||||
| 89 | |||||
| 90 | As an added bonus, you can recover a filehandle that is tied to the | ||||
| 91 | SeqIO object, allowing you to use the standard E<lt>E<gt> and print | ||||
| 92 | operations to read and write sequence objects: | ||||
| 93 | |||||
| 94 | use Bio::SeqIO; | ||||
| 95 | |||||
| 96 | $stream = Bio::SeqIO->newFh(-format => 'Fasta', | ||||
| 97 | -fh => \*ARGV); | ||||
| 98 | # read from standard input or the input filenames | ||||
| 99 | |||||
| 100 | while ( $seq = <$stream> ) { | ||||
| 101 | # do something with $seq | ||||
| 102 | } | ||||
| 103 | |||||
| 104 | and | ||||
| 105 | |||||
| 106 | print $stream $seq; # when stream is in output mode | ||||
| 107 | |||||
| 108 | This makes the simplest ever reformatter | ||||
| 109 | |||||
| 110 | #!/usr/bin/perl | ||||
| 111 | use strict; | ||||
| 112 | my $format1 = shift; | ||||
| 113 | my $format2 = shift || die | ||||
| 114 | "Usage: reformat format1 format2 < input > output"; | ||||
| 115 | |||||
| 116 | use Bio::SeqIO; | ||||
| 117 | |||||
| 118 | my $in = Bio::SeqIO->newFh(-format => $format1, -fh => \*ARGV ); | ||||
| 119 | my $out = Bio::SeqIO->newFh(-format => $format2 ); | ||||
| 120 | # Note: you might want to quote -format to keep older | ||||
| 121 | # perl's from complaining. | ||||
| 122 | |||||
| 123 | print $out $_ while <$in>; | ||||
| 124 | |||||
| 125 | |||||
| 126 | =head1 CONSTRUCTORS | ||||
| 127 | |||||
| 128 | =head2 Bio::SeqIO-E<gt>new() | ||||
| 129 | |||||
| 130 | $seqIO = Bio::SeqIO->new(-file => 'seqs.fasta', -format => $format); | ||||
| 131 | $seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format => $format); | ||||
| 132 | $seqIO = Bio::SeqIO->new(-string => $string , -format => $format); | ||||
| 133 | $seqIO = Bio::SeqIO->new(-format => $format); | ||||
| 134 | |||||
| 135 | The new() class method constructs a new Bio::SeqIO object. The returned object | ||||
| 136 | can be used to retrieve or print Seq objects. new() accepts the following | ||||
| 137 | parameters: | ||||
| 138 | |||||
| 139 | =over 5 | ||||
| 140 | |||||
| 141 | =item -file | ||||
| 142 | |||||
| 143 | A file path to be opened for reading or writing. The usual Perl | ||||
| 144 | conventions apply: | ||||
| 145 | |||||
| 146 | 'file' # open file for reading | ||||
| 147 | '>file' # open file for writing | ||||
| 148 | '>>file' # open file for appending | ||||
| 149 | '+<file' # open file read/write | ||||
| 150 | 'command |' # open a pipe from the command | ||||
| 151 | '| command' # open a pipe to the command | ||||
| 152 | |||||
| 153 | =item -fh | ||||
| 154 | |||||
| 155 | You may use new() with an opened filehandle, provided as a glob reference. For | ||||
| 156 | example, to read from STDIN: | ||||
| 157 | |||||
| 158 | my $seqIO = Bio::SeqIO->new(-fh => \*STDIN); | ||||
| 159 | |||||
| 160 | A string filehandle is handy if you want to modify the output in the | ||||
| 161 | memory, before printing it out. The following program reads in EMBL | ||||
| 162 | formatted entries from a file and prints them out in fasta format with | ||||
| 163 | some HTML tags: | ||||
| 164 | |||||
| 165 | use Bio::SeqIO; | ||||
| 166 | use IO::String; | ||||
| 167 | my $in = Bio::SeqIO->new(-file => "emblfile", | ||||
| 168 | -format => 'EMBL'); | ||||
| 169 | while ( my $seq = $in->next_seq() ) { | ||||
| 170 | # the output handle is reset for every file | ||||
| 171 | my $stringio = IO::String->new($string); | ||||
| 172 | my $out = Bio::SeqIO->new(-fh => $stringio, | ||||
| 173 | -format => 'fasta'); | ||||
| 174 | # output goes into $string | ||||
| 175 | $out->write_seq($seq); | ||||
| 176 | # modify $string | ||||
| 177 | $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g; | ||||
| 178 | # print into STDOUT | ||||
| 179 | print $string; | ||||
| 180 | } | ||||
| 181 | |||||
| 182 | =item -string | ||||
| 183 | |||||
| 184 | A string to read the sequences from. For example: | ||||
| 185 | |||||
| 186 | my $string = ">seq1\nACGCTAGCTAGC\n"; | ||||
| 187 | my $seqIO = Bio::SeqIO->new(-string => $string); | ||||
| 188 | |||||
| 189 | =item -format | ||||
| 190 | |||||
| 191 | Specify the format of the file. Supported formats include fasta, | ||||
| 192 | genbank, embl, swiss (SwissProt), Entrez Gene and tracefile formats | ||||
| 193 | such as abi (ABI) and scf. There are many more, for a complete listing | ||||
| 194 | see the SeqIO HOWTO (L<http://bioperl.open-bio.org/wiki/HOWTO:SeqIO>). | ||||
| 195 | |||||
| 196 | If no format is specified and a filename is given then the module will | ||||
| 197 | attempt to deduce the format from the filename suffix. If there is no | ||||
| 198 | suffix that Bioperl understands then it will attempt to guess the | ||||
| 199 | format based on file content. If this is unsuccessful then SeqIO will | ||||
| 200 | throw a fatal error. | ||||
| 201 | |||||
| 202 | The format name is case-insensitive: 'FASTA', 'Fasta' and 'fasta' are | ||||
| 203 | all valid. | ||||
| 204 | |||||
| 205 | Currently, the tracefile formats (except for SCF) require installation | ||||
| 206 | of the external Staden "io_lib" package, as well as the | ||||
| 207 | Bio::SeqIO::staden::read package available from the bioperl-ext | ||||
| 208 | repository. | ||||
| 209 | |||||
| 210 | =item -alphabet | ||||
| 211 | |||||
| 212 | Sets the alphabet ('dna', 'rna', or 'protein'). When the alphabet is | ||||
| 213 | set then Bioperl will not attempt to guess what the alphabet is. This | ||||
| 214 | may be important because Bioperl does not always guess correctly. | ||||
| 215 | |||||
| 216 | =item -flush | ||||
| 217 | |||||
| 218 | By default, all files (or filehandles) opened for writing sequences | ||||
| 219 | will be flushed after each write_seq() (making the file immediately | ||||
| 220 | usable). If you do not need this facility and would like to marginally | ||||
| 221 | improve the efficiency of writing multiple sequences to the same file | ||||
| 222 | (or filehandle), pass the -flush option '0' or any other value that | ||||
| 223 | evaluates as defined but false: | ||||
| 224 | |||||
| 225 | my $gb = Bio::SeqIO->new(-file => "<gball.gbk", | ||||
| 226 | -format => "gb"); | ||||
| 227 | my $fa = Bio::SeqIO->new(-file => ">gball.fa", | ||||
| 228 | -format => "fasta", | ||||
| 229 | -flush => 0); # go as fast as we can! | ||||
| 230 | while($seq = $gb->next_seq) { $fa->write_seq($seq) } | ||||
| 231 | |||||
| 232 | =item -seqfactory | ||||
| 233 | |||||
| 234 | Provide a Bio::Factory::SequenceFactoryI object. See the sequence_factory() method. | ||||
| 235 | |||||
| 236 | =item -locfactory | ||||
| 237 | |||||
| 238 | Provide a Bio::Factory::LocationFactoryI object. See the location_factory() method. | ||||
| 239 | |||||
| 240 | =item -objbuilder | ||||
| 241 | |||||
| 242 | Provide a Bio::Factory::ObjectBuilderI object. See the sequence_builder() method. | ||||
| 243 | |||||
| 244 | =back | ||||
| 245 | |||||
| 246 | =head2 Bio::SeqIO-E<gt>newFh() | ||||
| 247 | |||||
| 248 | $fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format); | ||||
| 249 | $fh = Bio::SeqIO->newFh(-format => $format); | ||||
| 250 | # etc. | ||||
| 251 | |||||
| 252 | This constructor behaves like new(), but returns a tied filehandle | ||||
| 253 | rather than a Bio::SeqIO object. You can read sequences from this | ||||
| 254 | object using the familiar E<lt>E<gt> operator, and write to it using | ||||
| 255 | print(). The usual array and $_ semantics work. For example, you can | ||||
| 256 | read all sequence objects into an array like this: | ||||
| 257 | |||||
| 258 | @sequences = <$fh>; | ||||
| 259 | |||||
| 260 | Other operations, such as read(), sysread(), write(), close(), and | ||||
| 261 | printf() are not supported. | ||||
| 262 | |||||
| 263 | =head1 OBJECT METHODS | ||||
| 264 | |||||
| 265 | See below for more detailed summaries. The main methods are: | ||||
| 266 | |||||
| 267 | =head2 $sequence = $seqIO-E<gt>next_seq() | ||||
| 268 | |||||
| 269 | Fetch the next sequence from the stream, or nothing if no more. | ||||
| 270 | |||||
| 271 | =head2 $seqIO-E<gt>write_seq($sequence [,$another_sequence,...]) | ||||
| 272 | |||||
| 273 | Write the specified sequence(s) to the stream. | ||||
| 274 | |||||
| 275 | =head2 TIEHANDLE(), READLINE(), PRINT() | ||||
| 276 | |||||
| 277 | These provide the tie interface. See L<perltie> for more details. | ||||
| 278 | |||||
| 279 | =head1 FEEDBACK | ||||
| 280 | |||||
| 281 | =head2 Mailing Lists | ||||
| 282 | |||||
| 283 | User feedback is an integral part of the evolution of this and other | ||||
| 284 | Bioperl modules. Send your comments and suggestions preferably to one | ||||
| 285 | of the Bioperl mailing lists. | ||||
| 286 | |||||
| 287 | Your participation is much appreciated. | ||||
| 288 | |||||
| 289 | bioperl-l@bioperl.org - General discussion | ||||
| 290 | http://bioperl.org/wiki/Mailing_lists - About the mailing lists | ||||
| 291 | |||||
| 292 | =head2 Support | ||||
| 293 | |||||
| 294 | Please direct usage questions or support issues to the mailing list: | ||||
| 295 | |||||
| 296 | bioperl-l@bioperl.org | ||||
| 297 | |||||
| 298 | rather than to the module maintainer directly. Many experienced and | ||||
| 299 | responsive experts will be able to look at the problem and quickly | ||||
| 300 | address it. Please include a thorough description of the problem | ||||
| 301 | with code and data examples if at all possible. | ||||
| 302 | |||||
| 303 | =head2 Reporting Bugs | ||||
| 304 | |||||
| 305 | Report bugs to the Bioperl bug tracking system to help us keep track | ||||
| 306 | of the bugs and their resolution. Bug reports can be submitted via the | ||||
| 307 | web: | ||||
| 308 | |||||
| 309 | https://github.com/bioperl/bioperl-live/issues | ||||
| 310 | |||||
| 311 | =head1 AUTHOR - Ewan Birney, Lincoln Stein | ||||
| 312 | |||||
| 313 | Email birney@ebi.ac.uk | ||||
| 314 | lstein@cshl.org | ||||
| 315 | |||||
| 316 | =head1 APPENDIX | ||||
| 317 | |||||
| 318 | The rest of the documentation details each of the object | ||||
| 319 | methods. Internal methods are usually preceded with a _ | ||||
| 320 | |||||
| 321 | =cut | ||||
| 322 | |||||
| 323 | #' Let the code begin... | ||||
| 324 | |||||
| 325 | package Bio::SeqIO; | ||||
| 326 | |||||
| 327 | 2 | 25µs | 2 | 48µs | # spent 31µs (15+16) within Bio::SeqIO::BEGIN@327 which was called:
# once (15µs+16µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 327 # spent 31µs making 1 call to Bio::SeqIO::BEGIN@327
# spent 16µs making 1 call to strict::import |
| 328 | 2 | 22µs | 2 | 20µs | # spent 14µs (8+6) within Bio::SeqIO::BEGIN@328 which was called:
# once (8µs+6µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 328 # spent 14µs making 1 call to Bio::SeqIO::BEGIN@328
# spent 6µs making 1 call to warnings::import |
| 329 | |||||
| 330 | 2 | 153µs | 1 | 1.81ms | # spent 1.81ms (1.20+612µs) within Bio::SeqIO::BEGIN@330 which was called:
# once (1.20ms+612µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 330 # spent 1.81ms making 1 call to Bio::SeqIO::BEGIN@330 |
| 331 | 2 | 149µs | 1 | 1.59ms | # spent 1.59ms (1.06+534µs) within Bio::SeqIO::BEGIN@331 which was called:
# once (1.06ms+534µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 331 # spent 1.59ms making 1 call to Bio::SeqIO::BEGIN@331 |
| 332 | 2 | 180µs | 1 | 2.72ms | # spent 2.72ms (2.61+109µs) within Bio::SeqIO::BEGIN@332 which was called:
# once (2.61ms+109µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 332 # spent 2.72ms making 1 call to Bio::SeqIO::BEGIN@332 |
| 333 | 2 | 29µs | 2 | 90µs | # spent 50µs (11+40) within Bio::SeqIO::BEGIN@333 which was called:
# once (11µs+40µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 333 # spent 50µs making 1 call to Bio::SeqIO::BEGIN@333
# spent 40µs making 1 call to Exporter::import |
| 334 | |||||
| 335 | 2 | 1.38ms | 2 | 971µs | # spent 491µs (11+480) within Bio::SeqIO::BEGIN@335 which was called:
# once (11µs+480µs) by Bio::Roary::Output::GroupsMultifastaNucleotide::BEGIN@18 at line 335 # spent 491µs making 1 call to Bio::SeqIO::BEGIN@335
# spent 480µs making 1 call to parent::import |
| 336 | |||||
| 337 | 1 | 200ns | my %valid_alphabet_cache; | ||
| 338 | |||||
| 339 | |||||
| 340 | =head2 new | ||||
| 341 | |||||
| 342 | Title : new | ||||
| 343 | Usage : $stream = Bio::SeqIO->new(-file => 'sequences.fasta', | ||||
| 344 | -format => 'fasta'); | ||||
| 345 | Function: Returns a new sequence stream | ||||
| 346 | Returns : A Bio::SeqIO stream initialised with the appropriate format | ||||
| 347 | Args : Named parameters indicating where to read the sequences from or to | ||||
| 348 | write them to: | ||||
| 349 | -file => filename, OR | ||||
| 350 | -fh => filehandle to attach to, OR | ||||
| 351 | -string => string | ||||
| 352 | |||||
| 353 | Additional arguments, all with reasonable defaults: | ||||
| 354 | -format => format of the sequences, usually auto-detected | ||||
| 355 | -alphabet => 'dna', 'rna', or 'protein' | ||||
| 356 | -flush => 0 or 1 (default: flush filehandles after each write) | ||||
| 357 | -seqfactory => sequence factory | ||||
| 358 | -locfactory => location factory | ||||
| 359 | -objbuilder => object builder | ||||
| 360 | |||||
| 361 | See L<Bio::SeqIO::Handler> | ||||
| 362 | |||||
| 363 | =cut | ||||
| 364 | |||||
| 365 | 1 | 400ns | my $entry = 0; | ||
| 366 | |||||
| 367 | sub new { | ||||
| 368 | my ($caller, @args) = @_; | ||||
| 369 | my $class = ref($caller) || $caller; | ||||
| 370 | |||||
| 371 | # or do we want to call SUPER on an object if $caller is an | ||||
| 372 | # object? | ||||
| 373 | if( $class =~ /Bio::SeqIO::(\S+)/ ) { | ||||
| 374 | my ($self) = $class->SUPER::new(@args); | ||||
| 375 | $self->_initialize(@args); | ||||
| 376 | return $self; | ||||
| 377 | } else { | ||||
| 378 | my %params = @args; | ||||
| 379 | @params{ map { lc $_ } keys %params } = values %params; # lowercase keys | ||||
| 380 | |||||
| 381 | unless( defined $params{-file} || | ||||
| 382 | defined $params{-fh} || | ||||
| 383 | defined $params{-string} ) { | ||||
| 384 | $class->throw("file argument provided, but with an undefined value") | ||||
| 385 | if exists $params{'-file'}; | ||||
| 386 | $class->throw("fh argument provided, but with an undefined value") | ||||
| 387 | if exists $params{'-fh'}; | ||||
| 388 | $class->throw("string argument provided, but with an undefined value") | ||||
| 389 | if exists($params{'-string'}); | ||||
| 390 | # $class->throw("No file, fh, or string argument provided"); # neither defined | ||||
| 391 | } | ||||
| 392 | |||||
| 393 | # Determine or guess sequence format and variant | ||||
| 394 | my $format = $params{'-format'}; | ||||
| 395 | if (! $format ) { | ||||
| 396 | if ($params{-file}) { | ||||
| 397 | # Guess from filename extension, and then from file content | ||||
| 398 | $format = $class->_guess_format( $params{-file} ) || | ||||
| 399 | Bio::Tools::GuessSeqFormat->new(-file => $params{-file} )->guess; | ||||
| 400 | } elsif ($params{-fh}) { | ||||
| 401 | # Guess from filehandle content | ||||
| 402 | $format = Bio::Tools::GuessSeqFormat->new(-fh => $params{-fh} )->guess; | ||||
| 403 | } elsif ($params{-string}) { | ||||
| 404 | # Guess from string content | ||||
| 405 | $format = Bio::Tools::GuessSeqFormat->new(-text => $params{-string})->guess; | ||||
| 406 | } | ||||
| 407 | } | ||||
| 408 | |||||
| 409 | # changed 1-3-11; no need to print out an empty string (only way this | ||||
| 410 | # exception is triggered) - cjfields | ||||
| 411 | $class->throw("Could not guess format from file, filehandle or string") | ||||
| 412 | if not $format; | ||||
| 413 | $format = "\L$format"; # normalize capitalization to lower case | ||||
| 414 | |||||
| 415 | if ($format =~ /-/) { | ||||
| 416 | ($format, my $variant) = split('-', $format, 2); | ||||
| 417 | $params{-variant} = $variant; | ||||
| 418 | } | ||||
| 419 | |||||
| 420 | return unless( $class->_load_format_module($format) ); | ||||
| 421 | return "Bio::SeqIO::$format"->new(%params); | ||||
| 422 | } | ||||
| 423 | } | ||||
| 424 | |||||
| 425 | |||||
| 426 | =head2 newFh | ||||
| 427 | |||||
| 428 | Title : newFh | ||||
| 429 | Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') | ||||
| 430 | Function: Does a new() followed by an fh() | ||||
| 431 | Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') | ||||
| 432 | $sequence = <$fh>; # read a sequence object | ||||
| 433 | print $fh $sequence; # write a sequence object | ||||
| 434 | Returns : filehandle tied to the Bio::SeqIO::Fh class | ||||
| 435 | Args : | ||||
| 436 | |||||
| 437 | See L<Bio::SeqIO::Fh> | ||||
| 438 | |||||
| 439 | =cut | ||||
| 440 | |||||
| 441 | sub newFh { | ||||
| 442 | my $class = shift; | ||||
| 443 | return unless my $self = $class->new(@_); | ||||
| 444 | return $self->fh; | ||||
| 445 | } | ||||
| 446 | |||||
| 447 | |||||
| 448 | =head2 fh | ||||
| 449 | |||||
| 450 | Title : fh | ||||
| 451 | Usage : $obj->fh | ||||
| 452 | Function: Get a new filehandle tied to this SeqIO stream | ||||
| 453 | Example : $fh = $obj->fh; # make a tied filehandle | ||||
| 454 | $sequence = <$fh>; # read a sequence object | ||||
| 455 | print $fh $sequence; # write a sequence object | ||||
| 456 | Returns : filehandle tied to Bio::SeqIO class | ||||
| 457 | Args : none | ||||
| 458 | |||||
| 459 | =cut | ||||
| 460 | |||||
| 461 | sub fh { | ||||
| 462 | my $self = shift; | ||||
| 463 | my $class = ref($self) || $self; | ||||
| 464 | my $s = Symbol::gensym; | ||||
| 465 | tie $$s,$class,$self; | ||||
| 466 | return $s; | ||||
| 467 | } | ||||
| 468 | |||||
| 469 | |||||
| 470 | # _initialize is chained for all SeqIO classes | ||||
| 471 | |||||
| 472 | sub _initialize { | ||||
| 473 | my($self, @args) = @_; | ||||
| 474 | |||||
| 475 | # flush is initialized by the Root::IO init | ||||
| 476 | |||||
| 477 | my ($seqfact,$locfact,$objbuilder, $alphabet) = | ||||
| 478 | $self->_rearrange([qw(SEQFACTORY | ||||
| 479 | LOCFACTORY | ||||
| 480 | OBJBUILDER | ||||
| 481 | ALPHABET) | ||||
| 482 | ], @args); | ||||
| 483 | |||||
| 484 | $locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose) | ||||
| 485 | if ! $locfact; | ||||
| 486 | $objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose) | ||||
| 487 | unless $objbuilder; | ||||
| 488 | $self->sequence_builder($objbuilder); | ||||
| 489 | $self->location_factory($locfact); | ||||
| 490 | |||||
| 491 | # note that this should come last because it propagates the sequence | ||||
| 492 | # factory to the sequence builder | ||||
| 493 | $seqfact && $self->sequence_factory($seqfact); | ||||
| 494 | |||||
| 495 | #bug 2160 | ||||
| 496 | $alphabet && $self->alphabet($alphabet); | ||||
| 497 | |||||
| 498 | # initialize the IO part | ||||
| 499 | $self->_initialize_io(@args); | ||||
| 500 | } | ||||
| 501 | |||||
| 502 | |||||
| 503 | =head2 next_seq | ||||
| 504 | |||||
| 505 | Title : next_seq | ||||
| 506 | Usage : $seq = $stream->next_seq | ||||
| 507 | Function: Reads the next sequence object from the stream and returns it. | ||||
| 508 | |||||
| 509 | Certain driver modules may encounter entries in the stream | ||||
| 510 | that are either misformatted or that use syntax not yet | ||||
| 511 | understood by the driver. If such an incident is | ||||
| 512 | recoverable, e.g., by dismissing a feature of a feature | ||||
| 513 | table or some other non-mandatory part of an entry, the | ||||
| 514 | driver will issue a warning. In the case of a | ||||
| 515 | non-recoverable situation an exception will be thrown. Do | ||||
| 516 | not assume that you can resume parsing the same stream | ||||
| 517 | after catching the exception. Note that you can always turn | ||||
| 518 | recoverable errors into exceptions by calling | ||||
| 519 | $stream->verbose(2). | ||||
| 520 | |||||
| 521 | Returns : a Bio::Seq sequence object, or nothing if no more sequences | ||||
| 522 | are available | ||||
| 523 | |||||
| 524 | Args : none | ||||
| 525 | |||||
| 526 | See L<Bio::Root::RootI>, L<Bio::Factory::SeqStreamI>, L<Bio::Seq> | ||||
| 527 | |||||
| 528 | =cut | ||||
| 529 | |||||
| 530 | sub next_seq { | ||||
| 531 | my ($self, $seq) = @_; | ||||
| 532 | $self->throw("Sorry, you cannot read from a generic Bio::SeqIO object."); | ||||
| 533 | } | ||||
| 534 | |||||
| 535 | |||||
| 536 | =head2 write_seq | ||||
| 537 | |||||
| 538 | Title : write_seq | ||||
| 539 | Usage : $stream->write_seq($seq) | ||||
| 540 | Function: writes the $seq object into the stream | ||||
| 541 | Returns : 1 for success and 0 for error | ||||
| 542 | Args : Bio::Seq object | ||||
| 543 | |||||
| 544 | =cut | ||||
| 545 | |||||
| 546 | sub write_seq { | ||||
| 547 | my ($self, $seq) = @_; | ||||
| 548 | $self->throw("Sorry, you cannot write to a generic Bio::SeqIO object."); | ||||
| 549 | } | ||||
| 550 | |||||
| 551 | |||||
| 552 | =head2 format | ||||
| 553 | |||||
| 554 | Title : format | ||||
| 555 | Usage : $format = $stream->format() | ||||
| 556 | Function: Get the sequence format | ||||
| 557 | Returns : sequence format, e.g. fasta, fastq | ||||
| 558 | Args : none | ||||
| 559 | |||||
| 560 | =cut | ||||
| 561 | |||||
| 562 | # format() method inherited from Bio::Root::IO | ||||
| 563 | |||||
| 564 | |||||
| 565 | =head2 alphabet | ||||
| 566 | |||||
| 567 | Title : alphabet | ||||
| 568 | Usage : $self->alphabet($newval) | ||||
| 569 | Function: Set/get the molecule type for the Seq objects to be created. | ||||
| 570 | Example : $seqio->alphabet('protein') | ||||
| 571 | Returns : value of alphabet: 'dna', 'rna', or 'protein' | ||||
| 572 | Args : newvalue (optional) | ||||
| 573 | Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' | ||||
| 574 | |||||
| 575 | =cut | ||||
| 576 | |||||
| 577 | sub alphabet { | ||||
| 578 | my ($self, $value) = @_; | ||||
| 579 | |||||
| 580 | if ( defined $value) { | ||||
| 581 | $value = lc $value; | ||||
| 582 | unless ($valid_alphabet_cache{$value}) { | ||||
| 583 | # instead of hard-coding the allowed values once more, we check by | ||||
| 584 | # creating a dummy sequence object | ||||
| 585 | eval { | ||||
| 586 | require Bio::PrimarySeq; | ||||
| 587 | my $seq = Bio::PrimarySeq->new( -verbose => $self->verbose, | ||||
| 588 | -alphabet => $value ); | ||||
| 589 | }; | ||||
| 590 | if ($@) { | ||||
| 591 | $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); | ||||
| 592 | } | ||||
| 593 | $valid_alphabet_cache{$value} = 1; | ||||
| 594 | } | ||||
| 595 | $self->{'alphabet'} = $value; | ||||
| 596 | } | ||||
| 597 | return $self->{'alphabet'}; | ||||
| 598 | } | ||||
| 599 | |||||
| 600 | |||||
| 601 | =head2 _load_format_module | ||||
| 602 | |||||
| 603 | Title : _load_format_module | ||||
| 604 | Usage : *INTERNAL SeqIO stuff* | ||||
| 605 | Function: Loads up (like use) a module at run time on demand | ||||
| 606 | Example : | ||||
| 607 | Returns : | ||||
| 608 | Args : | ||||
| 609 | |||||
| 610 | =cut | ||||
| 611 | |||||
| 612 | sub _load_format_module { | ||||
| 613 | my ($self, $format) = @_; | ||||
| 614 | my $module = "Bio::SeqIO::" . $format; | ||||
| 615 | my $ok; | ||||
| 616 | |||||
| 617 | eval { | ||||
| 618 | $ok = $self->_load_module($module); | ||||
| 619 | }; | ||||
| 620 | if ( $@ ) { | ||||
| 621 | print STDERR <<END; | ||||
| 622 | $self: $format cannot be found | ||||
| 623 | Exception $@ | ||||
| 624 | For more information about the SeqIO system please see the SeqIO docs. | ||||
| 625 | This includes ways of checking for formats at compile time, not run time | ||||
| 626 | END | ||||
| 627 | ; | ||||
| 628 | } | ||||
| 629 | return $ok; | ||||
| 630 | } | ||||
| 631 | |||||
| 632 | |||||
| 633 | =head2 _concatenate_lines | ||||
| 634 | |||||
| 635 | Title : _concatenate_lines | ||||
| 636 | Usage : $s = _concatenate_lines($line, $continuation_line) | ||||
| 637 | Function: Private. Concatenates two strings assuming that the second stems | ||||
| 638 | from a continuation line of the first. Adds a space between both | ||||
| 639 | unless the first ends with a dash. | ||||
| 640 | |||||
| 641 | Takes care of either arg being empty. | ||||
| 642 | Example : | ||||
| 643 | Returns : A string. | ||||
| 644 | Args : | ||||
| 645 | |||||
| 646 | =cut | ||||
| 647 | |||||
| 648 | sub _concatenate_lines { | ||||
| 649 | my ($self, $s1, $s2) = @_; | ||||
| 650 | $s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2); | ||||
| 651 | return ($s1 ? $s1 : "") . ($s2 ? $s2 : ""); | ||||
| 652 | } | ||||
| 653 | |||||
| 654 | |||||
| 655 | =head2 _filehandle | ||||
| 656 | |||||
| 657 | Title : _filehandle | ||||
| 658 | Usage : $obj->_filehandle($newval) | ||||
| 659 | Function: This method is deprecated. Call _fh() instead. | ||||
| 660 | Example : | ||||
| 661 | Returns : value of _filehandle | ||||
| 662 | Args : newvalue (optional) | ||||
| 663 | |||||
| 664 | =cut | ||||
| 665 | |||||
| 666 | sub _filehandle { | ||||
| 667 | my ($self,@args) = @_; | ||||
| 668 | return $self->_fh(@args); | ||||
| 669 | } | ||||
| 670 | |||||
| 671 | |||||
| 672 | =head2 _guess_format | ||||
| 673 | |||||
| 674 | Title : _guess_format | ||||
| 675 | Usage : $obj->_guess_format($filename) | ||||
| 676 | Function: guess format based on file suffix | ||||
| 677 | Example : | ||||
| 678 | Returns : guessed format of filename (lower case) | ||||
| 679 | Args : | ||||
| 680 | Notes : formats that _guess_format() will guess include fasta, | ||||
| 681 | genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot, | ||||
| 682 | fastq and phd/phred | ||||
| 683 | |||||
| 684 | =cut | ||||
| 685 | |||||
| 686 | sub _guess_format { | ||||
| 687 | my $class = shift; | ||||
| 688 | return unless $_ = shift; | ||||
| 689 | |||||
| 690 | return 'abi' if /\.ab[i1]$/i; | ||||
| 691 | return 'ace' if /\.ace$/i; | ||||
| 692 | return 'alf' if /\.alf$/i; | ||||
| 693 | return 'bsml' if /\.(bsm|bsml)$/i; | ||||
| 694 | return 'ctf' if /\.ctf$/i; | ||||
| 695 | return 'embl' if /\.(embl|ebl|emb|dat)$/i; | ||||
| 696 | return 'entrezgene' if /\.asn$/i; | ||||
| 697 | return 'exp' if /\.exp$/i; | ||||
| 698 | return 'fasta' if /\.(fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa)$/i; | ||||
| 699 | return 'fastq' if /\.fastq$/i; | ||||
| 700 | return 'gcg' if /\.gcg$/i; | ||||
| 701 | return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i; | ||||
| 702 | return 'phd' if /\.(phd|phred)$/i; | ||||
| 703 | return 'pir' if /\.pir$/i; | ||||
| 704 | return 'pln' if /\.pln$/i; | ||||
| 705 | return 'qual' if /\.qual$/i; | ||||
| 706 | return 'raw' if /\.txt$/i; | ||||
| 707 | return 'scf' if /\.scf$/i; | ||||
| 708 | # from Strider 1.4 Release Notes: The file name extensions used by | ||||
| 709 | # Strider 1.4 are ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA, | ||||
| 710 | # DNA Degenerate, RNA and Protein Sequence Files, respectively | ||||
| 711 | return 'strider' if /\.(xdna|xdgn|xrna|xprt)$/i; | ||||
| 712 | return 'swiss' if /\.(swiss|sp)$/i; | ||||
| 713 | return 'ztr' if /\.ztr$/i; | ||||
| 714 | } | ||||
| 715 | |||||
| 716 | |||||
| 717 | sub DESTROY { | ||||
| 718 | # Destructor hook: call close() on the object being destroyed. | ||||
| 719 | $_[0]->close(); | ||||
| 720 | } | ||||
| 721 | |||||
| 722 | |||||
| 723 | sub TIEHANDLE { | ||||
| 724 | my $class = shift; # package the tie object is blessed into | ||||
| 725 | return bless { 'seqio' => shift }, $class; # the next argument is kept under the 'seqio' key | ||||
| 726 | } | ||||
| 727 | |||||
| 728 | |||||
| 729 | sub READLINE { | ||||
| 730 | my $stream = $_[0]->{'seqio'}; # the object stored by TIEHANDLE | ||||
| 731 | unless (wantarray) { return $stream->next_seq() || undef } # non-list context: one item, undef when next_seq() is false | ||||
| 732 | my @seqs; # list context: collect until next_seq() returns a false value | ||||
| 733 | while ( my $seq = $stream->next_seq() ) { push @seqs, $seq } | ||||
| 734 | return @seqs; | ||||
| 735 | } | ||||
| 736 | |||||
| 737 | |||||
| 738 | sub PRINT { | ||||
| 739 | my $writer = shift->{'seqio'}; # the object stored by TIEHANDLE; what is left in @_ is what was printed | ||||
| 740 | return $writer->write_seq(@_); | ||||
| 741 | } | ||||
| 742 | |||||
| 743 | |||||
| 744 | =head2 sequence_factory | ||||
| 745 | |||||
| 746 | Title : sequence_factory | ||||
| 747 | Usage : $seqio->sequence_factory($seqfactory) | ||||
| 748 | Function: Get/Set the Bio::Factory::SequenceFactoryI | ||||
| 749 | Returns : Bio::Factory::SequenceFactoryI | ||||
| 750 | Args : [optional] Bio::Factory::SequenceFactoryI | ||||
| 751 | |||||
| 752 | =cut | ||||
| 753 | |||||
| 754 | sub sequence_factory { | ||||
| 755 | my ($self, $factory) = @_; | ||||
| 756 | # Getter form: no defined argument, so just report the stored factory. | ||||
| 757 | return $self->{'_seqio_seqfactory'} unless defined $factory; | ||||
| 758 | unless ( ref($factory) && $factory->isa('Bio::Factory::SequenceFactoryI') ) { | ||||
| 759 | $self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()"); | ||||
| 760 | } | ||||
| 761 | $self->{'_seqio_seqfactory'} = $factory; | ||||
| 762 | # Hand the factory to the sequence builder too, but only when it can take one and has none yet. | ||||
| 763 | my $builder = $self->sequence_builder(); | ||||
| 764 | if ( $builder && $builder->can('sequence_factory') && !$builder->sequence_factory() ) { | ||||
| 765 | $builder->sequence_factory($factory); | ||||
| 766 | } | ||||
| 767 | return $self->{'_seqio_seqfactory'}; | ||||
| 768 | } | ||||
| 769 | |||||
| 770 | |||||
| 771 | =head2 object_factory | ||||
| 772 | |||||
| 773 | Title : object_factory | ||||
| 774 | Usage : $obj->object_factory($newval) | ||||
| 775 | Function: This is an alias to sequence_factory with a more generic name. | ||||
| 776 | Example : | ||||
| 777 | Returns : value of object_factory (a scalar) | ||||
| 778 | Args : on set, new value (a scalar or undef, optional) | ||||
| 779 | |||||
| 780 | =cut | ||||
| 781 | |||||
| 782 | sub object_factory{ | ||||
| 783 | my $self = shift; return $self->sequence_factory(@_); # plain alias: same arguments, same return value | ||||
| 784 | } | ||||
| 785 | |||||
| 786 | |||||
| 787 | =head2 sequence_builder | ||||
| 788 | |||||
| 789 | Title : sequence_builder | ||||
| 790 | Usage : $seqio->sequence_builder($seqbuilder) | ||||
| 791 | Function: Get/Set the Bio::Factory::ObjectBuilderI used to build sequence | ||||
| 792 | objects. This applies to rich sequence formats only, e.g. genbank | ||||
| 793 | but not fasta. | ||||
| 794 | |||||
| 795 | If you do not set the sequence object builder yourself, it | ||||
| 796 | will in fact be an instance of L<Bio::Seq::SeqBuilder>, and | ||||
| 797 | you may use all methods documented there to configure it. | ||||
| 798 | |||||
| 799 | Returns : a Bio::Factory::ObjectBuilderI compliant object | ||||
| 800 | Args : [optional] a Bio::Factory::ObjectBuilderI compliant object | ||||
| 801 | |||||
| 802 | =cut | ||||
| 803 | |||||
| 804 | sub sequence_builder { | ||||
| 805 | my ($self, $builder) = @_; | ||||
| 806 | if ( defined $builder ) { | ||||
| 807 | # Setter form: anything that is not a Bio::Factory::ObjectBuilderI object is rejected. | ||||
| 808 | my $acceptable = ref($builder) && $builder->isa('Bio::Factory::ObjectBuilderI'); | ||||
| 809 | $self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()") unless $acceptable; | ||||
| 810 | $self->{'_object_builder'} = $builder; | ||||
| 811 | } | ||||
| 812 | return $self->{'_object_builder'}; | ||||
| 813 | } | ||||
| 814 | |||||
| 815 | |||||
| 816 | =head2 location_factory | ||||
| 817 | |||||
| 818 | Title : location_factory | ||||
| 819 | Usage : $seqio->location_factory($locfactory) | ||||
| 820 | Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for | ||||
| 821 | location string parsing | ||||
| 822 | Returns : a Bio::Factory::LocationFactoryI implementing object | ||||
| 823 | Args : [optional] on set, a Bio::Factory::LocationFactoryI implementing | ||||
| 824 | object. | ||||
| 825 | |||||
| 826 | =cut | ||||
| 827 | |||||
| 828 | sub location_factory { | ||||
| 829 | my ($self, $factory) = @_; | ||||
| 830 | # Getter form: no defined argument, so just report the stored factory. | ||||
| 831 | return $self->{'_seqio_locfactory'} unless defined $factory; | ||||
| 832 | unless ( ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI') ) { | ||||
| 833 | $self->throw("Must provide a valid Bio::Factory::LocationFactoryI" . | ||||
| 834 | " object to ".ref($self)."->location_factory()"); | ||||
| 835 | } | ||||
| 836 | $self->{'_seqio_locfactory'} = $factory; | ||||
| 837 | return $self->{'_seqio_locfactory'}; | ||||
| 838 | } | ||||
| 839 | |||||
| 840 | 1 | 4µs | 1; | ||
| 841 |