#!/pro/bin/perl

package Spreadsheet::Read;

=head1 NAME

Spreadsheet::Read - Read the data from a spreadsheet

=head1 SYNOPSIS

 use Spreadsheet::Read;
 my $csv = ReadData ("test.csv", sep => ";");
 my $sxc = ReadData ("test.sxc");
 my $xls = ReadData ("test.xls");

=cut

use strict;
use warnings;

our $VERSION = "0.04";

# Version () returns the version string of this module ($VERSION).
sub Version
{
    return $VERSION;
    } # Version

use Exporter;
our @ISA     = qw( Exporter );
our @EXPORT  = qw( ReadData cell2cr cr2cell );

use File::Temp           qw( );
use Spreadsheet::ReadSXC qw( read_sxc read_xml_file read_xml_string );
use Spreadsheet::ParseExcel;
use Text::CSV_XS;

my  $debug = 0;

# Helper functions

# cr2cell (col, row) => cell label
#
# Turns a 1-based (column, row) pair into the traditional spreadsheet
# notation: the column becomes letters (1 => "A", 26 => "Z", 27 => "AA")
# and the row is appended as is.
#   cr2cell (4, 18) => "D18"
sub cr2cell ($$)
{
    my ($col, $row) = @_;
    my @letters;
    {	use integer;	# integer modulus and division, as bijective base 26

	while ($col) {
	    # Shift to 0-based so that 26 maps to "Z" instead of rolling over
	    $col--;
	    unshift @letters, chr ($col % 26 + ord "A");
	    $col /= 26;
	    }
	}
    my $name = join "", @letters;
    return "$name$row";
    } # cr2cell

# cell2cr (label) => (col, row)
#
# Turns a traditional cell label (letters followed by digits, matched
# case insensitively) into a 1-based (column, row) pair.
# Returns (0, 0) if the label does not have that form.
#   cell2cr ("D18") => (4, 18)
sub cell2cr ($)
{
    my $label = uc $_[0];
    $label =~ m/^([A-Z]+)(\d+)$/ or return (0, 0);
    my ($letters, $row) = ($1, $2);

    # Each letter is one digit (A = 1 .. Z = 26), most significant first
    my $col = 0;
    foreach my $letter (split m//, $letters) {
	$col = 26 * $col + 1 + ord ($letter) - ord ("A");
	}
    return ($col, $row);
    } # cell2cr

# ReadData ($source [, option => value, ...])
# ReadData ($source, { option => value, ... })
#
# Converts a spreadsheet into a generic structure: a reference to an
# array whose entry 0 is a control hash (type, sheets, sheet) and whose
# following entries are one hash per sheet (label, maxrow, maxcol, cell
# and/or the "A1" style keys).
#
# $source is a file name (.csv, .xls, .sc, .sxc, .xml) or the content
# itself (Excel, SquirrelCalc or XML). CSV is only read from file, and
# references (streams) are not supported yet.
#
# Options:
#   rc    => bool   fill $sheet->{cell}[col][row]    (default true)
#   cell  => bool   fill the $sheet->{A1} style keys (default true)
#   sep   => char   CSV separator; guessed from the first line if unset
#   quote => char   CSV quote character              (default '"')
#
# Returns nothing if $source is false, is a reference, cannot be opened
# or parsed, or is not recognized as one of the supported formats.
sub ReadData ($;@)
{
    my $txt = shift	or  return;
    ref $txt		and return;	# TODO: support IO stream

    my $tmpfile;

    # Options come as a flat list or as a single hash reference. The hash
    # reference has to be dereferenced as a hash, not as an array.
    my %opt = @_ && ref ($_[0]) eq "HASH" ? %{shift @_} : @_;
    defined $opt{rc}	or $opt{rc}	= 1;
    defined $opt{cell}	or $opt{cell}	= 1;

    # CSV not supported from streams
    if ($txt =~ m/\.(csv)$/i and -f $txt) {
	# 3-arg open: the file name can never be taken as a mode or a pipe
	open my $in, "<", $txt or return;
	my $csv;
	my @data = (
	    {	type   => "csv",
		sheets => 1,
		sheet  => { $txt => 1 },
		},
	    {	label	=> $txt,
		maxrow	=> 0,
		maxcol	=> 0,
		cell	=> [],
		},
	    );
	# Read into a lexical, so the caller's $_ is left alone
	while (defined (my $line = <$in>)) {
	    unless ($csv) {
		# The parser is created on the first line, as that line is
		# used to guess the separator
		my $quo = defined $opt{quote} ? $opt{quote} : '"';
		my $sep = # If explicitly set, use it
		   defined $opt{sep} ? $opt{sep} :
		       # otherwise start auto-detect with quoted strings
		       $line =~ m/["\d];["\d;]/  ? ";"  :
		       $line =~ m/["\d],["\d,]/  ? ","  :
		       $line =~ m/["\d]\t["\d,]/ ? "\t" :
		       # If neither, then for unquoted strings
		       $line =~ m/\w;[\w;]/      ? ";"  :
		       $line =~ m/\w,[\w,]/      ? ","  :
		       $line =~ m/\w\t[\w,]/     ? "\t" :
						   ","  ;
		$csv = Text::CSV_XS->new ({
		    sep_char   => $sep,
		    quote_char => $quo,
		    binary     => 1,
		    });
		}
	    $csv->parse ($line);
	    my @row = $csv->fields () or next;
	    # Every line is a row; the widest row sets the column count
	    my $r = ++$data[1]{maxrow};
	    @row > $data[1]{maxcol} and $data[1]{maxcol} = @row;
	    foreach my $c (0 .. $#row) {
		my $val = $row[$c];
		my $cell = cr2cell ($c + 1, $r);
		$opt{rc}   and $data[1]{cell}[$c + 1][$r] = $val;
		$opt{cell} and $data[1]{$cell} = $val;
		}
	    }
	close $in;
	# Unused columns (including index 0) become empty lists
	for (@{$data[1]{cell}}) {
	    defined $_ or $_ = [];
	    }
	return [ @data ];
	}

    # From /etc/magic: Microsoft Office Document
    # Excel content is written to a temporary file, as the parser is
    # called with a file name
    if ($txt =~ m/^(\376\067\0\043
		   |\320\317\021\340\241\261\032\341
		   |\333\245-\0\0\0)/x) {
	$tmpfile = File::Temp->new (SUFFIX => ".xls", UNLINK => 1);
	binmode $tmpfile;
	print {$tmpfile} $txt;
	# Close to flush the buffer before the file is reopened by name.
	# The object stays in scope, so the file lives until we return.
	close $tmpfile			or  return;
	$txt = "$tmpfile";
	}
    if ($txt =~ m/\.xls$/i and -f $txt) {
	$debug and print STDERR "Opening XLS $txt\n";
	my $oBook = Spreadsheet::ParseExcel::Workbook->Parse ($txt) or return;
	my @data = ( {
	    type   => "xls",
	    sheets => $oBook->{SheetCount},
	    sheet  => {},
	    } );
	$debug and print STDERR "\t$data[0]{sheets} sheets\n";
	foreach my $oWkS (@{$oBook->{Worksheet}}) {
	    my %sheet = (
		label	=> $oWkS->{Name},
		maxrow	=> 0,
		maxcol	=> 0,
		cell	=> [],
		);
	    # The parser counts from 0, the returned structure from 1
	    exists $oWkS->{MaxRow} and $sheet{maxrow} = $oWkS->{MaxRow} + 1;
	    exists $oWkS->{MaxCol} and $sheet{maxcol} = $oWkS->{MaxCol} + 1;
	    my $sheet_idx = 1 + @data;
	    $debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} x $sheet{maxcol}\n";
	    if (exists $oWkS->{MinRow}) {
		foreach my $r ($oWkS->{MinRow} .. $sheet{maxrow}) {
		    foreach my $c ($oWkS->{MinCol} .. $sheet{maxcol}) {
			my $oWkC = $oWkS->{Cells}[$r][$c] or next;
			# Test on defined: a cell holding 0 is data too
			defined (my $val = $oWkC->{Val}) or next;
			my $cell = cr2cell ($c + 1, $r + 1);
			$opt{rc}   and $sheet{cell}[$c + 1][$r + 1] = $val;	# Original
			$opt{cell} and $sheet{$cell} = $oWkC->Value;	# Formatted
			}
		    }
		}
	    for (@{$sheet{cell}}) {
		defined $_ or $_ = [];
		}
	    push @data, { %sheet };
#	    $data[0]{sheets}++;
	    $data[0]{sheet}{$sheet{label}} = $#data;
	    }
	return [ @data ];
	}

    if ($txt =~ m/^# .*SquirrelCalc/ or $txt =~ m/\.sc$/ && -f $txt) {
	# need to test on pattern to prevent stat warning
	# on content (which has newlines) being used as filename
	if ($txt !~ m/^# .*SquirrelCalc/ and -f $txt) {
	    local $/;
	    open my $sc, "<", $txt or return;
	    $txt = <$sc>;
	    close $sc;
	    $txt =~ m/\S/ or return;
	    }
	my @data = (
	    {	type   => "sc",
		sheets => 1,
		sheet  => { sheet => 1 },
		},
	    {	label	=> "sheet",
		maxrow	=> 0,
		maxcol	=> 0,
		cell	=> [],
		},
	    );

	for (split m/\s*[\r\n]\s*/, $txt) {
	    if (m/^dimension.*of (\d+) rows.*of (\d+) columns/i) {
		@{$data[1]}{qw(maxrow maxcol)} = ($1, $2);
		next;
		}
	    # Cell lines look like "r<row>c<col> = ..." and count from 0
	    s/^r(\d+)c(\d+)\s*=\s*// or next;
	    my ($c, $r) = map { $_ + 1 } $2, $1;
	    # The value is either in trailing braces or in double quotes.
	    # The braces are escaped, as a bare "{" is not accepted as a
	    # literal by newer perls.
	    if (m/.* \{(.*)\}$/ or m/"(.*)"/) {
		my $cell = cr2cell ($c, $r);
		$opt{rc}   and $data[1]{cell}[$c][$r] = $1;
		$opt{cell} and $data[1]{$cell} = $1;
		next;
		}
	    # Now only formula's remain. Ignore for now
	    # r67c7 = [P2L] 2*(1000*r67c5-60)
	    }
	for (@{$data[1]{cell}}) {
	    defined $_ or $_ = [];
	    }
	return [ @data ];
	}

    if ($txt =~ m/^<\?xml/ or -f $txt) {
	my $sxc;
	if ($txt =~ m/\.sxc$/i) {
	    $debug and print STDERR "Opening SXC $txt\n";
	    $sxc = read_sxc ($txt)		or  return;
	    }
	elsif ($txt =~ m/\.xml$/i) {
	    $debug and print STDERR "Opening XML $txt\n";
	    $sxc = read_xml_file ($txt)	or  return;
	    }
	# need to test on pattern to prevent stat warning
	# on filename with newline
	elsif ($txt !~ m/^<\?xml/i and -f $txt) {
	    $debug and print STDERR "Opening XML $txt\n";
	    open my $f, "<", $txt		or  return;
	    local $/;
	    $txt = <$f>;
	    close $f;
	    }
	!$sxc && $txt =~ m/^<\?xml/i and $sxc = read_xml_string ($txt);
	if ($sxc) {
	    my @data = ( {
		type   => "sxc",
		sheets => 0,
		sheet  => {},
		} );
	    # The sheets come as a hash, so their original order is lost.
	    # Sort the names to at least return them in a stable order.
	    foreach my $sheet (sort keys %$sxc) {
		my @sheet = @{$sxc->{$sheet}};
		my %sheet = (
		    label	=> $sheet,
		    maxrow	=> scalar @sheet,
		    maxcol	=> 0,
		    cell	=> [],
		    );
		my $sheet_idx = 1 + @data;
		$debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} rows\n";
		foreach my $r (0 .. $#sheet) {
		    my @row = @{$sheet[$r] || []} or next;
		    foreach my $c (0 .. $#row) {
			# Test on defined: a cell holding 0 is data too
			defined (my $val = $row[$c]) or next;
			my $C = $c + 1;
			$C > $sheet{maxcol} and $sheet{maxcol} = $C;
			my $cell = cr2cell ($C, $r + 1);
			$opt{rc}   and $sheet{cell}[$C][$r + 1] = $val;
			$opt{cell} and $sheet{$cell} = $val;
			}
		    }
		for (@{$sheet{cell}}) {
		    defined $_ or $_ = [];
		    }
		$debug and print STDERR "\tSheet $sheet_idx '$sheet{label}' $sheet{maxrow} x $sheet{maxcol}\n";
		push @data, { %sheet };
		$data[0]{sheets}++;
		$data[0]{sheet}{$sheet} = $#data;
		}
	    return [ @data ];
	    }
	}

    return;
    } # ReadData

1;

=head1 DESCRIPTION

Spreadsheet::Read tries to transparently read *any* spreadsheet and
return its content in a universal manner independent of the parsing
module that does the actual spreadsheet scanning.

For OpenOffice this module uses Spreadsheet::ReadSXC

For Excel this module uses Spreadsheet::ParseExcel

For CSV this module uses Text::CSV_XS

=head2 Data structure

The data is returned as an array reference:

  $ref = [
 	# Entry 0 is the overall control hash
 	{ sheets => 2,
	  sheet  => {
	    "Sheet 1"	=> 1,
	    "Sheet 2"	=> 2,
	    },
	  type   => "xls",
	  },
 	# Entry 1 is the first sheet
 	{ label  => "Sheet 1",
 	  maxrow => 4,
 	  maxcol => 2,
 	  cell   => [ undef,
	    [ undef, 1 ],
	    [ undef, undef, undef, undef, "Nugget" ],
	    ],
 	  A1     => 1,
 	  B4     => "Nugget",
 	  },
 	# Entry 2 is the second sheet
 	{ label => "Sheet 2",
 	  :
 	:

To stay close to what spreadsheet users are used to, row and column 1
have index 1 too in the C<cell> element of the sheet hash, so cell "A1"
is the same as C<< $sheet->{cell}[1][1] >> (column first). To switch
between the two notations, there are two helper functions available:
C<cell2cr ()> and C<cr2cell ()>.

The C<cell> hash entry contains unformatted data, while the hash entries
with the traditional labels contain the formatted values (if applicable).

The control hash (the first entry in the returned array ref), contains
some spreadsheet metadata. The entry C<sheet> is there to be able to find
the sheets when accessing them by name:

  my %sheet2 = %{$ref->[$ref->[0]{sheet}{"Sheet 2"}]};

=head2 Functions

=over 2

=item C<< my $ref = ReadData ($source [, option => value [, ... ]]); >>

=item C<< my $ref = ReadData ("file.csv", sep => ',', quote => '"'); >>

=item C<my $ref = ReadData ("file.xls");>

=item C<my $ref = ReadData ("file.sxc");>

=item C<my $ref = ReadData ("content.xml");>

=item C<my $ref = ReadData ($content);>

Tries to convert the given file, string, or stream to the data
structure described above.

Processing data from a stream or content is supported for Excel (through a
File::Temp temporary file), or for XML (OpenOffice), but not for CSV.

Currently ReadSXC does not preserve sheet order.

Currently supported options are:

=over 2

=item cell

Control the generation of named cells ("A1" etc). Default is true.

=item rc

Control the generation of the {cell}[c][r] entries. Default is true.

=item sep

Set separator for CSV. When not set, the separator is guessed from the
first line of the file (C<;>, C<,>, or TAB), falling back to comma C<,>.

=item quote

Set quote character for CSV. Default is C<">.

=back

=item C<my $cell = cr2cell (col, row)>

C<cr2cell ()> converts a C<(column, row)> pair (1 based) to the
traditional cell notation:

  my $cell = cr2cell ( 4, 14); # $cell now "D14"
  my $cell = cr2cell (28,  4); # $cell now "AB4"

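=item C<my ($col, $row) = cell2cr ($cell)>

C<cell2cr ()> converts the traditional cell notation (case insensitive)
to a C<(column, row)> pair (1 based). It returns C<(0, 0)> if the
argument is not a valid cell label:

  my ($col, $row) = cell2cr ("D14"); # returns ( 4, 14)
  my ($col, $row) = cell2cr ("AB4"); # returns (28,  4)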

=back

=head1 TODO

=over 4

=item Cell attributes

Future plans include cell attributes, available as for example:

 	{ label  => "Sheet 1",
 	  maxrow => 2,
 	  maxcol => 4,
 	  cell   => [ undef,
	    [ undef, 1 ],
	    [ undef, undef, undef, undef, undef, "Nugget" ],
	    ],
 	  attr   => [ undef,
 	    [ undef, {
 	      color  => "Red",
 	      font   => "Arial",
 	      size   => "12",
 	      format => "## ###.##",
 	      align  => "right",
 	      }, ]
	    [ undef, undef, undef, undef, undef, {
 	      color  => "#e2e2e2",
 	      font   => "LetterGothic",
 	      size   => "15",
 	      format => undef,
 	      align  => "left",
 	      }, ]
 	  A1     => 1,
 	  B4     => "Nugget",
 	  },

=item Options

Try to transparently support as many options as the encapsulated modules
support regarding (un)formatted values, (date) formats, hidden columns,
rows, or fields etc. These could be implemented like C<attr> above but
named C<meta>, or just be new values in the C<attr> hashes.

=item Other spreadsheet formats

I consider adding any spreadsheet interface that offers a usable API.

=item Safety / flexibility

Make the different formats/modules just load if available and ignore if
not available.

=item OO-ify

Consider making the ref an object, though I currently don't see the big
advantage (yet). Maybe I'll make it so that it is a hybrid functional /
OO interface.

=back

=head1 SEE ALSO

=over 2

=item Text::CSV_XS

http://search.cpan.org/~jwied/

A pure perl version is available on http://search.cpan.org/~makamaka/

=item Spreadsheet::ParseExcel

http://search.cpan.org/~kwitknr/

=item Spreadsheet::ReadSXC

http://search.cpan.org/~terhechte/

=item Text::CSV_XS, Text::CSV

http://search.cpan.org/~jwied/
http://search.cpan.org/~alancitt/

=back

=head1 AUTHOR

H.Merijn Brand, <h.m.brand@xs4all.nl>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005-2005 H.Merijn Brand

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut
