#!/usr/bin/env perl
use v5.14.1;

use PICA::Data;
use PICA::Schema;
use PICA::Schema::Builder;
use Getopt::Long qw(:config bundling);
use Pod::Usage;
use HTTP::Tiny;
use JSON::PP;
use List::Util 'sum';

# Record limit (0 = no limit). A bare "-N" argument such as -1 or -20 is
# accepted as a shortcut: the first such argument is taken out of @ARGV
# before Getopt::Long sees it, and its digits become the limit.
my $number = 0;
for my $idx (0 .. $#ARGV) {
    next unless $ARGV[$idx] =~ /^-(\d+)$/;
    my ($shortcut) = splice @ARGV, $idx, 1;
    $number = -$shortcut;    # "-5" negated gives 5
    last;
}

# Command line options. With bundling enabled, Getopt::Long treats
# single-character options case-sensitively, so -b and -B can be told apart:
# -B is registered as its own option and switches on both schema building
# and abbreviation. Unlike a literal scan of @ARGV for '-B', this also works
# when -B is bundled with other short options (e.g. -cB) and is not fooled by
# an option value or file name that happens to be '-B'.
my $abbrev = 0;
my $build;
my @pathes;

GetOptions(
    'from|f=s'   => \(my $from),
    'to|t:s'     => \(my $to),
    'schema|s=s' => \(my $schema),
    'build|b'    => \$build,
    'B'          => sub {$build = $abbrev = 1},
    'unknown|u!' => \(my $report_unknown),
    'count|c'    => \(my $count),
    'order|o'    => \(my $order),
    'path|p=s'   => \@pathes,
    "n|number:i" => \$number,
    'color|C'    => \(my $color),
    'mono|M'     => \(my $nocolor),
    'help|h|?'   => \(my $help),
    'version|V'  => \(my $version)
) or pod2usage(2);

# --version: print the version of PICA::Data and stop.
if ($version) {
    say $PICA::Data::VERSION;
    exit;
}

# Show the usage sections when help was requested, or when there is nothing
# to work on: no arguments left and STDIN is a terminal.
if ($help || (!@ARGV && -t STDIN)) {
    pod2usage(-verbose => 99, -sections => "SYNOPSIS|OPTIONS|DESCRIPTION");
}

# Maps each accepted serialization name or file extension to the class name
# suffix appended to "PICA::Parser::" and "PICA::Writer::" further below.
my %types = (
    (map {($_ => 'Binary')} qw(bin dat binary extpp ext)),
    (map {($_ => 'Plain')} qw(plain pp)),
    (map {($_ => 'Plus')} qw(plus norm normpp)),
    (map {($_ => 'JSON')} qw(json ndjson)),
    xml    => 'XML',
    ppxml  => 'PPXML',
    fields => 'Fields',
);

# Input defaults to '-' unless a file argument replaces it later on.
my $input = '-';

# Without --path, leading arguments that look like PICA Path expressions
# (optionally several of them joined by "|") are consumed as paths.
unless (@pathes) {
    my $pattern   = '[012.][0-9.][0-9.][A-Z@.](\$[^|]+)?';
    my $path_list = qr/^$pattern(\s*\|\s*($pattern)?)*$/;
    push @pathes, shift @ARGV while @ARGV && $ARGV[0] =~ $path_list;
}

# Split "|"-separated lists, skip empty parts and turn every remaining
# expression into a PICA::Path object; stop on the first invalid one.
my $sfpath = 0;
if (@pathes) {
    my @compiled;
    for my $expr (map {split /\s*\|\s*/, $_} @pathes) {
        next if $expr eq "";
        my $path = eval {PICA::Path->new($expr)}
            or die "invalid pica path: $expr\n";
        push @compiled, $path;
    }
    @pathes = @compiled;
}

# Count the paths that select subfields. Either none or all of them may do
# so; mixing field and subfield paths is an error.
$sfpath = sum map {length($_->subfields) > 0} @pathes;
if ($sfpath && $sfpath ne @pathes) {
    die "path expressions must either all select fields or select subfields!\n";
}

# The next remaining argument (if any) is the input file.
$input = shift @ARGV if @ARGV;

# Without --from, guess the serialization type from the file extension.
# The match is case-insensitive so that e.g. "data.XML" is recognized too.
if (!$from && $input =~ /\.([a-z]+)$/i && $types{lc $1}) {
    $from = $1;
}

# Fall back to PICA plain and reject unknown types, quoting the type as the
# user spelled it.
$from ||= 'plain';
pod2usage("unknown serialization type: $from") unless $types{lc $from};

# Type names are matched case-insensitively. Normalize here because the
# parser class is later looked up with $from as the hash key; without this,
# "--from XML" would pass the check above and then find no class.
$from = lc $from;

# Load the Avram schema given with --schema, either via HTTP(S) or from a
# local file, and replace the option value with a PICA::Schema object.
if ($schema) {
    my $json;
    if ($schema =~ qr{^https?://}) {
        my $res = HTTP::Tiny->new->get($schema);
        die "HTTP request failed: $schema\n" unless $res->{success};
        $json = $res->{content};
    } else {
        # Fail with a clear message instead of passing undef on to the JSON
        # decoder when the file cannot be read.
        open my $fh, '<', $schema
            or die "Failed to open schema file $schema: $!\n";
        local $/;    # slurp the whole file unchanged
        $json = <$fh>;
        close $fh;
    }
    $schema = PICA::Schema->new(decode_json($json));
}

# Write records in the input format by default, unless another kind of
# output (count, validation, schema building, subfield values) was requested.
$to = $from unless ($to or $count or $schema or $build or $sfpath);
pod2usage("unknown serialization type: $to") unless !$to or $types{lc $to};

# Type names are matched case-insensitively. Normalize $to because it is
# later used as a key into %types and compared with 'fields'.
$to = lc $to if $to;

# --build: replace the flag with the builder that collects all records.
$build = PICA::Schema::Builder->new if $build;

# '-' stands for standard input. The parser class is looked up with the
# lower-cased type name, matching the validation of $from.
$input = *STDIN if $input eq '-';
my $parser_class = 'PICA::Parser::' . $types{lc $from};
my $parser = $parser_class->new($input, bless => 1);

my $writer;

# Colors are used when requested with --color or when STDOUT is a terminal,
# but never with --mono. $color is turned from a flag into the color map
# handed to the writer (or undef for monochrome output).
if (!$nocolor && ($color || -t *STDOUT)) {
    $color
        = {tag => 'blue', occurrence => 'blue', code => 'red', value => 'green'};
} else {
    $color = undef;
}

binmode *STDOUT, ':encoding(UTF-8)';

# A writer is only created when an output format was selected.
if ($to) {
    my $writer_class = "PICA::Writer::" . $types{$to};
    $writer = $writer_class->new(color => $color, schema => $schema);
}

# Validation options and counters; $stats and $invalid are reported after
# the loop.
my %schema_options = (ignore_unknown => !$report_unknown);
my $stats = {records => 0, holdings => 0, items => 0, fields => 0};
my $invalid = 0;

RECORD: while (my $record = $parser->next) {

    # Subfield paths: print every matched value on a line of its own.
    if ($sfpath) {
        for my $path (@pathes) {
            my $values = $record->match($path, split => 1) // [];
            say $_ for @$values;
        }
    }

    # Reduce the record to the selected fields and sort it if requested.
    if (@pathes) {
        $record = $record->fields(@pathes);
    }
    if ($order) {
        $record = $record->sort;
    }

    if ($writer) {
        $writer->write($record);
    }

    # Validate against the schema (skipped for "fields" output) and print
    # each error, prefixed with the record's _id when it has one.
    if ($schema && $to ne 'fields') {
        my @errors = $schema->check($record, %schema_options);
        for my $error (@errors) {
            my $id = $record->{_id};
            print defined $id ? "$id: $error\n" : "$error\n";
        }
        $invalid++ if @errors;
    }

    if ($build) {
        $build->add($record);
    }

    if ($count) {
        $stats->{holdings} += scalar @{$record->holdings};
        $stats->{items}    += scalar @{$record->items};
        $stats->{fields}   += scalar @{$record->{record}};
    }

    # Stop once the requested number of records has been processed.
    ++$stats->{records};
    last RECORD if $number && $stats->{records} >= $number;
}

# Let the writer finish its output, if a writer was created.
if ($writer) {
    $writer->end();
}

# --count: print the collected numbers, one "<number> <name>" line each.
if ($count) {
    $stats->{invalid} = $invalid if defined $invalid;
    for my $key (qw(records invalid holdings items fields)) {
        next unless defined $stats->{$key};
        print "$stats->{$key} $key\n";
    }
}

# --build: print the schema collected by the builder as indented, canonical
# JSON; the abbreviated form is used when $abbrev is set.
if ($build) {
    my $encoder
        = JSON::PP->new->indent->space_after->canonical->convert_blessed;
    my $built = $build->schema;
    print $encoder->encode($abbrev ? $built->abbreviated : $built);
}

__END__

=head1 NAME

picadata - parse and validate PICA+ data

=head1 SYNOPSIS

picadata [--from TYPE] [--schema FILE] [--to TYPE] {OPTIONS} [FILE]

=head1 DESCRIPTION

Parse, validate and/or serialize PICA+ data from the command line, e.g.:

  picadata pica.dat -t xml           # convert binary to XML
  picadata -c -f plain < pica.plain  # parse and count records
  picadata -p 003@ pica.xml -t       # extract field 003@
  picadata pica.xml -s schema.json   # validate against Avram schema

  # document fields used in a record
  picadata -t fields pica.xml -s https://format.k10plus.de/avram.pl?profile=k10plus

=head1 OPTIONS

=head2 --from, -f

PICA serialization type (plain, plus, binary, XML, ppxml) with plain as
default. Guessed from input filename unless specified. See format
documentation at L<http://format.gbv.de/pica>

=head2 --to, -t

PICA serialization type to enable writing parsed PICA data. Use C<fields> to
list all fields in the data (with optional schema information).

=head2 --number, -n

Stop parsing after C<n> records. Can be abbreviated as C<-1>, C<-2>...

=head2 --order, -o

Sort record fields by field identifier or by occurrence for level 2.

=head2 --count, -c

Count number of records, holdings, items, and fields.

=head2 --path, -p

Select fields or subfield values specified by PICA Path expressions. Multiple
expressions can be separated by C<|> or by repeating the option. 

=head2 --schema, -s

L<Avram Schema|http://format.gbv.de/schema/avram/specification> to validate
against. Can be a file or a URL.

=head2 --unknown, -u

Report unknown fields and subfields on validation (disabled by default).

=head2 --build, -b, -B

Build an Avram schema from records. Option C<-B> abbreviates the schema.

=head2 --color, -C

Colorize output. Only supported for PICA plain and PICA plus format.

=head2 --mono, -M

Monochrome (don't colorize output).

=head2 --version, -V

Print version number and exit.

=head1 SEE ALSO

See L<catmandu> for a more elaborate command line tool for data processing
(transformation, API access...), including PICA+ with L<Catmandu::PICA>.

=cut
