#!/usr/bin/env perl
use strict;
use warnings;
use Git;
use Git::FastExport;
use File::Spec::Functions qw( rel2abs );

our $VERSION = '0.04';

my %repo;

# process command-line parameters
my $name = 'A';
while (@ARGV) {
    my ( $repo, $dir ) = split /:/, shift @ARGV, 2;
    $repo = rel2abs($repo);
    $dir ||= '';

    # create an export parser for each repo
    my $parser
        = Git::FastExport->new( Git->repository( Directory => $repo ) );
    $parser->fast_export(qw( --progress=1 --all --date-order ));
    $parser->{mapdir} = $dir;

    # update the %repo hash
    $repo                = $parser->{source};
    $repo{$repo}{repo}   = $repo;
    $repo{$repo}{dir}    = $dir;
    $repo{$repo}{parser} = $parser;
    $repo{$repo}{name}   = $dir || $name;
    $name++;
}

# repositories that we will process
my @repos = values %repo;

my $mark = 1_000_000;    # mark counter in the new repo
my %mark_map;            # map marks in source repos to marks in the new repo

# get the first commits
$_->{commit} = next_commit( $_->{parser} ) for @repos;

# main loop
my $last;
my %commits;
while (@repos) {

    # sort by date
    @repos = sort { $a->{commit}{date} <=> $b->{commit}{date} } @repos;
    my $repo = $repos[0];

    # next commit to dump
    my $commit = $repo->{commit};

    # update marks & dir in files
    for ( @{ $commit->{files} } ) {
        s/^M (\d+) :(\d+)/M $1 :$mark_map{$repo->{repo}}{$2}/;
        if ( my $dir = $repo->{dir} ) {
            s!^(M \d+ :\d+) (.*)!$1 $dir/$2!;    # filemodify
            s!^D (.*)!D $dir/$1!;                # filedelete

            # /!\ quotes may happen - die and fix if needed
            die "Choked on quoted paths in $repo->{repo}! Culprit:\n$_\n"
                if /^[CR] \S+ \S+ /;

            # filecopy | filerename
            s!^([CR]) (\S+) (\S+)!$1 $dir/$2 $dir/$3!;
        }
    }

    # first commit in the old repo linked to latest commit in new repo
    if ( $last && !$commit->{from} ) {
        $commit->{from} = ["from :$last"];
    }

    # update historical information
    my ($id) = $commit->{mark}[0] =~ /:(\d+)/g;
    $last = $id;    # last commit applied
    my $branch = ( split / /, $commit->{header} )[1];
    my $node = $commits{$id} = {
        name     => $id,
        repo     => $repo->{repo},
        branch   => $branch,
        children => [],
        merge    => exists $commit->{merge},
    };

    # mark our original source
    $commit->{header} =~ s/$/-$repo->{name}/;

    # this commit's parents
    my @parents = map {/:(\d+)/g} @{ $commit->{from} || [] },
        @{ $commit->{merge} || [] };

    # map each parent to its last "alien" commit
    my %parent_map = map {
        $_ => last_alien_child( $commits{$_}, $repo->{repo}, $branch )->{name}
    } @parents;

    # map parent marks
    for ( @{ $commit->{from} || [] }, @{ $commit->{merge} || [] } ) {
        if (m/^(from|merge) /) {
            s/:(\d+)/:$parent_map{$1}/g;
        }
    }

    # update the parents information
    for my $parent ( map { $parent_map{$_} } @parents ) {
        push @{ $commits{$parent}{children} }, $node->{name};
    }

    # dump the commit
    print $commit->as_string;

    # load next commit
    $repo->{commit} = next_commit( $repo->{parser} )
        or shift @repos;    # no more blocks in this export
}

# return the next commit
# - print out the intermediate blocks
# - offset the old marks
sub next_commit {
    my ($parser) = @_;
    my $block;

    while ( $block = $parser->next_block() ) {

        # map to the new mark
        for ( @{ $block->{mark} || [] } ) {
            s/:(\d+)/:$mark/
                and $mark_map{ $parser->{source} }{$1} = $mark++;
        }

        # update marks in from & merge
        for ( @{ $block->{from} || [] }, @{ $block->{merge} || [] } ) {
            if (m/^(from|merge) /) {
                s/:(\d+)/:$mark_map{$parser->{source}}{$1}/g;
            }
        }
        last if $block->{type} eq 'commit';
        print $block->as_string();
    }
    return $block;
}

# find the last child of this node
# that has either no child
# or a child in our repo
sub last_alien_child {
    my ( $node, $repo, $branch ) = @_;

    while (1) {

        # no children nodes
        return $node if ( !@{ $node->{children} } );

        # some children nodes are local
        return $node
            if grep { $commits{$_}->{repo} eq $repo } @{ $node->{children} };

        # there's a child in the same branch
        if ( my ($peer)
            = grep { $commits{$_}->{branch} eq $branch }
            @{ $node->{children} } )
        {

            # but don't go past another repo's merges
            # FIXME - unless it only includes ancestors of ours
            return $node if $commits{$peer}->{merge};
            $node = $commits{$peer};
        }

        # or pick the first child (as good as any)
        else {
            $node = $commits{ $node->{children}[0] };
        }
    }
}

__END__

=head1 NAME

git-stitch-repo - Stitch several git repositories into a git-fast-import stream

=head1 SYNOPSIS

git-stitch-repo repo1 repo2:dir2 ...

=head1 DESCRIPTION

B<git-stitch-repo> will process the output of C<git-fast-export --all
--date-order> on the git repositories given on the command-line,
and create a stream suitable for B<git-fast-import> that will create
a new repository containing all the commits in a new commit tree
that respects the history of all the source repositories.

Typical usage is like this:

    $ ls
    A  B
    $ mkdir RESULT
    $ cd RESULT
    $ git-init
    $ git-stitch-repo ../A:A ../B:B | git-fast-import

The C<RESULT> repository will contain all commits from repositories A
and B, with the files from A in subdirectory F<A/> and the files from
B in subdirectory F<B/>.

B<git-stich-repo> works perfectly with repositories that have a B<linear>
history (no merges). It has successfully been tested with 16 linear
repositories, and produced the expected result.

=head2 Example

Imagine we have two repositories A and B that we want to stitch into
a repository C so that all the files from A are in subdirectory F<A>
and all the files from B are in subdirectory F<B>.

Note: in the following ASCII art graphs, horizontal order is chronological.

Repository A:

                topic
                '    ,master
          ,----A3---A5
         /         /
    A1--A2---A4---'

Branch I<master> points to A5 and branch I<topic> points to A3.

Repository B:

                           ,topic    ,master
           ,---------B3---B5---B7---B8
          /                   /
    B1---B2---B4---B6--------'

Branch I<master> points to B8 and branch I<topic> points to B5.

The RESULT repository should preserve chronology, commit relationships and
branches as much as possible, while giving the impression that the
directories F<A/> & F<B/> did live side-by-side all the time.

Assuming additional timestamps not shown on the above graphs,
B<git-stitch-repo> will produce a B<git-fast-import> stream that will
create the following history:

                                       ,topic-A
                      ,---------------A3---B3 master-A
                     /                      \ '    ,topic-B
                    /         ,--------------A5---B5
                   /         /                     \      ,master-B
   A1---B1---A2---B2---A4---B4---B6-----------------B7---B8


=head1 BUGS & IMPROVEMENTS

Any mathematician will tell you there are many many ways to stitch two
trees together. This programs tries very hard not to create inconsistent
history with regard to each input repository.

The current implementation can (and will be) improved. I'm very interested
in test repositories that do not give the expected results.

One of the issues is that we currently refuse to stitch a node after
a merge from another repository. For the current example, that would
mean that any commit having A5 as a parent would be attached to B5,
and not to B8.

Fixing this is in the TODO list.

=head1 AUTHOR

Philippe Bruhat (BooK), C<< <book@cpan.org> >>.

=head1 ACKNOWLEDGEMENTS

The original version of this script was created as part of my work
for BOOKING.COM, which authorized its publication/distribution
under the same terms as Perl itself.

=head1 COPYRIGHT

Copyright 2008 Philippe Bruhat (BooK), All Rights Reserved.

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

