#!/usr/bin/perl
# aux2index.pl 0.9
# $Id: $

=head1 NAME

aux2index.pl - reformat DocBook index files generated by collateindex.pl

=head1 SYNOPSIS

B<aux2index.pl> [F<auxfile>] [F<indexfile>] 

=head1 DESCRIPTION

B<aux2index.pl> is a Perl script that uses the .aux file generated by the final run of B<pdfjadetex> to insert "hard-coded" page numbers into a DocBook SGML index, and to format those page numbers in a pretty format.

=cut

$usage = "usage: $0 auxfile indexfile
Do 'perldoc $0' for documentation.\n";

=head1 OPTIONS

=over 5

=item F<auxfile>

The .aux file generated by pdfjadetex from the source SGML document.

=item F<indexfile>

The index file generated by B<collateindex.pl> from the source SGML document.

=back

=cut

die $usage unless (defined $ARGV[1]);
use Roman;

# Variables
$highestroman = 54; # The highest page number expressed in roman numerals
$maxcompare = 800; # If page isn't found, compare this many surrounding elements
$fudge = 0; # If page numbers are still consistently wrong, add this value
($aux,$file) = (@ARGV[0],@ARGV[1]);

open (AUX, "<$aux");
while (<AUX>) {
  if (/^\\pagelabel{(\d+)}{([ivxlcdm|\d]+)}/) {
    ($label,$page) = ($1,$2);
    $label--;
    $pagelabel{$label} = $page;
  }
}
close AUX;
open (FILE, "<$file");
while (<FILE>) {
  undef $error;
  if ( /role="(AEN)*(\S+?)"/ ) {
    $label = $2;
    if ( $pagelabel{$label} ) {
        $pagelabel{$label} += $fudge;
    } else {
      # Sometimes the page has no label in Thesis.aux (why?) - pick the nearest
      while ( $i <= $maxcompare ) {
        $i++;
        unless ( $lowerpagelabel ) {
          if ( $pagelabel{$label-$i} ) { $lowerpagelabel = $label-$i }
        };
        unless ( $higherpagelabel ) {
          if ( $pagelabel{$label+$i} ) { $higherpagelabel = $label+$i }
        }
      };
      if ( isroman($pagelabel{$lowerpagelabel}) or isroman($pagelabel{$higherpagelabel}) ) {
        undef $complower;
        undef $comphigher;
        if ( isroman($pagelabel{$lowerpagelabel}) ) { 
          $complower = arabic($pagelabel{$lowerpagelabel})
        } else {
          $complower += $highestroman;
        };
        if ( isroman($pagelabel{$higherpagelabel}) ) { 
          $comphigher = arabic($pagelabel{$higherpagelabel})
        } else {
          $comphigher += $highestroman;
        };
        $page = int((($complower+$comphigher)/2)+.5);
        $page += $fudge;
        if ( $page > $highestroman ) {
          $page -= $highestroman
        } else {
          $page = roman($page)
        };
      } else {
        $page = int((($pagelabel{$lowerpagelabel}+$pagelabel{$higherpagelabel})/2)+.5);
        $page += $fudge;
      };
      $pagelabel{$label} = $page;
      $error++ unless ( $page );
      undef $lowerpagelabel;
      undef $higherpagelabel;
      undef $i;
    }
  }
  if ( $error ) {
    warn "DEBUG: Error, no page for or near $label in $aux.\n";
  } else {
    s/<ulink.*<\/ulink>/<phrase role="pageno">$pagelabel{$label}<\/phrase>/;
    $index .= $_;
  };
}
while ($index =~ /^(.*?)((<phrase role="pageno">.*?<\/phrase>[,\s]*)+)/s) {
    $cindex .= $1;
    $_ = $2;
    $index = $';
    @pages = m/<phrase role="pageno">.*?<\/phrase>/sg;

    # Expand ranges
    if ($#pages >= 0) {
	my @mpages = ();
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) {
		for (my $count = $1; $count <= $2; $count++) {
		    push (@mpages, "<phrase role=\"$pageno\">$count</phrase>,");
		}
	    } else {
		push (@mpages, $page);
	    }
	}
	@pages = sort rangesort @mpages;
    }

    # Remove duplicates...
    if ($#pages > 0) {
	my @mpages = ();
	my $current = "";
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno ne $current) {
		push (@mpages, $page);
		$current = $pageno;
	    }
	}
	@pages = @mpages;
    }

    # Collapse ranges...
    if ($#pages > 1) {
	my @cpages = ();
	while (@pages) {
	    my $count = 0;
	    my $len = &rangelen($count, @pages);
	    if ($len <= 2) {
		my $page = shift @pages;
		push (@cpages, $page);
	    } else {
		my $fpage = shift @pages;
		my $lpage = "";
		while ($len > 1) {
		    $lpage = shift @pages;
		    $len--;
		}
		my $fpno = &pageno($fpage);
		my $lpno = &pageno($lpage);
		$fpage =~ s/>$fpno</>${fpno}&ndash;$lpno</s;
		push (@cpages, $fpage);
	    }
	}
	@pages = @cpages;
    }

    my $page = shift @pages;
    $page =~ s/\s*$//s;
    $cindex .= $page;
    while (@pages) {
	$page = shift @pages;
	$page =~ s/\s*$//s;
	$cindex .= ", $page";
    }
}
$cindex .= $index;

print "$cindex\n";

sub pageno {
    my $page = shift;

    $page =~ s/<phrase.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

sub rangesort {
    my $apno = &pageno($a);
    my $bpno = &pageno($b);

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
    return  1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
    return $apno <=> $bpno;
}

sub rangelen {
    my $count = shift;
    my @pages = @_;
    my $len = 1;
    my $inrange = 1;

    my $current = &pageno($pages[$count]);
    while ($count < $#pages && $inrange) {
	$count++;
	my $next = &pageno($pages[$count]);
	if ($current + 1 eq $next) {
	    $current = $next;
	    $inrange = 1;
	    $len++;
	} else {
	    $inrange = 0;
	}
    }

    return $len;
}

=head1 EXAMPLE

B<aux2index.pl> F<mydocument.aux> F<index.sgml>

=head1 USAGE

Run B<aux2index.pl> after you have generated your PDF output using B<pdfjadetex> or similar, but before you have deleted the .aux file that it has left behind.  Then, after checking that the output is correct, rerun B<openjade> and B<pdfjadetex> just as you did before to include the new index. 

=head1 BUGS

The original SGML file was in a suitable format for the page numbers to be hyperlinked to the text.  When prettified, hyperlinking is no longer possible.

=head1 AUTHOR

Jeremy Malcolm E<lt>Jeremy@Malcolm.id.auE<gt>, borrowing code heavily from B<pdf2index> from the XSL project.

=cut

