#!/usr/bin/perl
# indexcrafter.pl 0.8
# $Id: indexcrafter.pl,v 1.7 2008/03/16 07:31:15 terminus Exp $

=head1 NAME

indexcrafter.pl - interactively or non-interactively edit DocBook SGML or XML index files

=head1 SYNOPSIS

B<indexcrafter.pl> <F<indexfile>> [-m F<macrofile>] [-o F<outputfile>]

=head1 DESCRIPTION

B<indexcrafter.pl> is intended for use on DocBook SGML or XML index files that have been automatically or semi-automatically generated, for example using the SGML tool B<collateindex.pl>.  B<indexcrafter.pl> modifies such an index by allowing the addition, deletion, renaming, merging, nesting and cross-referencing of index terms.  The operations performed on the index can be saved to a macro file to be repeated non-interactively on additional input files using the same index terms.

=head1 OPTIONS

=over 5

=item F<indexfile>

The input DocBook SGML or XML index file.

=item F<macrofile>

A macro file exported from a previous run of B<indexcrafter.pl>.

=item F<outputfile>

The file to which output should be saved.  Defaults to F<indexfile>.new.

=back

=cut

# Load modules
use XML::Twig;

# Check variables
unless ($ARGV[0]) {
  print "Usage: $0 <SGML index file> [-m macro file] [-o output file]\n";
  print "For documentation type \"perldoc $0\"\n";
  exit 1
}
$infilename = $ARGV[0];
for ( $i = 1; $i < @ARGV; $i++ ) {
  if ( $ARGV[$i] eq "-m" ) {
    $scriptfile = $ARGV[$i+1];
  };
  if ( $ARGV[$i] eq "-o" ) {
    $outfilename = $ARGV[$i+1];
  }
};
unless ($outfilename) {$outfilename = "$infilename.new"};

# Read the existing SGML file (as XML)
$twig = new XML::Twig(pretty_print => 'indented');
$twig->parsefile($infilename) or die "Error: invalid index file\n";
$root = $twig->root;

# Read the macro file, if any
if ($scriptfile) {
  loadscript();
} else {
  until ( $selection =~ /^q.*/ ) {
    refresh();
    display($page);
    print "\nEnter command or H for help: ";
    $selection = <STDIN>;
    $selection =~ tr/[A-Z]/[a-z]/;
    if ( $selection =~ /^h.*/ ) {
      print "\nAvailable commands: F for Following page, P for Previous page, R to Rename an\nentry, M to Merge two entries, N to Nest one entry into another, S to add a\nSee reference to another entry, V to View an entry's references, A to Add an\nentry, D to Delete an entry, X to export the operations you have performed\nto a macro file, L to load such a file or Q to quit.\n";
      print "\nPress enter to return to menu.";
      $prompt = <STDIN>;
    }
    elsif ( $selection =~ /^f.*/ ) { $page++ }
    elsif ( $selection =~ /^p.*/ ) { $page-- }
    elsif ( $selection =~ /^r.*/ ) { renameitem() }
    elsif ( $selection =~ /^m.*/ ) { mergeitem() }
    elsif ( $selection =~ /^n.*/ ) { nestitem() }
    elsif ( $selection =~ /^s.*/ ) { seeitem() }
    elsif ( $selection =~ /^v.*/ ) { viewitem() }
    elsif ( $selection =~ /^a.*/ ) { additem() }
    elsif ( $selection =~ /^d.*/ ) { deleteitem() }
    elsif ( $selection =~ /^x.*/ ) { exportscript() }
    elsif ( $selection =~ /^l.*/ ) { loadscript() }
  }
};

# Refresh variables
sub refresh {
  # Create hash of index entries and names
  undef %contentindex;
  @indexterms = $root->descendants('indexentry');
  foreach $indexterm (@indexterms) {
    ($name,undef)  = (split /,/,$indexterm->text,2);
    $contentindex{$name} = $indexterm;
    foreach $nesteditem(@nesteditems) {
      ($primaryindex,$secondaryindex) = split /\//,$nesteditem;
      if ($primaryindex eq $name) { 
        $contentindex{$nesteditem} = $indexterm;
      }
    }
  };
  # Create name and number hashes
  undef $numeration;
  undef %numberindex;
  undef %nameindex;
  foreach $name (sort {uc($a) cmp uc($b)} keys %contentindex) {
    $numeration++;
    $numberindex{$name} = $numeration;
  };
  %nameindex = reverse %numberindex;
}

# Display choices
sub display {
  # Check page is within range
  if ( $page > $numeration/60 ) { $page-- };
  if ( $page < 0 ) { $page = 0 };
  $endnumeration = $numeration+1;
  $numeration = (60 * $page+1);
  if ( $endnumeration > $numeration+20 ) { $endnumeration = $numeration+20 };
  # Display list of index items
  print "\n";
  while ( $numeration < $endnumeration ) {
    $firstname = $nameindex{$numeration};
    $firstname =~ s/.*\// /;
    $firstentry = "$numeration. $firstname";
    undef $secondentry;
    if ( $nameindex{($numeration+20)} ) { 
      $secondname = $nameindex{$numeration+20};
      $secondname =~ s/.*\// /;
      $secondentry = $numeration+20 . ". " . $secondname
    };
    undef $thirdentry;
    if ( $nameindex{($numeration+40)} ) { 
      $thirdname = $nameindex{$numeration+40};
      $thirdname =~ s/.*\// /;
      $thirdentry = $numeration+40 . ". " . $thirdname
    };
    write;
    $numeration++;
  }
}

# Letter prompt
sub letterprompt {
  $substring = shift;
  foreach $name (sort {$numberindex{$a} <=> $numberindex{$b}} keys %contentindex) {
    if ( $name =~ /$substring/i ) {
      print $numberindex{$name} . ". $name\n"
    }
  }
};

# Rename
sub renameitem {
  ($renamefromname,$renametoname) = (shift,shift);
  $renamefrom = $numberindex{$renamefromname};
  until ( $nameindex{$renamefrom} ) {
    print "Select the number of the item to rename: ";
    $renamefrom = <STDIN>;
    chomp $renamefrom;
    if ( $renamefrom =~ /[A-Za-z]/ ) {
      letterprompt($renamefrom);
      undef $renamefrom
    };
    if ($nameindex{$renamefrom} =~ /\//) {
      print "Sorry, renaming nested entries is not yet supported.\n";
      undef $renamefrom
    };
  };
  $renamefromname = $nameindex{$renamefrom};
  until ($renametoname) {
    print "Enter new name: ";
    $renametoname = <STDIN>;
    chomp $renametoname;
  }
  if ( $numberindex{$renametoname} ) {
    print "$renametoname exists; merge with the existing entry instead? ";
    $renameconfirm = <STDIN>;
    if ( $renameconfirm =~ /[yY]/ ) { 
      mergeitem($renamefrom,$numberindex{$renametoname});
      return
    }
    else { return }
  };
  unless ($scriptfile) {
    print "Rename $renamefromname to $renametoname? ";
    $renameconfirm = <STDIN>;
  };
  if ( ($renameconfirm =~ /[yY]/) or $scriptfile ) {
    push @scriptactions, "r,$renamefromname,$renametoname\n";
    # Move the entry under its new name into the appropriate location
    undef $n;
    for ( $i = 0; $i < @nesteditems; $i++ ) {
      @nesteditems[$i] =~ s/$renamefromname(\/.*)/$renametoname$1/;
    };
    @sorteditems = keys %contentindex;
    push @sorteditems, $renametoname;
    foreach $sorteditem (sort {uc($a) cmp uc($b)} @sorteditems) {
      $n++;
      if ($sorteditem eq $renametoname) { $sorteditemnumber = $n }
    };
    $renamefromref = $contentindex{$renamefromname};
    # Change its name in the tree
    $renamefromref->first_child->subs_text( qr/$renamefromname,/,"$renametoname,");
    $sorteditemtotal = @sorteditems;
    if ( $sorteditemnumber == $sorteditemtotal ) {
      $renamefromref->move( last_child => $root )
    } elsif ( $sorteditemnumber == 1 ) {
      $renamefromref->move( first_child => $root )
    } else {
      eval {
        $renamefromref->move( before => @indexterms[$sorteditemnumber] )
      };
      if ( $@ ) {
        warn $@;
        print "Check $renametoname has been correctly sorted before $location.\n";
        $location = $nameindex{$sorteditemnumber};
        $renamefromref->move( before => $contentindex{$location} )
      };
    }
  }
}

# Merge
sub mergeitem {
  ($mergefirstname,$mergesecondname) = (shift,shift);
  $mergefirst = $numberindex{$mergefirstname};
  until ( $nameindex{$mergefirst} ) {
    print "Select the number of the entry to be merged: ";
    $mergefirst = <STDIN>;
    chomp $mergefirst;
    if ( $mergefirst =~ /[A-Za-z]/ ) {
      letterprompt($mergefirst);
      undef $mergefirst
    } else {
      for ( $i = 0; $i < @nesteditems; $i++ ) {
        if (@nesteditems[$i] =~ /$nameindex{$mergefirst}/) {
          print "Sorry, merging nested entries is not yet supported.\n";
          undef $mergefirst
        }
      }
    }
  };
  $mergefirstname = $nameindex{$mergefirst};
  $mergefirstitem = $contentindex{$mergefirstname};
  $mergesecond = $numberindex{$mergesecondname};
  until ( $nameindex{$mergesecond} ) {
    print "Select the number of the entry to merge it into: ";
    $mergesecond = <STDIN>;
    chomp $mergesecond;
    if ( $mergesecond =~ /[A-Za-z]/ ) {
      letterprompt($mergesecond);
      undef $mergesecond
    };
    if ($nameindex{$mergesecond} =~ /\//) {
      print "Sorry, merging nested entries is not yet supported.\n";
      undef $mergesecond
    }
  };
  $mergesecondname = $nameindex{$mergesecond};
  $mergeseconditem = @indexterms[$mergesecond-1];
  unless ($scriptfile) {
    print "About to merge $mergefirstname into $mergesecondname, continue? ";
    $mergeconfirm = <STDIN>;
  };
  if ( ($mergeconfirm =~ /[yY]/) or $scriptfile ) {
    push @scriptactions, "m,$mergefirstname,$mergesecondname\n";
    $mergefirstchild = $mergefirstitem->first_child;
    @mergefirstchildren = $mergefirstchild->children('ulink');
    $offset = length($mergeseconditem->first_child);
    foreach $mergefirstchild (@mergefirstchildren) {
      $mergefirstchild->move(within => $mergeseconditem, $offset);
    };
    $mergefirstitem->delete;
  }
}

# Nest
sub nestitem {
  ($nestfirstname,$nestsecondname) = (shift,shift);
  $nestfirst = $numberindex{$nestfirstname};
  until ( $nameindex{$nestfirst} ) {
    print "Select the number of the entry to be nested: ";
    $nestfirst = <STDIN>;
    chomp $nestfirst;
    if ( $nestfirst =~ /[A-Za-z]/ ) {
      letterprompt($nestfirst);
      undef $nestfirst
    };
    if ($nameindex{$nestfirst} =~ /\//) {
      print "Sorry, sub-nesting entries is not yet supported.\n";
      undef $nestfirst
    };
  };
  $nestfirstname = $nameindex{$nestfirst};
  $nestfirstitem = $contentindex{$nestfirstname};
  $nestsecond = $numberindex{$nestsecondname};
  until ( $nameindex{$nestsecond} ) {
    print "Select the number of the entry to nest it into: ";
    $nestsecond = <STDIN>;
    chomp $nestsecond;
    if ( $nestsecond =~ /[A-Za-z]/ ) {
      letterprompt($nestsecond);
      undef $nestsecond
    };
    if ($nameindex{$nestsecond} =~ /\//) {
      print "Sorry, sub-nesting entries is not yet supported.\n";
      undef $nestsecond
    };
  };
  $nestsecondname = $nameindex{$nestsecond};
  $nestseconditem = $contentindex{$nestsecondname};
  unless ($scriptfile) {
    print "About to nest $nestfirstname inside $nestsecondname, continue? ";
    $nestconfirm = <STDIN>;
  }
  if ( ($nestconfirm =~ /[yY]/) or $scriptfile ) {
    push @scriptactions, "n,$nestfirstname,$nestsecondname\n";
    $nestfirstitem->first_child->move(last_child => $nestseconditem);
    $nestfirstitem->delete;
    $nestseconditem->last_child->erase;
    $nestseconditem->last_child->wrap_in('secondaryie');
    # This makes it disappear from contentindex, so add it to a persistent array
    push @nesteditems,"$nestsecondname/$nestfirstname";
  }
}

# See
sub seeitem {
  ($seefirstname,$seesecondname) = (shift,shift);
  $seefirst = $numberindex{$seefirstname};
  until ( $nameindex{$seefirst} ) {
    print "Select the number of the entry to contain the see reference: ";
    $seefirst = <STDIN>;
    chomp $seefirst;
    if ( $seefirst =~ /[A-Za-z]/ ) {
      letterprompt($seefirst);
      undef $seefirst
    };
    if ($nameindex{$seefirst} =~ /\//) {
      print "Sorry, sub-nested entries cannot currently contain see references.\n";
      undef $seefirst
    };
  };
  $seefirstname = $nameindex{$seefirst};
  $seefirstitem = $contentindex{$seefirstname};
  $seesecond = $numberindex{$seesecondname};
  until ( $nameindex{$seesecond} ) {
    print "Select the number of the entry for the see reference: ";
    $seesecond = <STDIN>;
    chomp $seesecond;
    if ( $seesecond =~ /[A-Za-z]/ ) {
      letterprompt($seesecond);
      undef $seesecond
    };
  };
  $seesecondname = $nameindex{$seesecond};
  $seesecondname =~ s/\//, /;
  push @scriptactions, "s,$seefirstname,$seesecondname\n";
  if ( $seefirstitem->first_child->has_children('ulink') ) {
    $see = "seealsoie"
  } else {
    $see = "seeie"
  };
  $seeindexentry = XML::Twig::Elt->new($see);
  $seeindexentry->set_text($seesecondname);
  $seeindexentry->paste( last_child => $seefirstitem )
};

# View
sub viewitem {
  until ( $nameindex{$viewitem} ) {
    print "Select the number of the entry to view: ";
    $viewitem = <STDIN>;
    chomp $viewitem;
    if ( $viewitem =~ /[A-Za-z]/ ) {
      letterprompt($viewitem);
      undef $viewitem
    };
  };
  print "View in source XML rather than plain text format [N]? ";
  $viewformat = <STDIN>;
  $viewitem = $nameindex{$viewitem};
  $viewitem = $contentindex{$viewitem};
  if ( $viewformat =~ /[yY]/ ) { 
    print "\n" . $viewitem->print . "\n";
  } else {
    print "\n" . $viewitem->text . "\n";
  }
  print "\nPress enter to return to menu.";
  $prompt = <STDIN>;
}

# Add
sub additem {
  $newname = shift;
  until ($newname) {
    print "Enter name for new entry: ";
    $newname = <STDIN>;
  };
  chomp $newname;
  while ( $numberindex{$newname} ) {
    print "$newname exists; please try again: ";
    $newname = <STDIN>;
    chomp $newname;
  };
  push @scriptactions, "a,$newname\n";
  # Sort the new entry into the appropriate location
  undef $n;
  undef @newprimaryie;
  @sorteditems = keys %contentindex;
  push @sorteditems, $newname;
  foreach $sorteditem (sort {uc($a) cmp uc($b)} @sorteditems) {
    $n++;
    if ($sorteditem eq $newname) { $sorteditemnumber = $n }
  };
  # Create nested primaryie: could have used insert or wrap_in, but this works
  $newindexentry = XML::Twig::Elt->new('indexentry');
  $newprimaryie = XML::Twig::Elt->new('primaryie');
  $newprimaryie->set_text($newname . ",");
  push @newprimaryie, $newprimaryie;
  $newindexentry->set_content(@newprimaryie);
  $sorteditemtotal = @sorteditems;
  if ( $sorteditemnumber == $sorteditemtotal ) {
    $newindexentry->paste( last_child => $root )
  } else {
    $location = $nameindex{$sorteditemnumber};
    $newindexentry->paste( before => $contentindex{$location} )
  }
}

# Delete
sub deleteitem {
  $deletename = shift;
  $deleteitem = $numberindex{$deletename};
  until ( $nameindex{$deleteitem} ) {
    print "Select the number of the item to delete: ";
    $deleteitem = <STDIN>;
    chomp $deleteitem;
    if ( $deleteitem =~ /[A-Za-z]/ ) {
      letterprompt($deleteitem);
      undef $deleteitem
    };
  };
  $deletename = $nameindex{$deleteitem};
  if ($deletename =~ /\//) {
    print "Sorry, deleting nested entries is not yet supported.\n";
    print "Press enter to return to menu.";
  $prompt = <STDIN>;
    return
  };
  unless ($scriptfile) {
    print "About to delete $deletename and all its references, continue? ";
    $deleteconfirm = <STDIN>;
  };
  if ( ($deleteconfirm =~ /[yY]/) or $scriptfile ) {
    push @scriptactions, "d,$deletename\n";
    $deleteitem = $contentindex{$deletename};
    $deleteitem->delete;
  }
}

# Export 
sub exportscript {
  print "Filename to export to? ";
  $filename = <STDIN>;
  chomp $filename;
  open (EXPORTSCRIPT,">$filename") or warn "Cannot write to $filename.\n";
  print EXPORTSCRIPT @scriptactions;
  close EXPORTSCRIPT;
}

# Load
sub loadscript {
  until (-e $scriptfile) {
    print "Filename to load? ";
    $scriptfile = <STDIN>;
    chomp $scriptfile
  };
  open (SCRIPTFILE,"<$scriptfile") or die "Error: cannot read macro file\n";
  while (<SCRIPTFILE>) {
    refresh();
    ($action, $fromaction, $toaction) = split(/,/,$_); 
    chomp $toaction;
    chomp $fromaction;
    if ( $action eq "r" ) { renameitem($fromaction,$toaction) }
    elsif ( $action eq "m" ) { mergeitem($fromaction,$toaction) }
    elsif ( $action eq "n" ) { nestitem($fromaction,$toaction) }
    elsif ( $action eq "s" ) { seeitem($fromaction,$toaction) }
    elsif ( $action eq "a" ) { additem($fromaction) }
    elsif ( $action eq "d" ) { deleteitem($fromaction) }
  };
  close SCRIPTFILE;
  undef $scriptfile
}

# Output a new SGML index (as XML)
open (OUTFILE,">$infilename.tmp") or die "Cannot write to temporary file.\n";
$twig->print(\*OUTFILE);
close OUTFILE;

# Post-process the saved file to correct formatting problems
open (INFILE,"<$infilename.tmp");
while (<INFILE>) {
  next if /^    ,$/;
  if (/<\/ulink><ulink/) {
    s/<\/ulink>/<\/ulink>,\n    /g
  };
  if (/<\/(prim|second)aryie>([^<]+),$/) {
    $secondaryie = $2;
    s/$2,$//;
    s/<secondaryie>//g;
    push @outfile, $_;
    push @outfile, "    <secondaryie>$secondaryie,\n";
    next;
  };
  # Same as previous, but with a see/see-also entry mixed up in it too
  if (/<\/(prim|second)aryie>(<see.*ie>)([^<]+),$/) {
    $seealsoie = $2;
    $secondaryie = $3;
    s/$2$3,$//;
    s/<secondaryie>//g;
    push @outfile, $_;
    push @outfile, "    $seealsoie\n";
    push @outfile, "    <secondaryie>$secondaryie,\n";
    next;
  };
  # This doesn't match entries added manually, which look different
  s/<secondaryie><ulink/<ulink/g;
  # There may be multiple lines in a single scalar; explode those
  if (/\n.+\n/) {
    @outlines = (split /\n/,$_);
    foreach $outline (@outlines) {
      push @outfile, $outline . "\n";
    }
  } else {
    push @outfile, $_
  }
};
close INFILE;

# Now sort merged entries
foreach $outline (@outfile) {
  if ($outline =~ /<ulink/) {
    push @ulinks, $outline;
  } else {
    if (@ulinks) {
      @sortedulinks = sort {($a =~ /AEN(\d+)/)[0] <=> ($b =~ /AEN(\d+)/)[0]} @ulinks;
      push @newfile, @sortedulinks;
      undef @ulinks;
    }
    push @newfile, $outline;
  }
};

# Now sort nested entry headings
undef $secondaryie;
open (OUTFILE,">$outfilename") or die "Cannot write to output file.\n";
foreach $_ (@newfile) {
  if (/<indexentry>/) {
    $indexentry++ 
  };
  if ($indexentry) {
    # This doesn't match secondary entries added manually, which look different
    if (/<secondaryie>(.*),$/) {
      $secondaryie = $1;
      undef @secondaryie
    };
    if ($secondaryie) {
      push @secondaryie, $_;
      if (/<\/secondaryie>/) {
        $secondaryies{$secondaryie} = [ @secondaryie ];
        undef $secondaryie
      }
    } else {
      print OUTFILE
    };
    if (/<\/indexentry>/) {
      if (%secondaryies) {
        foreach $secondaryie (sort {uc($a) cmp uc($b)} keys %secondaryies) {
          @secondaryie = @{$secondaryies{$secondaryie}};
          @secondaryie[$#secondaryie] =~ (s/<\/indexentry>//);
          chomp @secondaryie[$#secondaryie];
          print OUTFILE "@secondaryie\n";
        };
        print OUTFILE "    </indexentry>\n";
        undef $secondaryie;
        undef %secondaryies
      };
    undef $indexentry;
    }
  } else {
    print OUTFILE
  }
}
close OUTFILE;
unlink "$infilename.tmp";

format STDOUT =
@<<<<<<<<<<<<<<<<<<<<<<<<  @<<<<<<<<<<<<<<<<<<<<<<<<  @<<<<<<<<<<<<<<<<<<<<<<<<
$firstentry                $secondentry               $thirdentry
.

=head1 EXAMPLE

B<indexcrafter.pl> F<index.sgml> -s F<scriptfile>

=head1 USAGE

The first time B<indexcrafter.pl> is run it will display a numbered list of index terms.  Use F and P to page through the list if it exceeds one page.  At any other prompt when asked to select an item by number, entering letters instead will redisplay the entries that match that substring.

Choose R, N or M respectively to rename a term, to nest one term inside another, or to merge two terms together.  S creates a "see" reference in an empty index entry or a "see also" reference in a populated one.  Choose A to add a blank term (as a container for nesting others) or D delete a term.  Choose X to export the operations that you have performed to a macro file, L to load an existing macro file, and Q to quit.

Subsequent runs of B<indexcrafter.pl> may add the name of the exported macro file as an argument, in which case the operations recorded in that file will be performed non-interactively on the index.  This is useful if the source document has been edited since the first index was generated, which may have changed the locations of index terms in the document.

=head1 BUGS

Tertiary entries are not yet supported.

Can't yet delete, merge or rename secondary (nested) entries.

Manually-added secondary entries are not yet sorted correctly with those originating from the input file.

Added or renamed entries aren't sorted into the correct indexdiv if it does not already exist.

Secondary entries are not displayed in the correct sorting order in all cases.

=head1 HISTORY

0.1 - Initial public release (25 February 2008)	

0.2 - Added substring searching from submenu prompts and support for see and see-also references (26 February 2008)

0.3 - Added facility to load export file interactively (26 February 2008)

0.4 - New human-readable format for macro file (27 February 2008)

0.5 - Fixed bug where non-interactive mode did not save file (3 March 2008)

0.6 - Fixed bugs in sorting routines (4 March 2008)

0.7 - Fixed sorting of see/see-also references (12 March 2008)

0.8 - Fixed spurious empty lines with commas (13 March 2008)

=head1 AUTHOR

Jeremy Malcolm E<lt>Jeremy@Malcolm.id.auE<gt>.

=cut

