From: Norman Walsh Date: Wed, 21 Aug 2002 13:58:39 +0000 (+0000) Subject: Extract indexes from PDF and merge page number ranges X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=330e55ff0ab994b1617b43ab486c6dd5c38222c4;p=docbook-dsssl Extract indexes from PDF and merge page number ranges --- diff --git a/xsl/fo/pdf2index b/xsl/fo/pdf2index new file mode 100755 index 000000000..2a8439852 --- /dev/null +++ b/xsl/fo/pdf2index @@ -0,0 +1,114 @@ +#!/usr/bin/perl -- # -*- Perl -*- + +# this needs some cleanup... + +my $PSTOTEXT = "pstotext"; + +my $pdf = shift @ARGV; + +my $index = ""; +my $inindex = 0; +open (F, "$PSTOTEXT $pdf |"); +while () { + if (/^<\/index/) { + $index .= $_; + $inindex = 0; + } + $inindex = 1 if /^.*?<\/phrase>\s*)+)/s) { + $cindex .= $1; + $_ = $2; + $index = $'; # ' + + my @pages = m/.*?<\/phrase>\s*/sg; + + # Remove duplicates... + if ($#pages > 0) { + my @mpages = (); + my $current = ""; + foreach my $page (@pages) { + my $pageno = &pageno($page); + if ($pageno ne $current) { + push (@mpages, $page); + $current = $pageno; + } + } + @pages = @mpages; + } + + # Collapse ranges... + if ($#pages > 1) { + my @cpages = (); + while (@pages) { + my $count = 0; + my $len = &rangelen($count, @pages); + if ($len <= 2) { + my $page = shift @pages; + push (@cpages, $page); + } else { + my $fpage = shift @pages; + my $lpage = ""; + while ($len > 1) { + $lpage = shift @pages; + $len--; + } + my $fpno = &pageno($fpage); + my $lpno = &pageno($lpage); + $fpage =~ s/>$fpno${fpno}-$lpno//; + $page =~ s/^//; + + return $1 if $page =~ /^([^<>]+)/; + return "?"; +} + +sub rangelen { + my $count = shift; + my @pages = @_; + my $len = 1; + my $inrange = 1; + + my $current = &pageno($pages[$count]); + while ($count < $#pages && $inrange) { + $count++; + my $next = &pageno($pages[$count]); + if ($current + 1 eq $next) { + $current = $next; + $inrange = 1; + $len++; + } else { + $inrange = 0; + } + } + + return $len; +}