#! /usr/bin/perl -w use strict; use HTML::Parser; use Getopt::Long; use constant ( TAIL => "\t\n\n" ); no warnings 'deprecated'; my $VERSION = 1.0.0; use warnings; my ($opt_b, $opt_h, $opt_usage, $opt_tidy); my $head =''; # xml declaration through my $current = 0; # to form basename$current.html filenames # collect_head # Collect the head material and adjust the parser to # handle the bulk of the document. sub collect_head { my $p = shift; $head = shift; my $tag = shift; $head .= "<$tag>\n"; $p->handler(start => \&generate_file, 'skipped_text, tag'); $p->handler(end_document => \&generate_final_file, 'skipped_text'); $p->report_tags(($opt_h)); } sub generate_file { my $text = shift; my $tag = ''; $tag = '<' . shift() . '>' if $current; # skip heading tag on first file $current++; my $curfile = filename($current); warn "\n\n##### making $curfile #####\n\n"; open OUTFILE, "> $curfile" or die "can't open $curfile for writing ($!)\n"; print OUTFILE $head, $tag, $text, TAIL; close OUTFILE; tidy($curfile) if $opt_tidy; } sub generate_final_file { my $text = shift; my $tag = ''; $tag = '<' . $opt_h . '>'; return unless $current; # 0 here means never found our hn tag $current++; my $curfile = filename($current); warn "\n\n##### making $curfile #####\n\n"; open OUTFILE, "> $curfile" or die "can't open $curfile for writing ($!)\n"; print OUTFILE $head, $tag, $text; close OUTFILE; tidy($curfile) if $opt_tidy; } sub filename { my $number = shift; return "${opt_b}${number}.html"; } sub tidy { my $filename = shift; unless (length($filename)) { warn "No filename passed to tidy; skipping\n"; return; } my @tidy_args = qw(tidy -m -i); push @tidy_args, $filename; system(@tidy_args) == 0 or warn "Calling tidy failed: $?; exit value: ", $? >> 8, "\n"; } sub usage { my $V = sprintf('%vd', $VERSION); print << "E"; Usage: htsplit [options] [[<] filename] Options: --heading n : heading level, 1 through 6, on which to split (just the number, not the 'h'); required --basename name : name prefix for the generated files (default 'slide'); optional --tidy : run tidy -m -i on the output files; optional --help : show the help text and exit --version : show the help text and exit version $V E exit; } GetOptions('heading=i' => \$opt_h, 'basename=s' => \$opt_b, 'tidy' => \$opt_tidy, 'help' => \$opt_usage, 'version' => \$opt_usage, ); usage() if $opt_usage; no warnings; $opt_h = int $opt_h; usage() unless 0 < $opt_h and $opt_h < 7; $opt_h = 'h' . $opt_h; $opt_b ||= 'slide'; use warnings; my $tidy_prog; if ($opt_tidy) { my @tidy_locs = qw( /usr/local/bin/tidy /opt/local/bin/tidy /sw/local/bin/tidy ); $opt_tidy = 0; foreach my $prog (@tidy_locs) { if (-x $prog) { $opt_tidy = 1; $tidy_prog = $prog; last; } } warn "Tidy not found; will process without tidying\n" unless $opt_tidy; } # set up initial report_tags and handlers # these will change once we see the body tag my $p = HTML::Parser->new( report_tags => ['body'], start_h => [\&collect_head, 'self, skipped_text, tag'], xml_mode => 1, ); my $filename = shift; if ($filename) { open F, $filename or die "can't open $filename ($!)\n"; *INFILE = *F; } else { *INFILE = *STDIN; } $p->parse_file(*INFILE); warn "Found no headings of level $opt_h; no files generated.\n" unless $current; warn "\nTotal files generated: $current\n"; =head1 NAME htsplit - split XHTML files into separate, complete HTML files based on a specified heading level (1..6). =head1 DESCRIPTION htsplit creates complete XHTML files from an input XHTML file, copying the XML declaration, DOCTYPE declaration, and head element from the original file into each new file. This can be useful for breaking large files representing, say, entire books or a sequence of slides for a presentation, into individual files that load quickly. This script is a partial replacement for the -slides option that used to be part of HTML tidy (L). It is more flexible than that option, because tidy was hard coded to split on

, whereas htsplit accepts any user-specified heading level. =head1 README Installation: copy this script to any directory in your path. Options: --heading n : heading level, 1 through 6, on which to split (just the number, not the 'h') --basename name : name prefix for the generated files (optional, default 'slide') --tidy : optionally run tidy -m -i on the output files --help : show the help text and exit --version : show the help text and exit =head1 COPYRIGHT Copyright E 2003 Fred Condo. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 PREREQUISITES constant strict HTML::Parser Getopt::Long =pod SCRIPT CATEGORIES Web =cut