#!/usr/local/bin/perl -w

# digest2html v. 21 Dec 1995, Dave Schweisguth
# Converts RFC1153 digest format (and Chris Lewis' minimal format) to HTML
# Inspired by work by Tom Fine, with hints by Seth Golub

# Notes
# -----
# Does not recognize Tom Fine's other format
# Manpages are linked to a database of IRIX manpages (only when
# $man_url_fmt is configured below)
#
# NOTE(review): this copy of the script was recovered from a version in which
# everything that looked like an HTML tag had been stripped: the heredocs, the
# anchor markup in the link substitutions, the usage text, and the code that
# chose the input file.  Those parts are reconstructions; each one is marked
# NOTE(review) where it occurs and should be confirmed against a pristine copy.

# Version history
# ---------------
# 21 Dec 1995 First release.

### Preliminaries

# Emulate #!/usr/local/bin/perl on systems without #!
# (must stay ahead of every other statement so that sh/csh reach it first)
eval '(exit $?0)' && eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
& eval 'exec /usr/local/bin/perl -S $0 $argv:q'
    if 0;

require 5.006;              # Three-argument open and lexical filehandles
use strict;
use POSIX qw(strftime);

### Parameters

# Environment
(my $whatami = $0) =~ s|.*/||;                      # `basename $0`
my $date   = strftime('%d %b %Y', localtime);       # dd Mon yyyy
my $isatty = -t STDIN;

# Configuration
my $delete   = 0;   # Delete the end of the first entry (e.g. a redundant TOC)
my $force    = 0;   # Overwrite existing files
my $single   = 0;   # Make single page instead of top and sections
my $truncate = 0;   # Truncate last entry
my $short    = 0;   # Set by -s.  NOTE(review): nothing in the recovered
                    # script reads this flag; its purpose could not be
                    # recovered, so -s is accepted and otherwise ignored.
my $index    = 'index.html';    # Name of entry point in each directory

# Link insertion points.  $tlb/$tle are written into the <HEAD> of the index
# page.  NOTE(review): all four are empty strings in the recovered source
# (they may originally have been HTML comments), and $blb/$ble are not
# referenced by any recovered output code.
my $tlb = '';
my $tle = '';
my $blb = '';
my $ble = '';

my $deleteme = "Topics covered in this FAQ:\n";     # Delete starting here
my $author   = 'The SGI FAQ group';
my $email    = 'sgi-faq@viz.tamu.edu';
my $whereami = 'http://www-viz.tamu.edu/~sgi-faq/tools/';

# Link targets whose original values were lost with the stripped markup.
# NOTE(review): confirm or fill in before relying on these.
my $rfc_url     = 'https://www.rfc-editor.org/rfc/rfc';  # RFC number is appended
my $man_url_fmt = '';   # sprintf template taking (section, name); '' = don't link
my $see_url_fmt = '';   # sprintf template taking (name);          '' = don't link

# Initialization (don't change these)
my $out    = '';        # Output directory/file
my $stdout = 0;         # Write to standard output
my $infile = '';        # First input file; fallback for the output name

### Arguments and error-checking

# Parse args.  Switches may be clustered (-df); -o takes its argument either
# attached (-ofoo) or as the following word (-o foo).
while (@ARGV && $ARGV[0] =~ /^-(.)(.*)/) {
    my ($first, $rest) = ($1, $2);
    my $arg;

    if ($first eq '-' && $rest eq '') {     # "--" ends the switches
        shift @ARGV;
        last;
    }

    if ($first =~ /[o]/) {                  # Switches with arguments
        shift @ARGV;
        if ($rest ne '') {
            $arg = $rest;
        }
        elsif (@ARGV && $ARGV[0] ne '') {   # @ARGV test avoids comparing undef
            $arg = shift @ARGV;
        }
        else {
            usage("$whatami: -$first requires an argument.\n");
        }
    }
    elsif ($rest ne '') {
        $ARGV[0] = "-$rest";                # More switches in this cluster
    }
    else {
        shift @ARGV;
    }

    if    ($first eq '1') { $single   = 1; }
    elsif ($first eq 'd') { $delete   = 1; }
    elsif ($first eq 'f') { $force    = 1; }
    elsif ($first eq 's') { $short    = 1; }
    elsif ($first eq 't') { $truncate = 1; }
    elsif ($first eq 'o') { $out      = $arg; }
    elsif ($first eq 'u') { usage(0); }
    else                  { usage("$whatami: -$first is not an option.\n"); }
}

# Choose the input.
# NOTE(review): the original code for this step was lost.  This
# reconstruction reads the named files, or standard input when none are named
# and STDIN is not a terminal; with no -o in the latter case the result goes
# to standard output as a single page.
if (@ARGV) {
    $infile = $ARGV[0];

    # <> won't die on errors
    foreach my $file (@ARGV) {
        -s $file || die "$whatami: Can't read $file!\n";
    }
}
else {
    usage("$whatami: No input file given.\n") if $isatty;
    $stdout = 1 if $out eq '';
}
$single = 1 if $stdout;     # Per-entry files cannot be written to a stream

### Read the digest

# Slurp digest, split into entries and insert hrefs
my @entries = ();
my $entry_no = 0;
while (<>) {
    if (/^-{30}/) {         # This gets 70- and 30-hyphen lines
        $entry_no++;
    }
    else {
        escape_html();
        href() unless /^Subject:/;      # Don't want hrefs in index
        push(@{ $entries[$entry_no] }, $_);
    }
}
die "$whatami: No input!\n" unless @entries;

# Consecutive separator lines leave holes in @entries; make every slot an
# array reference so the code below can dereference it safely.
foreach my $entry (@entries) {
    $entry = [] unless defined $entry;
}

pop(@entries) if $truncate;     # Truncate last entry
die "$whatami: Nothing left to write!\n" unless @entries;

# Delete the end of the first entry beginning at the line $deleteme
if ($delete) {
    my $first_entry = $entries[0];
    foreach my $line_no (0 .. $#$first_entry) {
        if ($first_entry->[$line_no] eq $deleteme) {
            splice(@$first_entry, $line_no);    # Drop this line and the rest
            last;
        }
    }
}

# Trim leading and trailing blank lines from each entry
foreach my $entry (@entries) {
    shift(@$entry) while @$entry && $entry->[0]  =~ /^\s*\n$/;
    pop(@$entry)   while @$entry && $entry->[-1] =~ /^\s*\n$/;
}

# Find subject lines: the first "Subject:" header of each entry, plus any
# indented continuation lines that follow it
my @subjects = ();
foreach my $i (0 .. $#entries) {
    my $lines = $entries[$i];
    foreach my $j (0 .. $#$lines) {
        next unless $lines->[$j] =~ /^Subject:\s*(.*)/;
        my $subject = $1;
        for (my $k = $j + 1;
             $k <= $#$lines && $lines->[$k] =~ /^\s+(\S.*)/;
             $k++) {
            $subject .= " $1";
        }
        $subjects[$i] = $subject;
        last;
    }
}
my $title = defined $subjects[0] ? $subjects[0] : 'Digest';
$subjects[0] = "Introduction";          # Now that we're done with it

# Last bit of RFC1153 has no "Subject:".  Assign by index instead of pushing,
# so the label lands on the last entry even when other entries lack subjects.
$subjects[$#entries] = "The End" unless defined $subjects[$#entries];
foreach my $i (0 .. $#entries) {
    $subjects[$i] = '(no subject)' unless defined $subjects[$i];
}

### Output

# Find output directory/file name if it wasn't specified
if (!$stdout && $out eq '') {
    my $archive = '';
    foreach my $line (@{ $entries[0] }) {
        $archive = $1 if $line =~ /^Archive-name:\s*(.*)/;  # Last one wins
    }
    $out = $archive || $infile;
    $out =~ s|.*/||;
    die "$whatami: Can't work out an output name; use -o.\n" if $out eq '';
}

# Check for output directory and create if necessary
unless ($single) {
    if (-e $out) {
        if ($force) {
            -d _ || die "$whatami: Can't replace plain file $out with a directory!\n";
        }
        else {
            die "$whatami: Output file $out already exists!\n";
        }
    }
    else {
        mkdir($out, 0755) || die "$whatami: Can't create directory $out!\n";
    }
}

# Make index (or single page)
my $index_fh = \*STDOUT;
unless ($stdout) {
    my $outfile = $out . ($single ? '.html' : "/$index");
    if (-e $outfile && !$force) {
        die "$whatami: Output file $outfile already exists!\n";
    }
    # Three-argument open: $out can come from the digest's Archive-name
    # header, so it must never be interpreted as part of the open mode.
    my $fh;
    open($fh, '>', $outfile) || die "$whatami: Can't write to $outfile!\n";
    $index_fh = $fh;
}

# Top of index (or single page)
# NOTE(review): markup reconstructed; only the interpolated variables
# survived in the recovered source.
print {$index_fh} <<EOP;
<HTML>
<HEAD>
<TITLE>$title</TITLE>
$tlb
$tle
</HEAD>
<BODY>
<H1>$title</H1>
<UL>
EOP

# Table of contents
foreach my $i (0 .. $#entries) {
    my $target = $single ? "#$i" : "$i.html";
    print {$index_fh} "<LI><A HREF=\"$target\">$subjects[$i]</A>\n";
}
print {$index_fh} <<EOP;
</UL>
EOP

# Make file (or add to page) for each entry
foreach my $i (0 .. $#entries) {
    my $fh = $index_fh;

    unless ($single) {
        my $page = "$out/$i.html";
        if (-e $page && !$force) {
            die "$whatami: Output file $page already exists!\n";
        }
        my $page_fh;        # Fresh handle: never reopen the index handle
        open($page_fh, '>', $page) || die "$whatami: Can't write to $page!\n";
        $fh = $page_fh;

        # Top of entry's page
        print {$fh} <<EOP;
<HTML>
<HEAD>
<TITLE>$subjects[$i]</TITLE>
</HEAD>
<BODY>
EOP

        # Up, Next, Previous links
        print {$fh} nav_links($i, $#entries, $title, $index, \@subjects);
    }

    # <HR> before the entry
    print {$fh} (
        ($single ? "<A NAME=\"$i\">" : ''),     # Begin name anchor if $single
        '<HR>',
        ($single ? '</A>' : ''),                # End name anchor if $single
        "\n",
    );

    print {$fh} "<PRE>\n", @{ $entries[$i] }, "</PRE>\n";   # Print the entry

    unless ($single) {
        # Up, Next, Previous links
        print {$fh} "<HR>\n";
        print {$fh} nav_links($i, $#entries, $title, $index, \@subjects);

        # End of entry's page
        print {$fh} <<EOP;
</BODY>
</HTML>
EOP
        close($fh) || die "$whatami: Can't finish writing $out/$i.html!\n";
    }
}

# End of this digest's index (or single page)
print {$index_fh} <<EOP;
<HR>
<ADDRESS>
$author &lt;<A HREF="mailto:$email">$email</A>&gt;
<BR>
Generated by <A HREF="$whereami">$whatami</A>, $date
</ADDRESS>
</BODY>
</HTML>
EOP
close($index_fh) || die "$whatami: Can't finish writing the index!\n";

exit;

### Subroutines

# Print an optional error message and the usage summary to standard error,
# then exit: status 1 if a message was given, 0 otherwise (-u passes 0).
# NOTE(review): the original usage text was lost; this one is written from
# the switches the parser accepts.
sub usage {
    my ($message) = @_;
    warn $message if $message;
    warn <<EOP;
Usage: $whatami [-1dfstu] [-o output] [file ...]
  -1         make a single page instead of an index plus one page per entry
  -d         delete the end of the first entry (e.g. a redundant TOC)
  -f         overwrite existing files
  -o output  output directory (with -1: output file name without .html)
  -s         accepted, but has no effect in this version
  -t         truncate (drop) the last entry
  -u         print this message
EOP
    exit($message ? 1 : 0);
}

# Build the "Up / Next / Previous" link block for entry $i of a multi-page
# digest.  $last is the index of the final entry, $subjects an array
# reference of entry titles.  Returns the HTML as one string.
sub nav_links {
    my ($i, $last, $title, $index, $subjects) = @_;
    my $html = "Up: <A HREF=\"$index\">$title</A>\n";
    if ($i < $last) {
        $html .= sprintf("<BR>Next: <A HREF=\"%d.html\">%s</A>\n",
                         $i + 1, $subjects->[$i + 1]);
    }
    if ($i > 0) {
        $html .= sprintf("<BR>Previous: <A HREF=\"%d.html\">%s</A>\n",
                         $i - 1, $subjects->[$i - 1]);
    }
    return $html;
}

# Escape HTML special characters in $_ (in place).  '&' must go first so the
# entities produced for '<' and '>' are not escaped a second time.
sub escape_html {
    s/&/&amp\;/g;
    s/>/&gt\;/g;
    s/</&lt\;/g;
}

# Apply $transform (a code reference taking and returning a string) to the
# parts of $_ that are not already inside an <A ...>...</A> element.  $_ has
# been through escape_html, so any literal "<A " was generated by href itself.
# Without this, a later pattern (mail address, RFC number, ...) could match
# inside a link made by an earlier one and nest anchors.
sub outside_anchors {
    my ($transform) = @_;
    my @pieces = split m{(<A\s[^>]*>.*?</A>)}, $_;
    foreach my $piece (@pieces) {
        $piece = $transform->($piece) unless $piece =~ m{^<A\s};
    }
    $_ = join('', @pieces);
}

# Turn things in $_ that look like references into hypertext links (in
# place).  Expects $_ to have been through escape_html already.
# NOTE(review): all replacement markup below is reconstructed; only the
# patterns survived in the recovered source.
sub href {

    # scheme://...
    # NOTE(review): the start of this pattern was lost; "\b\w+://" is a guess.
    # '"' is excluded as well so the URL cannot end the HREF attribute early.
    s{
    \b\w+://
    [^\s&<>"]+
        (?# May contain any character except whitespace or &<>)
    [\w/]
        (?# Last character must be a \w or / to avoid trailing punctuation)
    }{<A HREF="$&">$&</A>}gx;

    # site:/path (treated as an ftp:// URL)
    outside_anchors(sub {
        my ($text) = @_;
        $text =~ s{
        ([\w\.\-]+):/
            (?# host:/)
        (
            (?![:/\w])|
            (?# No path
                Must not precede :, / or \w, which would be part of a path or
                port spec)
            [^/:\s](?![/:/\w])|
            (?# One-character path
                Must not be /, so we don't break real URLs
                Must not precede :, / or \w, which would be more of a path or
                port spec)
            [^/:\s]\S*[\w/]
            (?# Longer path
                First character must not be /, so we don't break real URLs
                Last character must be a \w or / to avoid trailing punctuation)
        )
        }{<A HREF="ftp://$1/$2">$&</A>}gx;
        return $text;
    });

    # user@host
    # Last character of host must be a \w to avoid trailing periods
    outside_anchors(sub {
        my ($text) = @_;
        $text =~ s#[\w\.\-\+]+@[\w\.\-]*[\w]#<A HREF="mailto:$&">$&</A>#g;
        return $text;
    });

    # IRIX manpages, e.g. ls(1) or curses(3X); linked only when configured
    if ($man_url_fmt ne '') {
        outside_anchors(sub {
            my ($text) = @_;
            $text =~ s{((\w+)\((\d)[a-zA-Z]{0,2}\))}{
                '<A HREF="' . sprintf($man_url_fmt, $3, $2) . '">' . $1 . '</A>'
            }ge;
            return $text;
        });
    }

    # RFCs
    outside_anchors(sub {
        my ($text) = @_;
        $text =~ s#\brfc\s*(\d+)#<A HREF="${rfc_url}$1">$&</A>#ig;
        return $text;
    });

    # Newsgroups
    outside_anchors(sub {
        my ($text) = @_;
        $text =~ s{
        (^|[^/])
            (?# Must not follow / so we don't get newsgroup names in paths,
                e.g. at rtfm)
        (
            \b(?:alt|arpa|bionet|bit|biz|comp|de|gnu|humanities|misc|news|rec|
                sci|soc|talk|vmsnet)
            (?# Must begin with a known hierarchy so we don't get hostnames;
                add hierarchies as desired)
            \.[\w\.\-\+]*
            \w
            (?# Last character must be a \w to avoid trailing periods)
        )
        (?!/)
            (?# Must not precede / so we don't get newsgroup names in paths,
                e.g. at rtfm)
        (?!\.\*)
            (?# Must not precede .* so we don't get, e.g., comp.sys.sgi.*)
        (?!\w)
            (?# Must not precede \w, so we don't get partials)
        (?!\.\w)
        }{$1<A HREF="news:$2">$2</A>}gx;
        return $text;
    });

    # (See [also] "foo")
    # The SGI FAQs don't use this, so it's been only lightly tested
    # Linked only when configured: the original link target was lost.
    if ($see_url_fmt ne '') {
        outside_anchors(sub {
            my ($text) = @_;
            $text =~ s{(\(see\s+(?:also\s+)?\")([^\s\"]+)(\"\))}{
                $1 . '<A HREF="' . sprintf($see_url_fmt, $2) . '">' . $2 . '</A>' . $3
            }egi;
            return $text;
        });
    }
}