#!/usr/local/bin/perl -w

# digest2html v. 21 Dec 1995, Dave Schweisguth <dcs@proton.chem.yale.edu>
# Converts RFC1153 digest format (and Chris Lewis' minimal format) to HTML

# Inspired by work by Tom Fine, with hints by Seth Golub

# Notes
# -----
# Does not recognize Tom Fine's other format
# Manpages are linked to a database of IRIX manpages

# Version history
# ---------------
# 21 Dec 1995	First release.

### Preliminaries

# Emulate #!/usr/local/bin/perl on systems without #!
#
# The statement below is the classic shell/Perl polyglot described in
# perlrun.  If a shell rather than Perl ends up interpreting this file, the
# evals are meant to make it exec /usr/local/bin/perl on the script, passing
# the original arguments along (the ${1+"$@"} form is Bourne-shell syntax,
# the $argv:q form is csh syntax).  When Perl itself parses the file, the
# trailing "if 0" guarantees that the whole expression is never executed.
# Do not reformat it: the line break and the exact quoting matter to the
# shells.

eval '(exit $?0)' && eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
& eval 'exec /usr/local/bin/perl -S $0 $argv:q' if 0;

require 5;			# Perl 5 required, 5.001m recommended

### Parameters

# Environment

($whatami = $0)	=~ s|.*/||;	# `basename $0`
# chomp, not chop: only strip a trailing newline, never a real character
# (e.g. if the date command prints nothing or omits the newline).
chomp($date	= `date '+%d %b %Y'`);	# dd Mon yyyy
$isatty		= -t STDIN;	# True when standard input is a terminal

# Configuration
# Each of the flags below can be switched on from the command line; see the
# usage message for the corresponding option letters.

$delete		= 0;		# Delete the end of the first entry (e.g. a
				#   redundant TOC)
$force		= 0;		# Overwrite existing files
$single		= 0;		# Make single page instead of top and sections
$short		= 0;		# Omit the index file name from generated URLs
$truncate	= 0;		# Truncate last entry

$index		= 'index.html';	# Name of entry point in each directory
$tlb		= '<!-- TOP LINK BEGIN -->';	    # Link insertion points
$tle		= '<!-- TOP LINK END -->';
$blb		= '<!-- BOTTOM LINK BEGIN -->';
$ble		= '<!-- BOTTOM LINK END -->';
$deleteme	= "Topics covered in this FAQ:\n";  # Delete starting here
$author		= 'The SGI FAQ group';
$email		= 'sgi-faq@viz.tamu.edu';
$whereami	= 'http://www-viz.tamu.edu/~sgi-faq/tools/';

# Initialization (don't change these)

$out		= '';		# Output directory/file
$stdout		= 0;		# Write to standard output

### Arguments and error-checking

# Parse args

# Consume leading switches from @ARGV and set the corresponding globals
# ($single, $delete, $force, $short, $truncate, $out).  Switches may be
# bundled ("-1f") and the argument of -o may be attached ("-oname") or given
# as the next word ("-o name").  Parsing stops at the first word that does not
# look like a switch; a lone "-" is therefore left in @ARGV.  Any error is
# reported through &usage, which does not return.

sub parse_args {
    my ($first, $rest, $arg);

    while (@ARGV && $ARGV[0] =~ /^-(.)(.*)/) {
	($first, $rest) = ($1, $2);

	if ($first =~ /[o]/) {	# Switches with arguments
	    shift(@ARGV);
	    if ($rest ne '') {
		$arg = $rest;			# -oname
	    } elsif (@ARGV && $ARGV[0] ne '') {
		$arg = shift(@ARGV);		# -o name
	    } else {
		# Check @ARGV first so that a trailing -o does not read an
		# undefined $ARGV[0] (an uninitialized-value warning under -w).
		&usage("$whatami: -$first requires an argument.\n");
	    }
	} elsif ($rest ne '') {
	    $ARGV[0] = "-$rest";	# Bundled switches: peel one off
	} else {
	    shift(@ARGV);
	}

	if    ($first eq '1') { $single = 1; }
	elsif ($first eq 'd') { $delete = 1; }
	elsif ($first eq 'f') { $force = 1; }
	elsif ($first eq 's') { $short = 1; }
	elsif ($first eq 't') { $truncate = 1; }
	elsif ($first eq 'o') { $out = $arg; }
	elsif ($first eq 'u') { &usage(0); }
	else		      { &usage("$whatami: -$first is not an option.\n"); }
    }
}

&parse_args();

# Print an optional error message followed by the usage summary on standard
# error, then exit.  The exit status is 1 when a (true) message was given and
# 0 otherwise, so "usage(0)" is the way to ask for help without an error.
# Never returns.

sub usage {
    my ($message) = $_[0];

    warn $message if $message;
    # The synopsis lists every switch the parser accepts, including -s.
    warn <<EOP;
Usage: $whatami [-1dfstu] [-o out] [file]
-1	Make a single page instead of an index and individual entries
-d	Delete the end of the first entry (e.g. a redundant TOC)
-f	Overwrite existing files
-s	Shorten URLs by omitting "$index". Appropriate when output will be
	  read only through an HTTP server which knows to append "$index".
-t	Truncate the last entry, which in proper RFC1153 is uninteresting
-o out	Output directory or file name, '-' for standard output. If not given,
	  the basename of the archive-name or the input filename is used.
-u	This message
EOP
    exit !! $message;
}

# Postprocess arguments

# At most one input file may be named; with none, input must be piped in.
if (@ARGV > 1) {
    die "$whatami: Specify only one input file.\n";
}
if (@ARGV == 1) {
    $infile = $ARGV[0];
} elsif ($isatty) {
    die "$whatami: Specify an input file or provide one on standard input.\n";
}

# "-o -" selects standard output, which can only hold the single-page format.
if ($out eq '-') {
    die "$whatami: Can't write multi-file format to standard output.\n"
	unless $single;
    $stdout = 1;
}

# File name appended to "directory" URLs; empty when -s was given.
if ($short) {
    $urlindex = '';
} else {
    $urlindex = $index;
}

### Do it

# <> does not die on unreadable input, so test the named files up front.
# (-s is true only for an existing file of non-zero size.)

foreach $i (@ARGV) {
    die "$whatami: Can't read $i!\n" unless -s $i;
}

# Read the digest line by line.  A line starting with 30 or more hyphens
# (which covers both the 70- and the 30-hyphen separators) starts a new
# entry; every other line is HTML-escaped, hyperlinked and appended to the
# current entry.

@entries = ();
$i = 0;
while (<>) {
    if (/^-{30}/) {
	$i++;
	next;
    }
    &escape_html;
    &href unless /^Subject:/;	    # Subject lines go into the index unlinked
    push(@{$entries[$i]}, $_);
}

if ($truncate) {		    # Drop the last entry
    pop(@entries);
}

# Delete the end of the first entry beginning at the line $deleteme
# (skipped for empty input, so that no phantom first entry is created)

if ($delete && @entries) {
    $i = 0;
    while ($i <= $#{$entries[0]}) {
	if ($entries[0][$i] eq $deleteme) {
	    splice(@{$entries[0]}, $i);	    # Drop this line and all that follow
	    last;
	}
	$i++;
    }
}

# Trim leading and trailing blank lines from each entry

# Remove whitespace-only lines from both ends of the array referenced by
# $_[0], in place.  The emptiness checks keep an all-blank (or empty) entry
# from being tested past its end, which used to match against undef.

sub trim_blank_lines {
    my ($lines) = @_;

    shift(@$lines) while @$lines && $$lines[0] =~ /^\s*\n$/;
    pop(@$lines) while @$lines && $$lines[-1] =~ /^\s*\n$/;
}

foreach $i (@entries) {
    # Adjacent separator lines (or a leading one) leave undefined holes in
    # @entries; turn them into empty entries so they can be dereferenced.
    $i = [] unless defined $i;
    &trim_blank_lines($i);
}

# Find subject lines

# Return the subject of the entry whose lines are in the array referenced by
# $_[0]: the text after the first "Subject:" header, with any indented
# continuation lines that follow appended, separated by single spaces.
# Returns undef when the entry has no "Subject:" line.

sub find_subject {
    my ($lines) = @_;
    my ($j, $subject);

    for ($j = 0; $j <= $#$lines; $j++) {
	next unless $$lines[$j] =~ /^Subject:\s*(.*)/;
	$subject = $1;
	# Fold continuation lines (leading whitespace, then text)
	while (++$j <= $#$lines && $$lines[$j] =~ /^\s+(\S.*)/) {
	    $subject .= " $1";
	}
	return $subject;
    }
    return;
}

@subjects = ();
foreach $i (0 .. $#entries) {
    $subjects[$i] = &find_subject($entries[$i] || []);
}

# The first entry's subject doubles as the title of the whole digest
$title = defined $subjects[0] ? $subjects[0] : '(no subject)';
$subjects[0] = "Introduction";	# Now that we're done with it

# Last bit of RFC1153 has no "Subject:"
$subjects[$#entries] = "The End"
    if @entries && ! defined $subjects[$#entries];

# Any other entry without a subject still needs visible link text
foreach $i (0 .. $#entries) {
    $subjects[$i] = '(no subject)' unless defined $subjects[$i];
}

# Find output directory/file name if it wasn't specified

if (! $stdout && $out eq '') {
    # Use the last "Archive-name:" header of the first entry, if any ...
    $archive = '';
    foreach (@entries ? @{$entries[0]} : ()) {
	$archive = $1 if /^Archive-name:\s*(.*)/;
    }
    # ... else the input file name ($infile is undefined when the digest
    # arrived on standard input); either way keep only the basename.
    $out = $archive || (defined $infile ? $infile : '');
    $out =~ s|.*/||;
    # Without a name we would otherwise write ".html" or try to mkdir ''.
    $out ne '' ||
	die "$whatami: Can't determine an output name; specify one with -o.\n";
}

# Check for output directory and create if necessary

unless ($single) {
    if (-e $out) {
	if ($force) {
	    # "_" reuses the stat information gathered by -e above
	    -d _ || die "$whatami: Can't replace plain file $out with a directory!\n";
	} else {
	    die "$whatami: Output file $out already exists!\n";
	}
    } else {
	mkdir($out, 0755) || die "$whatami: Can't create directory $out!\n";
    }
}

# Make index (or single page)

# Unless writing to standard output, open the index (or the single page) and
# make it the default handle for the print statements that follow.

unless ($stdout) {
    $outfile = $single ? "$out.html" : "$out/$index";
    die "$whatami: Output file $outfile already exists!\n"
	if -e $outfile && ! $force;
    open(OUT, ">$outfile") || die "$whatami: Can't write to $outfile!\n";
    select OUT;
}

# Top of index (or single page)

print "<html>\n",
    "<head>\n",
    "<title>$title</title>\n",
    "</head>\n",
    "<body>\n",
    "$tlb\n",
    "$tle\n",
    "<h1>$title</h1>\n",
    "<ul>\n";

# Table of contents: one item per entry, linking to an anchor on this page
# in single-page mode and to the entry's own file otherwise

foreach $i (0 .. $#entries) {
    my $target = $single ? "#$i" : "$i.html";
    print "<li><a href=\"$target\">$subjects[$i]</a>\n";
}

print "</ul>\n";

# Make file (or add to page) for each entry

# Return the HTML for the "Up", "Next" and "Previous" links of entry number
# $_[0].  "Next" is omitted for the last entry and "Previous" for the first.
# Reads the globals $urlindex, $title, @subjects and @entries.

sub nav_links {
    my ($n) = @_;
    my $links = "<a href=\"./$urlindex\">Up: $title</a>\n";

    $links .= "<br><a href=\"" . ($n+1) . ".html\">Next: $subjects[$n+1]</a>\n"
	if $n < $#entries;
    $links .= "<br><a href=\"" . ($n-1) . ".html\">Previous: $subjects[$n-1]</a>\n"
	if $n > 0;
    return $links;
}

foreach $i (0 .. $#entries) {

    unless ($single) {

	# Each entry gets its own file, which becomes the default handle

	if (-e "$out/$i.html" && ! $force) {
	    die "$whatami: Output file $out/$i.html already exists!\n";
	}
	open(OUT2, ">$out/$i.html") ||
	    die "$whatami: Can't write to $out/$i.html!\n";
	select OUT2;

	# Top of entry's page

	print <<EOP;
<html>
<head>
<title>$subjects[$i]</title>
</head>
<body>
EOP
	# Up, Next, Previous links

	print &nav_links($i);
    }

    # <hr> before the entry

    print(
	($single ? "<a name=\"$i\">" : ''),	# Begin name anchor if $single
	'<hr>',
	($single ? '</a>' : ''),		# End name anchor if $single
	"\n"
    );
    print "<pre>\n", @{$entries[$i]}, "</pre>\n";	# Print the entry

    unless ($single) {

	# Up, Next, Previous links

	print "<hr>\n", &nav_links($i);

	# End of entry's page

	print <<EOP;
</body>
</html>
EOP
	# Buffered write errors (e.g. a full disk) only show up at close
	close(OUT2) || die "$whatami: Error writing $out/$i.html!\n";
    }
}

# Finish this digest's index (or single page)

# In multi-file mode the last entry's file was the default handle; go back to
# the index.  (In single-page mode the default handle never changed.)

select OUT unless $single;

# End of this digest's index (or single page)

print <<EOP;
$blb
$ble
<hr>
<address>
$author &lt;<a href="mailto:$email">$email</a>&gt;<br>
Generated by <a href="$whereami">$whatami</a>, $date
</address>
</body>
</html>
EOP

# OUT is only open when we are not writing to standard output.  Check the
# close: buffered write errors (e.g. a full disk) only show up here.

unless ($stdout) {
    close(OUT) || die "$whatami: Error writing $outfile!\n";
}

exit;

### Subroutines

# Escape HTML special characters
#
# Works in place on $_: every "&", "<" and ">" is replaced by the matching
# character entity.  Called for its effect on $_ only.

sub escape_html {
    my %entity = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');

    # One pass is enough: the replacements add no new "<" or ">", and the
    # "&" they add is never looked at again.
    s/([&<>])/$entity{$1}/g;
}

# Convert references to networked resources to hrefs
#
# Works in place on $_, which is expected to hold one line that has already
# been through escape_html.  Several kinds of reference are recognized in
# turn: URLs, site:/path, user@host, IRIX manpages, RFCs, newsgroups and
# (see "foo") cross-references.
#
# Each finished anchor is "parked": it is stored in @anchors and replaced in
# $_ by the placeholder "\0N\0" (N being its index) until all substitutions
# have run.  Without this, later patterns also matched inside anchors made by
# earlier ones -- "rfc1153" inside the href of an ftp URL, or "news.host"
# inside a mailto address -- and nested a second <a> inside the first.
# Patterns that accept arbitrary non-whitespace exclude \0 so that they can
# never swallow part of a placeholder.
#
# Reads the globals $single and $urlindex for cross-references.

sub href {

    my @anchors = ();		# Finished anchors, indexed by placeholder
    my $park = sub {		# Store an anchor, return its placeholder
	push(@anchors, $_[0]);
	"\0" . $#anchors . "\0";
    };

    # Comment and uncomment substitutions as desired

    # URL

    s{
	\b(?:file|ftp|gopher|s?http|https|mailto|s?news|telnet|wais)://
		    (?# Add protocols as desired)
	[^\s&<>]+   (?# May contain any character except whitespace or &<>)
	[\w/]	    (?# Last character must be a \w or / to avoid trailing
			punctuation)
    }{
	&$park('<a href="' . $& . '">' . $& . '</a>')
    }gex;

    # site:/path (treated as an ftp:// URL)

    s{
	([\w\.\-]+):/		(?# host:/)
	(
	    (?![:/\w])|		(?# No path
				    Must not precede :, / or \w, which would be
					part of a path or port spec)
	    [^/:\s\0](?![/:/\w])|	(?# One-character path
				    Must not be /, so we don't break real URLs
				    Must not precede :, / or \w, which would be
					more of a path or port spec)
	    [^/:\s\0][^\s\0]*[\w/]	(?# Longer path
				    First character must not be /, so we don't
					break real URLs
				    Last character must be a \w or / to avoid
					trailing punctuation)
	)
    }{
	&$park('<a href="ftp://' . $1 . '/' . $2 . '">' . $& . '</a>')
    }gex;

    # user@host
    # Last character of host must be a \w to avoid trailing periods

    s{[\w\.\-\+]+@[\w\.\-]*[\w]}{
	&$park('<a href="mailto:' . $& . '">' . $& . '</a>')
    }ge;

#   # <user@host>, for the conservative
#   # Last character of host must be a \w to avoid trailing punctuation
#
#   s#(&lt\;)[\w\.\-\+]+@[\w\.\-]*[\w](&gt\;)#$1<a href="mailto:$2">$2</a>$3#g;

    # IRIX manpages

    s#(\w+)\((\d)[a-zA-Z]{0,2}\)#
	&$park('<a href="http://reality.sgi.com/cgi-bin/uman?p=' . $1 .
	    '&s=' . $2 . '">' . $& . '</a>')
    #ge;

    # RFCs

    s{\brfc\s*(\d+)}{
	&$park('<a href="http://ds.internic.net/rfc/rfc' . $1 . '.txt">' .
	    $& . '</a>')
    }gie;

    # Newsgroups

    s{
	(^|[^/])		    (?# Must not follow / so we don't get
					newsgroup names in paths, e.g. at rtfm)
	(
	    \b(?:alt|arpa|bionet|bit|biz|comp|de|gnu|humanities|misc|news|rec|
		sci|soc|talk|vmsnet)
				    (?# Must begin with a known hierarchy so we
					don't get hostnames; add hierarchies as
					desired)
	    \.[\w\.\-\+]*
	    \w			    (?# Last character must be a \w to avoid
					trailing periods)
	)
	(?!/)			    (?# Must not precede / so we don't get
					newsgroup names in paths, e.g. at rtfm)
	(?!\.\*)		    (?# Must not precede .* so we don't get,
					e.g.,  comp.sys.sgi.*)
	(?!\w)			    (?# Must not precede \w, so we don't get
					partials)
	(?!\.\w)
    }{
	$1 . &$park('<a href="news:' . $2 . '">' . $2 . '</a>')
    }gex;

    # (See [also] "foo")
    # The SGI FAQs don't use this, so it's been only lightly tested

    s{(\(see\s+(?:also\s+)?\")([^\s\"\0]+)(\"\))}{
	"$1<a href=\"" .
	($single ? "$2.html" : "../$2/$urlindex") .
	"\">$2</a>$3"
    }egi;

    # Put the parked anchors back.  A \0N\0 sequence that does not name a
    # parked anchor (stray NULs in the input) is left untouched.

    s/\0(\d+)\0/defined $anchors[$1] ? $anchors[$1] : $&/ge;
}