#!/usr/bin/perl -w
# Take the HTML m-w.com's wwwebster returns and process it into something
# readable.
#
# Hacky kludge.  Works for now.

# Marker lines the filter looks for.  Input lines are compared against these
# with "eq" before being chomped (only carriage returns are removed first),
# so every marker carries its trailing newline.

# Status line of the HTTP response.  Seeing it switches output off until
# $inputstart turns up.
my $ignorestart = "HTTP/1.1 200 OK\n";

# Line that switches output back on.
# Previously [<a href="/mw/theslimt.htm">Thesaurus Search</a>]
# then alt="WWWebster Dictionary"></center>
my $inputstart = qq{<!-------AD BANNER------->\n};

# Line that switches output off again after the entry.
# Previously <br clear=left><img src="/mw/thinbar.gif" alt="--------------------">
my $inputend = qq{<hr width="75%">\n};

# Line that introduces the list of alternative entries; the list is dropped
# and replaced by a one-line note.
my $clipstart =
	"To view an entry in the list, highlight it and click on GO TO.\n";

# Filter state, all initially off:
#   $ignoring - skipping lines until $inputstart is seen
#   $clipping - skipping the list of alternatives until its closing </pre>
#   $intag    - are we inside an HTML tag that began on an earlier line?
my ($ignoring, $clipping, $intag) = (0, 0, 0);

# Turn one chomped line of HTML into plain text.
#
# Arguments:
#   $text   - the line, without its trailing newline
#   $in_tag - true if an HTML tag opened on an earlier line is still unclosed
# Returns a two-element list: the text to print, and the updated "inside a
# tag" flag (1 when the line ends inside an unclosed tag, otherwise 0).
sub _strip_markup {
	my ($text, $in_tag) = @_;

	if ($in_tag) {
		# Finish the tag carried over from an earlier line before looking
		# at anything else.  A line with no '>' at all lies entirely inside
		# that tag, so it contributes no text.
		return ('', 1) unless $text =~ s/\A[^>]*>//;
		$in_tag = 0;
	}

	$text =~ s/<br>/\n/g;              # line breaks become real newlines
	$text =~ s/<.*?>//g;               # tags that open and close on this line
	$in_tag = 1 if $text =~ s/<.*//;   # a tag left open at the end of the line

	# Decode entities only after the tags are gone, so an escaped "<" is not
	# mistaken for markup, and decode &amp; last so that "&amp;lt;" yields
	# the literal text "&lt;" instead of being decoded twice into "<".
	$text =~ s/&gt;/>/g;
	$text =~ s/&lt;/</g;
	$text =~ s/&amp;/&/g;

	return ($text, $in_tag);
}

# Main filter: copy the interesting part of the page to standard output as
# plain text.  Lines are printed without their original newlines; only <br>
# tags (and the note about clipped alternatives) produce line breaks.
while (my $line = <>) {
	$line =~ s/\r//g;

	if ($ignoring) {
		# Output is switched off until the start marker turns up; the marker
		# line itself is not printed either.
		$ignoring = 0 if $line eq $inputstart;
		next;
	}

	# The end marker switches output off from the next line on; this line
	# still goes through the normal path below.
	$ignoring = 1 if $line eq $inputend;

	if ($line eq $ignorestart) {
		$ignoring = 1;
		next;
	}

	$clipping = 1 if $line eq $clipstart;
	if ($clipping) {
		# Drop the list of alternatives up to its closing </pre>, leave a
		# note in its place, and then process the </pre> line normally.
		next unless $line =~ m#^</pre>#;
		print "(Alternatives not shown)\n";
		$clipping = 0;
	}

	chomp $line;
	($line, $intag) = _strip_markup($line, $intag);
	print $line;
}

print "\n";
