#! /usr/local/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      stripsgml
##  Author:
##      Earl Hood       ehood@convex.com
##  Description:
##	Remove SGML markup.  Reads from STDIN, writes to STDOUT.
##	Character entities are converted to ASCII equivalents.  However,
##	due to the nature of some special characters, some special
##	characters may get lost in the output.
##
##  Usage:
##	% stripsgml [-html] < file.sgml > file.txt
##
##  	The -html option causes URLs in anchor elements of an HTML document
##	to be preserved in the text output.
##
##  Changes
##	0.1.1:  Syntax changes for execution under perl 5.
##
##---------------------------------------------------------------------------##
##  Copyright (C) 1994  Earl Hood, ehood@convex.com
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License
##  along with this program; if not, write to the Free Software
##  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##---------------------------------------------------------------------------##

package main;

##---------------------------------------------------------------------------##
## Store name of program ##
##	$PROG is $0 with any leading directory components removed.
($PROG = $0) =~ s/.*\///;
$VERSION = "0.1.1";

## Require libraries ##
##	The current directory is searched first for sgml.pl, then the
##	regular @INC path.
unshift(@INC, '.');
##	`require FILE || die' parses as `require(FILE || die ...)', so the
##	die can never fire.  Trap the failure with eval instead, so the
##	intended message is printed along with Perl's own reason in $@.
eval { require "sgml.pl"; 1 }
    or die "Unable to require sgml.pl: $@";

##---------------------------------------------------------------------------##
## Associative array for entity substitutions
##
##	Keys are entity names, or `#NNN' for numeric character references,
##	written WITHOUT the leading `&' and trailing `;' (the lookup code
##	indexes this table with the text found between those delimiters).
##	Values are the ASCII text written to the output in their place.
##
%Entity = (

#-----------------------------
# Numeric and Special Graphic
#-----------------------------
#   Entity      ASCII substitution
#   ----------  ------------------
    "half",	"1/2",	# fraction one-half
    "frac12",	"1/2",
    "#189",	"1/2",
    "frac14",	"1/4",	# fraction one-quarter
    "#188",	"1/4",
    "frac34",	"3/4",	# fraction three-quarters
    "#190",	"3/4",
    "frac18",	"1/8",	# fraction one-eighth
    "frac38",	"3/8",	# fraction three-eighths
    "frac58",	"5/8",	# fraction five-eighths
    "frac78",	"7/8",	# fraction seven-eighths
    "sup1",	"[1]",	# superscript one
    "#185",	"[1]",
    "sup2",	"[2]",	# superscript two
    "#178",	"[2]",
    "sup3",	"[3]",	# superscript three
    "#179",	"[3]",
    "plus",	"+",	# plus sign
    "plusmn",	"+/-",	# plus-or-minus sign
    "#177",	"+/-",
    "lt",	"<",	# less-than sign
    "equals",	"=",	# equals sign
    "gt",	">",	# greater-than sign
    "divide",	"/",	# divide sign
    "#247",	"/",
    "times",	"*",	# times sign
    "#215",	"*",
    "curren",	"\$",	# general currency sign
    "#164",	"\$",
    "pound",	"L",	# pound sign
    "#163",	"L",
    "dollar",	"\$",	# dollar sign
    "cent",	"c",	# cent sign
    "#162",	"c",
    "yen",	"Y",	# yen sign
    "#165",	"Y",
    "num",	"#",	# number sign
    "percent",	"%", 	# percent sign
    "amp",	"&",	# ampersand
    "ast",	"*",	# asterisk
    "commat",	"@",	# commercial at
    "lsqb",	"[",	# left square bracket
    "bsol",	"\\",	# reverse solidus
    "rsqb",	"]",	# right square bracket
    "lcub",	"{",	# left curly bracket
    "verbar",	"|",	# vertical bar
    "rcub",	"}",	# right curly bracket
    "copy",	"(C)",	# copyright sign
    "#169",	"(C)",
    "reg",	"(R)",	# registered sign
    "#174",	"(R)",
    "trade",	"(TM)",	# trademark
    "brvbar",	"|",	# broken vertical bar
    "#166",	"|",
    "excl",	"!",	# exclamation mark
    "quot",	'"',	# quotation mark
    "apos",	"'",	# apostrophe
    "lpar",	"(",	# left parenthesis
    "rpar",	")",	# right parenthesis
    "comma",	",",	# comma
    "lowbar",	"_",	# low line
    "hyphen",	"-",	# hyphen
    "period",	".",	# full stop, period
    "sol",	"/",	# solidus
    "colon",	":",	# colon
    "semi",	";",	# semicolon
    "quest",	"?",	# question mark
    "laquo",	"<<",	# angle quotation mark, left
    "#171",	"<<",	# (key must not carry the leading `&')
    "raquo",	">>",	# angle quotation mark, right
    "#187",	">>",
    "lsquo",	"'",	# single quotation mark, left
    "rsquo",	"'",	# single quotation mark, right
    "ldquo",	'"',	# double quotation mark, left
    "rdquo",	'"',	# double quotation mark, right
    "nbsp",	" ",	# no break (required) space
    "#160",	" ",
    "shy",	"-",	# soft hyphen
    "#173",	"-",

#---------------
# Added Latin 1
#---------------
#   Entity      ASCII substitution
#   ----------  ------------------
    "aacute",	"a",	# small a, acute accent
    "#225",	"a",
    "Aacute",	"A",	# capital A, acute accent
    "#193",	"A",
    "acirc",	"a",	# small a, circumflex accent
    "#226",	"a",
    "Acirc",	"A",	# capital A, circumflex accent
    "#194",	"A",
    "agrave",	"a",	# small a, grave accent
    "#224",	"a",
    "Agrave",	"A",	# capital A, grave accent
    "#192",	"A",
    "aring",	"a",	# small a, ring
    "#229",	"a",
    "Aring",	"A",	# capital A, ring
    "#197",	"A",
    "atilde",	"a",	# small a, tilde
    "#227",	"a",
    "Atilde",	"A",	# capital A, tilde
    "#195",	"A",
    "auml",	"a",	# small a, dieresis or umlaut mark
    "#228",	"a",
    "Auml",	"A",	# capital A, dieresis or umlaut mark
    "#196",	"A",
    "aelig",	"ae",	# small ae diphthong (ligature)
    "#230",	"ae",
    "AElig",	"AE",	# capital AE diphthong (ligature)
    "#198",	"AE",
    "ccedil",	"c",	# small c, cedilla
    "#231",	"c",
    "Ccedil",	"C",	# capital C, cedilla
    "#199",	"C",
    "eth",	"d",	# small eth, Icelandic
    "#240",	"d",
    "ETH",	"D",	# capital Eth, Icelandic
    "#208",	"D",
    "eacute",	"e",	# small e, acute accent
    "#233",	"e",
    "Eacute",	"E",	# capital E, acute accent
    "#201",	"E",
    "ecirc",	"e",	# small e, circumflex accent
    "#234",	"e",
    "Ecirc",	"E",	# capital E, circumflex accent
    "#202",	"E",
    "egrave",	"e",	# small e, grave accent
    "#232",	"e",
    "Egrave",	"E",	# capital E, grave accent
    "#200",	"E",
    "euml",	"e",	# small e, dieresis or umlaut mark
    "#235",	"e",
    "Euml",	"E",	# capital E, dieresis or umlaut mark
    "#203",	"E",
    "iacute",	"i",	# small i, acute accent
    "#237",	"i",
    "Iacute",	"I",	# capital I, acute accent
    "#205",	"I",
    "icirc",	"i",	# small i, circumflex accent
    "#238",	"i",
    "Icirc",	"I",	# capital I, circumflex accent
    "#206",	"I",
    "igrave",	"i",	# small i, grave accent
    "#236",	"i",
    "Igrave",	"I",	# capital I, grave accent
    "#204",	"I",
    "iuml",	"i",	# small i, dieresis or umlaut mark
    "#239",	"i",
    "Iuml",	"I",	# capital I, dieresis or umlaut mark
    "#207",	"I",
    "ntilde",	"n",	# small n, tilde
    "#241",	"n",
    "Ntilde",	"N",	# capital N, tilde
    "#209",	"N",
    "oacute",	"o",	# small o, acute accent
    "#243",	"o",
    "Oacute",	"O",	# capital O, acute accent
    "#211",	"O",
    "ocirc",	"o",	# small o, circumflex accent
    "#244",	"o",
    "Ocirc",	"O",	# capital O, circumflex accent
    "#212",	"O",
    "ograve",	"o",	# small o, grave accent
    "#242",	"o",
    "Ograve",	"O",	# capital O, grave accent
    "#210",	"O",
    "oslash",	"o",	# small o, slash
    "#248",	"o",
    "Oslash",	"O",	# capital O, slash
    "#216",	"O",
    "otilde",	"o",	# small o, tilde
    "#245",	"o",
    "Otilde",	"O",	# capital O, tilde
    "#213",	"O",
    "ouml",	"o",	# small o, dieresis or umlaut mark
    "#246",	"o",
    "Ouml",	"O",	# capital O, dieresis or umlaut mark
    "#214",	"O",
    "szlig",	"s",	# small sharp s, German (sz ligature)
    "#223",	"s",
    "thorn",	"p",	# small thorn, Icelandic
    "#254",	"p",
    "THORN",	"P",	# capital THORN, Icelandic
    "#222",	"P",
    "uacute",	"u",	# small u, acute accent
    "#250",	"u",
    "Uacute",	"U",	# capital U, acute accent
    "#218",	"U",
    "ucirc",	"u",	# small u, circumflex accent
    "#251",	"u",
    "Ucirc",	"U",	# capital U, circumflex accent
    "#219",	"U",
    "ugrave",	"u",	# small u, grave accent
    "#249",	"u",
    "Ugrave",	"U",	# capital U, grave accent
    "#217",	"U",
    "uuml",	"u",	# small u, dieresis or umlaut mark
    "#252",	"u",
    "Uuml",	"U",	# capital U, dieresis or umlaut mark
    "#220",	"U",
    "yacute",	"y",	# small y, acute accent
    "#253",	"y",
    "Yacute",	"Y",	# capital Y, acute accent
    "#221",	"Y",
    "yuml",	"y",	# small y, dieresis or umlaut mark
    "#255",	"y",
);
##---------------------------------------------------------------------------##
##  entity_to_ascii
##	Return the ASCII replacement for an entity or character reference.
##
##	Argument:
##	    $name   The text found between `&' and `;' (e.g. "amp", "#169").
##
##	Returns:
##	    1. The %Entity value, if $name is a key of that table.
##	    2. Otherwise, for a decimal (#NNN) or hexadecimal (#xHH)
##	       character reference to a printable ASCII character, tab,
##	       newline or carriage return: that character.
##	    3. Otherwise the empty string, i.e. the reference is dropped
##	       from the output.
##
sub entity_to_ascii {
    my($name) = @_;	# Copy first: @_ may alias $1, clobbered by matches below
    my($code);

    return $Entity{$name}  if defined $Entity{$name};

    if ($name =~ /^#(\d+)$/) {
	$code = $1;
    } elsif ($name =~ /^#[xX]([0-9a-fA-F]+)$/) {
	$code = hex($1);
    } else {
	return '';
    }

    return chr($code)
	if ($code >= 32 && $code <= 126)
	    || $code == 9 || $code == 10 || $code == 13;
    return '';
}

##---------------------------------------------------------------------------##
##------------##
## Begin MAIN ##
##------------##
##	Reads the document from STDIN via SGMLread_sgml (from sgml.pl),
##	skips every item that starts with `<', converts entity references in
##	the remaining items and prints them to STDOUT.  When -html (or
##	--html) is given, the href value of each anchor start tag is
##	remembered and printed as " <URL:...>" at the anchor's end tag.
##
##	NOTE(review): this assumes SGMLread_sgml stores markup and text as
##	separate items of @array, each tag beginning with `<' -- confirm
##	against sgml.pl.
{

local(@array);		# Package variable: passed as a typeglob below

$url = '';		# href of the anchor currently open, if any

## Require an exact option word rather than a substring of the arguments
$HTML = 1  if grep(/^--?html$/, @ARGV);

&SGMLread_sgml(STDIN, *array);
foreach (@array) {
    if ($HTML) {
	if (/^<A\s/i) {			# Check for URL
	    ## Accept "double quoted", 'single quoted' and unquoted values;
	    ## a quoted value ends only at its own kind of quote.
	    if (/href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']+))/i) {
		$url = defined($1) ? $1 : defined($2) ? $2 : $3;
	    } else {
		$url = '';
	    }
	} elsif (m%^</A\s*>%i) {	# Output URL at anchor end
	    print STDOUT " <URL:$url>"  if $url ne '';
	    $url = '';			# Do not repeat it for a stray </A>
	}
    }
    next if /^</;			# Markup is not copied to the output
    s/\&([#\w._-]+);/entity_to_ascii($1)/ge;
    print STDOUT $_;
}
exit 0;

}
##----------##
## End MAIN ##
##----------##
##---------------------------------------------------------------------------##
