#! /usr/bin/perl
#------------------------------------------------------------------------

=head1 HTML::TextToHTML

HTML::TextToHTML - convert plain text file to HTML

=head1 SYNOPSIS

  From the command line:

    perl -MHTML::TextToHTML -e run_txt2html -- --help;
    (prints this synopsis)

    perl -MHTML::TextToHTML -e run_txt2html -- --man;
    (prints this manpage)

    perl -MHTML::TextToHTML -e run_txt2html -- I<arguments>;
    (calls the txt2html method with the given arguments)

  From Scripts:

    use HTML::TextToHTML;
 
    # create a new object
    my $conv = new HTML::TextToHTML();

    # convert a file
    $conv->txt2html(["--file", $text_file,
                     "--outfile", $html_file,
		     "--title", "Wonderful Things",
			 "--mail"
      ]);

    # convert a string
    $newstring = $conv->process_para($mystring);

=head1 DESCRIPTION

HTML::TextToHTML converts plain text files to HTML.

It supports headings, tables, lists, simple character markup, and
hyperlinking, and is highly customizable. It recognizes some of the
apparent structure of the source document (mostly whitespace and
typographic layout), and attempts to mark that structure explicitly
using HTML. The purpose for this tool is to provide an easier way of
converting existing text documents to HTML format.

There are two ways to use this module:
    (1) called from a perl script
    (2) call run_txt2html from the command line

The first usage requires one to create a HTML::TextToHTML object, and
then call the txt2html or process_para method with suitable arguments.
Because this object is a subclass of AppConfig, one can use all the
power of AppConfig for defining and parsing options/arguments.  One can
also pass arguments in when creating the object, or call the args method
to pass arguments in.

The second usage allows one to pass arguments in from the command line, by
calling perl and executing the module, and calling run_txt2html which
creates an object for you and parses the command line.

Either way, the arguments are the same.  See L</OPTIONS> for the
arguments.

The following are the public functions of this module:

=over 4

=item run_txt2html

    run_txt2html()

This is exported, and is what is used to run this module from the
command-line.  It creates a HTML::TextToHTML object and parses the
command-line arguments, and passes them to the object, and runs
the txt2html method.  It takes no arguments.

=item new

    $conv = new HTML::TextToHTML(\@args)

Create a new object with new.  It has an optional argument of a reference
to an array -- this contains the arguments/options to customize the
conversion.

=item txt2html

    $conv->txt2html(\@args)

txt2html is the workhorse of this module; this does the conversion
of a text file to HTML.  It has an optional argument of a reference
to an array -- this contains the arguments/options to customize
the conversion (besides which, it needs to know what file(s) to
convert!).

=item process_para

    $newstring = $conv->process_para($mystring)

Convert a string to HTML, treating it as if it were a single paragraph.
This returns the processed string.  If you want to pass arguments to alter
the behaviour of this conversion, you need to do that earlier, either
when you create the object, or with the C<args> method (see below).

=item args

    $conv->args(\@args)

    $conv->args(["--infile", "CLEAR"]);

Updates the current arguments/options of the HTML::TextToHTML object.
Takes a reference to an array of arguments, which will be used
in invocations of other methods.

=item do_help

    $conv->do_help();

Output the default help or manpage message (and exit) if the --help or
--manpage options are set.  This is explicitly called inside
I<txt2html> and I<run_txt2html>, so you only need to call this
if you wish to trigger the help action without having called those
methods.

If --manpage is true, this displays all the PoD documentation
of the calling program.  Otherwise, if --help is true, then this
displays the SYNOPSIS information from the PoD documentation
of the calling program.

=back

=head1 OPTIONS

=head2 A Note about Options

Options can start with '--' or '-'.  If it is a yes/no option, that is the
only part of the option (and such an option can be prefaced with "no" to
negate it).  If the option takes a value, then the list must be
("--option", "value").

Order does matter.  For options which are yes/no options, a later
argument overrides an earlier one.  For arguments which are single values,
a later value replaces an earlier one.  For arguments which are
cumulative, a later argument is added on to the list.  For such arguments,
if you want to clear the old value and start afresh, give it the
special value of CLEAR.

=over 8

=item --append_file I<filename> | --append I<filename> | --append_body I<filename>

If you want something appended by default, put the filename here.
The appended text will not be processed at all, so make sure it's
plain text or decent HTML.  i.e. do not have things like:
    Kathryn Andersen E<lt>rubykat@katspace.comE<gt>
but instead, have:
    Kathryn Andersen &lt;rubykat@katspace.com&gt;

(default: nothing)

=item --append_head I<filename> | -ah I<filename>

If you want something appended to the head by default, put the filename here.
The appended text will not be processed at all, so make sure it's
plain text or decent HTML.  i.e. do not have things like:
    Kathryn Andersen E<lt>rubykat@katspace.comE<gt>
but instead, have:
    Kathryn Andersen &lt;rubykat@katspace.com&gt;

(default: nothing)

=item --caps_tag I<tag> | --capstag I<tag> | -ct I<tag>

Tag to put around all-caps lines
(default: STRONG)

=item --config I<file>

A file containing options, which is read in, and the options from the file
are treated as if they were in the argument list at the point at which the
--config option was.  See L</Config File> for more information.

=item --custom_heading_regexp I<regexp> | --heading I<regexp> | -H I<regexp>

Add a regexp for headings.  Header levels are assigned by regexp
in order seen.  When a line matches a custom header regexp, it is tagged as
a header.  If it's the first time that particular regexp has matched,
the next available header level is associated with it and applied to
the line.  Any later matches of that regexp will use the same header level.
Therefore, if you want to match numbered header lines, you could use
something like this:
    -H '^ *\d+\. \w+' -H '^ *\d+\.\d+\. \w+' -H '^ *\d+\.\d+\.\d+\. \w+'

Then lines like
                " 1. Examples "
                " 1.1. Things"
            and " 4.2.5. Cold Fusion"
would be marked as H1, H2, and H3 (assuming they were found in that
order, and that no other header styles were encountered).
If you prefer that the first one specified always be H1, the second
always be H2, the third H3, etc, then use the -EH/--explicit_headings
option.

This is a multi-valued option.

(default: none)

=item --debug

Enable copious script debugging output (don't bother, this is for the
developer)

=item --default_link_dict I<filename>

The name of the default "user" link dictionary.
(default: "$ENV{'HOME'}/.txt2html.dict" -- this is the same as for
the txt2html script)

=item --dict_debug I<n> | -db I<n>

Debug mode for link dictionaries.  Bitwise-Or what you want to see:
          1: The parsing of the dictionary
          2: The code that will make the links
          4: When each rule matches something

(default: 0)

=item --doctype I<doctype> | --dt I<doctype>

This gets put in the DOCTYPE field at the
top of the document, unless it's empty.
(default : "-//W3C//DTD HTML 3.2 Final//EN")

=item --eight_bit_clean | --eight_bit

disable Latin-1 character entity naming
(default: false)

=item --escape_HTML_chars | --escape_chars | -ec

turn & E<lt> E<gt> into &amp; &lt; &gt;
(default: true)

=item --explicit_headings | -EH

Don't try to find any headings except the ones specified in the
--custom_heading_regexp option.
Also, the custom headings will not be assigned levels in the order they
are encountered in the document, but in the order they are specified on
the command line.
(default: false)

=item --extract

Extract Mode; don't put HTML headers or footers on the result, just
the plain HTML (thus making the result suitable for inserting into
another document, or as part of the output of a CGI script).
(default: false)

=item --hrule_min I<n> | --hrule I<n> | -r I<n>

Min number of ---s for an HRule.
(default: 4)

=item --indent_width I<n> | --indent I<n> | -iw I<n>

Indents this many spaces for each level of a list.
(default: 2)

=item --indent_par_break | -ipb

Treat paragraphs marked solely by indents as breaks with indents.
That is, instead of taking a three-space indent as a new paragraph,
put in a <BR> and three non-breaking spaces instead.
(see also --preserve_indent)
(default: false)

=item --infile I<filename> | --file I<filename>

The name of the input file.
This is a cumulative list argument.  If you want to process more than
one file, just add another --file I<file> to the list of arguments.  If
you want to process a different file, you need to CLEAR this argument
first.
(default:undefined)

=item --links_dictionaries I<filename> | --link I<filename> | -l I<filename>

File to use as a link-dictionary.  There can be more than one of these.
These are in addition to the System Link Dictionary and the User Link
Dictionary.

=item --link_only | --linkonly | -LO

Do no escaping or marking up at all, except for processing the links
dictionary file and applying it.  This is useful if you want to use
the linking feature on an HTML document.  If the HTML is a
complete document (includes HTML,HEAD,BODY tags, etc) then you'll
probably want to use the --extract option also.
(default: false)

=item --mailmode | --mail | -m

Deal with mail headers & quoted text
(default: false)

=item --make_anchors | --anchors

Should we try to make anchors in headings?
(default: true)

=item --make_links

Should we try to build links?
(default: true)

=item --make_tables | --tables

Should we try to build tables?

If true, spots tables and marks them up appropriately.  A table must be
marked as a separate paragraph, that is, it must be surrounded by blank
lines.  Columns must be separated by two or more spaces (this prevents
accidental incorrect recognition of a paragraph where interword spaces
happen to line up).  If there are two or more rows in a paragraph and
all rows share the same set of (two or more) columns, the paragraph is
assumed to be a table.  For example

    -e  File exists.
    -z  File has zero size.
    -s  File has nonzero size (returns size).

becomes

    <TABLE>
    <TR><TD>-e</TD><TD>File exists.</TD></TR>
    <TR><TD>-z</TD><TD>File has zero size.</TD></TR>
    <TR><TD>-s</TD><TD>File has nonzero size (returns size).</TD></TR>
    </TABLE>

This guesses for each column whether it is intended to be left,
centre or right aligned.

This overrides the detection of lists; if something looks like a table,
it is taken as a table, and list-checking is not done for that
paragraph.

(default: false)

=item --min_caps_length I<n> | --caps I<n> | -c I<n>

min sequential CAPS for an all-caps line
(default: 3)

=item --outfile I<filename>

The name of the output file.  If it is "-" then the output goes
to Standard Output.
(default: - )

=item --par_indent I<n>

Minimum number of spaces indented in first lines of paragraphs.
Only used when there's no blank line preceding the new paragraph.
(default: 2)

=item --preformat_trigger_lines I<n> | --prebegin I<n> | -pb I<n>

How many lines of preformatted-looking text are needed to switch to <PRE>
          <= 0 : Preformat entire document
             1 : one line triggers
          >= 2 : two lines trigger

(default: 2)

=item --endpreformat_trigger_lines I<n> | --preend I<n> | -pe I<n>

How many lines of unpreformatted-looking text are needed to switch from <PRE>
           <= 0 : Never preformat within document
              1 : one line triggers
           >= 2 : two lines trigger
(default: 2)

NOTE for --prebegin and --preend:
A zero takes precedence.  If one is zero, the other is ignored.
If both are zero, entire document is preformatted.

=item --preformat_start_marker I<regexp>

What flags the start of a preformatted section if --use_preformat_marker
is true.

(default: "^(:?(:?&lt;)|<)PRE(:?(:?&gt;)|>)\$")

=item --preformat_end_marker I<regexp>

What flags the end of a preformatted section if --use_preformat_marker
is true.

(default: "^(:?(:?&lt;)|<)/PRE(:?(:?&gt;)|>)\$")

=item --preformat_whitespace_min I<n> | --prewhite I<n> | -p I<n>

Minimum number of consecutive whitespace characters to trigger
normal preformatting. 
NOTE: Tabs are expanded to spaces before this check is made.
That means if B<tab_width> is 8 and this is 5, then one tab may be
expanded to 8 spaces, which is enough to trigger preformatting.
(default: 5)

=item --prepend_file I<filename> | --prepend_body I<filename> | --pp I<filename>

If you want something prepended to the processed body text, put the
filename here.  The prepended text will not be processed at all, so make
sure it's plain text or decent HTML.

(default: nothing)

=item --preserve_indent | -pi

Preserve the first-line indentation of paragraphs marked with indents
by replacing the spaces of the first line with non-breaking spaces.
(default: false)


=item --short_line_length I<n> | --shortline I<n> | -s I<n>

Lines this short (or shorter) must be intentionally broken and are kept
that short.
(default: 40)

=item --system_link_dict I<filename>

The name of the default "system" link dictionary.
(default: "/usr/share/txt2html/txt2html.dict" -- this is the same as for
the txt2html script)

=item --tab_width I<n> | --tabwidth I<n> | -tw I<n>

How many spaces equal a tab?
(default: 8)

=item --title I<title> | -t I<title>

You can specify a title.  Otherwise it will use a blank one.
(default: nothing)

=item --titlefirst | -tf

Use the first non-blank line as the title.

=item --underline_length_tolerance I<n> | --ulength I<n> | -ul I<n>

How much longer or shorter can underlines be and still be underlines?
(default: 1)

=item --underline_offset_tolerance I<n> | --uoffset I<n> | -uo I<n>

How far offset can underlines be and still be underlines?
(default: 1)

=item --unhyphenation | --unhypnenate | -u

Enables unhyphenation of text.
(default: true)

=item --use_mosaic_header | --mosaic | -mh

Use this option if you want to force the heading styles to match what Mosaic
outputs.  (Underlined with "***"s is H1,
with "==="s is H2, with "+++" is H3, with "---" is H4, with "~~~" is H5
and with "..." is H6)
This was the behavior of txt2html up to version 1.10.
(default: false)

=item --use_preformat_marker | --preformat_marker | -pm

Turn on preformatting when encountering
"<PRE>" on a line by itself, and turn it
off when there's a line containing only "</PRE>".
(default: off)

=back

=head1 FILE FORMATS

There are two kinds of files which are used which can affect the outcome of
the conversion.  One is a config file, which can contain any of the
arguments described in L</OPTIONS> and is treated the same as if they were
entered in a script or from the command line.  The other is the link
dictionary, which contains patterns (of how to recognise http links and
other things) and how to convert them.

=head2 Config File

The Config file is a way of specifying default options in a file instead
of having to do it when you call this.

The file may contain blank lines and comments (prefixed by
'#') which are ignored.  Continuation lines may be marked
by ending the line with a '\'.

    # this is a comment
    title = Page of Wonderful and Inexplicably Joyous \
    Things You Want To Know About

Options that are simple flags and do not expect an argument can be
specified without any value.  They will be set with the value 1, with any
value explicitly specified (except "0" and "off") being ignored.  The
option may also be specified with a "no" prefix to implicitly set the
variable to 0.

    mail                                 # on (1)
    mail = 1                             # on (1)
    mail = 0                             # off (0)
    mail off                             # off (0)
    mail on                              # on (1)
    mail mumble                          # on (1)
    nomail                               # off (0)

Options that expect an argument (but are not cumulative) will
be set to whatever follows the variable name, up to the end of the
current line.  An equals sign may be inserted between the option
and value for clarity.

    tab_width = 8
    tab_width   4

Each subsequent re-definition of the option value overwrites
the previous value.  From the above example, the value of the tab
width would now be 4.

Some options are simple cumulative options, with each subsequent
definition of the option adding to the list of previously set values
for that option.

    heading = '^ *\d+\. \w+'
    heading = '^ *\d+\.\d+\. \w+'
    heading = '^ *\d+\.\d+\.\d+\. \w+'

If you want to clear the list and start again, give the CLEAR option.

    heading = CLEAR

The '-' prefix can be used to reset a variable to its
default value and the '+' prefix can be used to set it to 1.

    -mail
    +debug

Option values may contain references to other options, environment
variables and/or users' home directories.

    link = ~/.link_dict	# expand '~' to home directory

    mail = ${TXT_MAIL}   # expand TXT_MAIL environment variable

The configuration file may have options arranged in blocks.  A block
header, consisting of the block name in square brackets, introduces a
configuration block.  The block name and an underscore are then prefixed to
the names of all options subsequently referenced in that block.  The
block continues until the next block definition or to the end of the
current file.

    [underline]
    length_tolerance = 8    # underline_length_tolerance = 8
    offset_tolerance = 4    # underline_offset_tolerance = 4

See AppConfig for more information.

=head2 Link Dictionary

A link dictionary file contains patterns to match, and what to convert
them to.  It is called a "link" dictionary because it was intended to be
something which defined what a href link was, but it can be used for
more than that.  However, if you wish to define your own links, it is
strongly advised to read up on regular expressions (regexes) because
this relies heavily on them.

The file consists of comments (which are lines starting with #)
and blank lines, and link entries.
Each entry consists of a regular expression, a -> separator (with
optional flags), and a link "result".

In the simplest case, with no flags, the regular expression
defines the pattern to look for, and the result says what part
of the regular expression is the actual link, and the link which
is generated has the href as the link, and the whole matched pattern
as the visible part of the link.  The first character of the regular
expression is taken to be the separator for the regex, so one
could either use the traditional / separator, or something else
such as | (which can be helpful with URLs which are full of / characters).

So, for example, an ftp URL might be defined as:

    |ftp:[\w/\.:+\-]+|      -> $&

This takes the whole pattern as the href, and the resultant link
has the same thing in the href as in the contents of the anchor.

But sometimes the href isn't the whole pattern.

    /&lt;URL:\s*(\S+?)\s*&gt;/ --> $1

With the above regex, a () grouping marks the first subexpression,
which is represented as $1 (rather than $& the whole expression).
This entry matches a URL which was marked explicitly as a URL
with the pattern <URL:foo>  (note the &lt; is shown as the
entity, not the actual character.  This is because by the
time the links dictionary is checked, all such things have
already been converted to their HTML entity forms)
This would give us a link in the form
<A HREF="foo">&lt;URL:foo&gt;</A>

B<The h flag>

However, if we want more control over the way the link is constructed,
we can construct it ourself.  If one gives the h flag, then the
"result" part of the entry is taken not to contain the href part of
the link, but the whole link.

For example, the entry:

    /&lt;URL:\s*(\S+?)\s*&gt;/ -h-> <A HREF="$1">$1</A>

will take <URL:foo> and give us <A HREF="foo">foo</A>

However, this is a very powerful mechanism, because it
can be used to construct custom tags which aren't links at all.
For example, to flag *italicised words* the following
entry will surround the words with EM tags.

    /\B\*([a-z][a-z -]*[a-z])\*\B/ -hi-> <EM>$1</EM>

B<The i flag>

This turns on ignore case in the pattern matching.

B<The e flag>

This turns on execute in the pattern substitution.  This really
only makes sense if h is turned on too.  In that case, the "result"
part of the entry is taken as perl code to be executed, and the
result of that code is what replaces the pattern.

B<The o flag>

This marks the entry as a once-only link.  This will convert the
first instance of a matching pattern, and ignore any others
further on.

For example, the following pattern will take the first mention
of HTML::TextToHTML and convert it to a link to the module's home page.

    "HTML::TextToHTML"  -io-> http://www.katspace.com/tools/text_to_html/

=head1 EXAMPLES

    use HTML::TextToHTML;
 
=head2 Create a new object

    my $conv = new HTML::TextToHTML();

    my $conv = new HTML::TextToHTML(["--title", "Wonderful Things",
			    "--system_link_dict", $my_link_file,
      ]);

    my $conv = new HTML::TextToHTML(\@ARGV);

=head2 Add further arguments

    $conv->args(["--short_line_length", 60,
	       "--prebegin", 4,
	       "--caps_tag", "strong",
      ]);

=head2 Convert a file

    $conv->txt2html(["--file", $text_file,
                     "--outfile", $html_file,
		     "--title", "Wonderful Things",
			 "--mail"
      ]);

=head1 NOTES

=over 4

=item *

One cannot use "CLEAR" as a value for the cumulative arguments.

=item *

If the underline used to mark a header is off by more than 1, then 
that part of the text will not be picked up as a header unless you
change the value of --underline_length_tolerance and/or
--underline_offset_tolerance.  People tend to forget this.

=back

=head1 BUGS

Tell me about them.

=head1 PREREQUISITES

HTML::TextToHTML requires Perl 5.005_03 or later.

It also requires AppConfig, HTML::SimpleParse,
Data::Dumper (only for debugging purposes)
and Pod::Usage.

=head1 EXPORT

run_txt2html

=head1 AUTHOR

Kathryn Andersen, E<lt>rubykat@katspace.comE<gt>

=head1 SEE ALSO

L<perl>, L<txt2html>, L<AppConfig>, L<Pod::Usage>, L<Data::Dumper>

=cut

#------------------------------------------------------------------------
package HTML::TextToHTML;

use 5.005_03;
use strict;
use warnings;
use diagnostics;

require Exporter;
use vars qw($VERSION $PROG @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

BEGIN {
    @ISA = qw(Exporter AppConfig);
    require Exporter;
    use AppConfig qw(:argcount);
    use Data::Dumper;
    use HTML::SimpleParse;
    use Pod::Usage;
}

# Export configuration.
#
# run_txt2html is the only name exported by default; it is what lets the
# module be run from the command line:
#     perl -MHTML::TextToHTML -e run_txt2html -- ...
# Nothing is available for export on request: the ':all' tag, and hence
# @EXPORT_OK, is an empty list.
%EXPORT_TAGS = (all => []);
@EXPORT_OK   = @{ $EXPORT_TAGS{all} };
@EXPORT      = ('run_txt2html');

# Program name and module version.
$PROG    = 'HTML::TextToHTML';
$VERSION = '0.04';

#------------------------------------------------------------------------
# Symbolic name for the literal string "TEXT_TO_HTML".
# NOTE(review): its users are not visible in this part of the file --
# confirm what it keys before renaming or removing it.
use constant TEXT_TO_HTML => 'TEXT_TO_HTML';

########################################
# Definitions  (Don't change these)
#

# Mode/action flags.  Each flag occupies a bit of its own, so several of
# them can be OR-ed together into one bit vector that records which
# modes are active and which actions were taken on the current and
# previous lines.
use vars qw(
  $NONE $LIST $HRULE $PAR $PRE $END $BREAK $HEADER
  $MAILHEADER $MAILQUOTE $CAPS $LINK $PRE_EXPLICIT $TABLE
  $IND_BREAK
);

$NONE         = 0;          # no bits set
$LIST         = 1 << 0;     #    1
$HRULE        = 1 << 1;     #    2
$PAR          = 1 << 2;     #    4
$PRE          = 1 << 3;     #    8
$END          = 1 << 4;     #   16
$BREAK        = 1 << 5;     #   32
$HEADER       = 1 << 6;     #   64
$MAILHEADER   = 1 << 7;     #  128
$MAILQUOTE    = 1 << 8;     #  256
$CAPS         = 1 << 9;     #  512
$LINK         = 1 << 10;    # 1024
$PRE_EXPLICIT = 1 << 11;    # 2048
$TABLE        = 1 << 12;    # 4096
$IND_BREAK    = 1 << 13;    # 8192

# List-type markers for Ordered Lists ($OL) and Unordered Lists ($UL).
# These are the values kept in the list stack to tell the two kinds of
# list apart.

use vars qw($OL $UL);
($OL, $UL) = (1, 2);

# Character entity names
use vars qw(%char_entities %char_entities2);

# Latin-1 characters (written as octal escapes) mapped to their named
# HTML character entities.  These are the characters to replace
# *before* processing a line.
#
# The names must be the ones defined in the HTML Latin-1 entity set,
# otherwise browsers show the raw "&name;" text: octal 257 (macron) is
# "&macr;", and octal 274-276 are "&frac14;", "&frac12;" and "&frac34;".
#
# Octal 240 is not mapped here, and octal 267 (middle dot) is kept out
# of this table because it is replaced after processing instead; see
# %char_entities2 below.
%char_entities = (
    "\241", "&iexcl;",  "\242", "&cent;",   "\243", "&pound;",
    "\244", "&curren;", "\245", "&yen;",    "\246", "&brvbar;",
    "\247", "&sect;",   "\250", "&uml;",    "\251", "&copy;",
    "\252", "&ordf;",   "\253", "&laquo;",  "\254", "&not;",
    "\255", "&shy;",    "\256", "&reg;",    "\257", "&macr;",
    "\260", "&deg;",    "\261", "&plusmn;", "\262", "&sup2;",
    "\263", "&sup3;",   "\264", "&acute;",  "\265", "&micro;",
    "\266", "&para;",   "\270", "&cedil;",  "\271", "&sup1;",
    "\272", "&ordm;",   "\273", "&raquo;",  "\274", "&frac14;",
    "\275", "&frac12;", "\276", "&frac34;", "\277", "&iquest;",
    "\300", "&Agrave;", "\301", "&Aacute;", "\302", "&Acirc;",
    "\303", "&Atilde;", "\304", "&Auml;",   "\305", "&Aring;",
    "\306", "&AElig;",  "\307", "&Ccedil;", "\310", "&Egrave;",
    "\311", "&Eacute;", "\312", "&Ecirc;",  "\313", "&Euml;",
    "\314", "&Igrave;", "\315", "&Iacute;", "\316", "&Icirc;",
    "\317", "&Iuml;",   "\320", "&ETH;",    "\321", "&Ntilde;",
    "\322", "&Ograve;", "\323", "&Oacute;", "\324", "&Ocirc;",
    "\325", "&Otilde;", "\326", "&Ouml;",   "\327", "&times;",
    "\330", "&Oslash;", "\331", "&Ugrave;", "\332", "&Uacute;",
    "\333", "&Ucirc;",  "\334", "&Uuml;",   "\335", "&Yacute;",
    "\336", "&THORN;",  "\337", "&szlig;",  "\340", "&agrave;",
    "\341", "&aacute;", "\342", "&acirc;",  "\343", "&atilde;",
    "\344", "&auml;",   "\345", "&aring;",  "\346", "&aelig;",
    "\347", "&ccedil;", "\350", "&egrave;", "\351", "&eacute;",
    "\352", "&ecirc;",  "\353", "&euml;",   "\354", "&igrave;",
    "\355", "&iacute;", "\356", "&icirc;",  "\357", "&iuml;",
    "\360", "&eth;",    "\361", "&ntilde;", "\362", "&ograve;",
    "\363", "&oacute;", "\364", "&ocirc;",  "\365", "&otilde;",
    "\366", "&ouml;",   "\367", "&divide;", "\370", "&oslash;",
    "\371", "&ugrave;", "\372", "&uacute;", "\373", "&ucirc;",
    "\374", "&uuml;",   "\375", "&yacute;", "\376", "&thorn;",
    "\377", "&yuml;",
);

# characters to replace *after* processing a line
%char_entities2 = ("\267", "&middot;",);

# Attribute strings used for table alignment, looked up by position in
# this array.  The first two entries add no attribute at all.
use vars qw(@alignments);
@alignments = (
    q{},
    q{},
    q{ ALIGN="RIGHT"},
    q{ ALIGN="CENTER"},
);

#---------------------------------------------------------------#
# Object interface
#---------------------------------------------------------------#

# Name: new
# Creates a new HTML::TextToHTML converter object.
# The object starts life as an AppConfig instance (case-sensitive
# names, unknown variables not created on the fly), has the converter's
# options and working state installed, and is then re-blessed into the
# requested class.
# Args:
#   $invocant (a class name, or an object whose class is re-used)
#   \@args (optional array of command-line arguments in Args style)
# Returns:
#   the new object, or 0 if the args method reports failure
sub new {
    my ($invocant, $args_ref) = @_;

    # Accept either an object or a class name
    my $class = ref($invocant) || $invocant;

    # Defaults applied to every variable defined on the object
    my %global_settings = (
        ARGCOUNT => ARGCOUNT_NONE,
        EXPAND   => AppConfig::EXPAND_ALL,
        ACTION   => \&do_var_action,
    );
    my $self = AppConfig->new(
        {
            CASE   => 1,
            CREATE => 0,
            GLOBAL => \%global_settings,
        }
    );

    # Options and accumulators are set up before the object changes class
    init_our_data($self);
    bless($self, $class);

    # Apply any arguments that were handed to the constructor
    my $have_args = $args_ref && @{$args_ref};
    if ($have_args && !$self->args($args_ref)) {
        print STDERR "Unrecognised option, try --help\n";
        return 0;
    }

    return $self;
}    # new

# Name: args
# Applies a list of arguments to an existing object.  A missing or
# empty list is silently accepted.  If the parent class's args method
# rejects the list, a message goes to STDERR and the program exits
# with status 1.
# Args:
#   $self
#   \@args (array of command-line arguments in Args style)
# Returns:
#   1
sub args {
    my ($self, $args_ref) = @_;

    # Nothing to parse
    return 1 unless $args_ref && @{$args_ref};

    # Parsed cleanly
    return 1 if $self->SUPER::args($args_ref);

    print STDERR "Unrecognised option, try --help\n";
    exit(1);
}    # args

#--------------------------------#
# Name: do_help
#   Shows the documentation when the man_help or help option is set.
#   man_help is checked first and requests the most verbose output
#   (-verbose 2); help requests the briefest (-verbose 0).  In both
#   cases pod2usage is asked to exit with status 0.
#   The POD is taken from the running script when $0 names a file,
#   otherwise HTML/TextToHTML.pm is looked up along @INC.
# Args:
#   $self
sub do_help ($) {
    my ($self) = @_;

    foreach my $request ([man_help => 2], [help => 0]) {
        my ($option, $verbosity) = @{$request};
        next unless $self->$option();

        my %usage = (
            -exitval => 0,
            -verbose => $verbosity,
        );
        if (-f "$0") {
            $usage{-message} = "$0";
        }
        else {
            $usage{-message}  = "HTML::TextToHTML";
            $usage{-input}    = "HTML/TextToHTML.pm";
            $usage{-pathlist} = \@INC;
        }
        pod2usage(\%usage);
    }

}    # do_help
     #---------------------------------------------------------------#
     # AppConfig-related subroutines

#--------------------------------#
# Name: do_var_action
#   ACTION function installed for every AppConfig variable.
#   - the TEXT_TO_HTML self-reference variable is left alone
#   - the value "CLEAR" empties a hash or array variable
#   - setting "config" reads the named config file through the parent
#   When the debug variable is set, the change is reported on STDERR
#   and hash/array variables are dumped.
# Args:
#   $state_ref -- reference to AppConfig::State
#   $name -- variable name
#   $value -- new value
sub do_var_action($$$) {
    my ($state_ref, $name, $value) = @_;

    # The owning object, stored in the state by define_vars
    my $parent = $state_ref->get(TEXT_TO_HTML);

    if ($name eq TEXT_TO_HTML) {

        # the self-reference needs no further handling
    }
    elsif ($value eq "CLEAR") {

        # empty the container in place
        my $container = $state_ref->get($name);
        my $kind      = ref($container);
        if ($kind eq "HASH") {
            %{$container} = ();
        }
        elsif ($kind eq "ARRAY") {
            @{$container} = ();
        }
    }
    elsif ($name eq "config") {

        # the config file is parsed immediately; debug is re-read on
        # each side because the file itself may change it
        print STDERR ">>> reading in config file $value\n"
          if $state_ref->get('debug');
        $parent->file($value);
        print STDERR "<<< read in config file $value\n"
          if $state_ref->get('debug');
    }

    return unless $state_ref->get('debug');

    print STDERR "=========\n changed $name to $value\n =========\n";
    my $current      = $state_ref->get($name);
    my $current_kind = ref($current);
    if ($current_kind eq "HASH" || $current_kind eq "ARRAY") {
        print STDERR Dumper($current);
    }
}    # do_var_action

#--------------------------------#
# Name: define_vars
#   define the variables which AppConfig will recognise
#
#   "debug" is defined first because the ACTION function consults it,
#   then the TEXT_TO_HTML self-reference, then every user-visible
#   option in alphabetical order.  Each option is given as an AppConfig
#   definition string (name, aliases and argument type) optionally
#   followed by its default value.
#
#   The default preformat markers match a line consisting solely of
#   <PRE> or </PRE>, with either bracket optionally written as the
#   entity &lt; / &gt;.
# Args:
#   $self
sub define_vars {
    my $self = shift;

    # since debug is checked in the action, set it first
    $self->define("debug", { DEFAULT => 0 });

    # reference to self!  (do not change!)
    $self->define("TEXT_TO_HTML", { ARGCOUNT => ARGCOUNT_ONE });
    $self->set(TEXT_TO_HTML, $self);

    # HOME may be unset (e.g. on non-Unix systems); fall back to an
    # empty prefix, which yields the same path without the warning
    my $home = defined($ENV{HOME}) ? $ENV{HOME} : "";

    #
    # All the options, in alphabetical order.
    # Entries with one element have no default value.
    #
    my @options = (
        ["append_file|append|append_body|a|ab=s", ""],
        ["append_head|ah=s",                      ""],
        ["caps_tag|capstag|ct=s",                 "STRONG"],
        ["config=s"],    # name of a config file -- parsed immediately
        ["custom_heading_regexp|heading|H=s@"],
        ["default_link_dict|dict=s", "$home/.txt2html.dict"],
        ["dict_debug|db=n",          0],
        ["doctype|dt=s",             "-//W3C//DTD HTML 3.2 Final//EN"],
        ["eight_bit_clean|eight_bit|eight|8", 0],
        ["escape_HTML_chars|escapechars|ec",  1],
        ["explicit_headings|EH",              0],
        ["extract",                           0],
        ["help"],
        ["hrule_min|hrule|r=n",      4],
        ["indent_width|indent|iw=n", 2],
        ["indent_par_break|ipb",     0],
        ["infile|file=s@"],    # names of files to be processed
        ["links_dictionaries|link|l=s@"],
        ["link_only|linkonly|LO", 0],
        ["mailmode|mail",         0],
        ["make_anchors|anchors",  1],
        ["make_links",            1],
        ["make_tables|tables",    0],
        ["man_help|manpage|man"],
        ["min_caps_length|caps|c=n",               3],
        ["outfile|out|o=s",                        "-"],
        ["par_indent=n",                           2],
        ["preformat_trigger_lines|prebegin|pb=n",  2],
        ["endpreformat_trigger_lines|preend|pe=n", 2],

        # non-capturing groups: "(?:", not the look-alike "(:?" which
        # would also accept stray colons before the brackets
        ["preformat_start_marker=s", '^(?:&lt;|<)PRE(?:&gt;|>)$'],
        ["preformat_end_marker=s",   '^(?:&lt;|<)/PRE(?:&gt;|>)$'],

        ["preformat_whitespace_min|prewhite|p=n", 5],
        ["prepend_file|prepend_body|pp=s",        ""],
        ["preserve_indent|pi",                    0],
        ["short_line_length|shortline|s=n",       40],
        ["system_link_dict|sysdict=s"],
        ["tab_width|tabwidth|tw=n",                 8],
        ["title|t=s",                               ""],
        ["titlefirst|tf",                           0],
        ["underline_length_tolerance|ulength|ul=n", 1],
        ["underline_offset_tolerance|uoffset|uo=n", 1],
        ["unhyphenation|unhyphenate",               1],
        ["use_mosaic_header|mosaic|mh",             0],
        ["use_preformat_marker|preformat_marker|pm", 0],
    );

    foreach my $option (@options) {
        my ($spec, @default) = @{$option};
        if (@default) {
            $self->define($spec, { DEFAULT => $default[0] });
        }
        else {
            $self->define($spec);
        }
    }

}    # define_vars

#--------------------------------#
# Name: init_our_data
#   Defines the options, loads settings from the __DATA__ section and
#   resets the working state kept in the object hash.
# Args:
#   $self
sub init_our_data ($) {
    my ($self) = @_;

    define_vars($self);

    # read in from the __DATA__ section
    $self->file(\*DATA);

    # accumulation variables
    $self->{__file} = "";    # Current file being processed

    # heading underline styles seen so far, and how many there are
    $self->{__heading_styles}     = {};
    $self->{__num_heading_styles} = 0;

    # link dictionary data
    $self->{__links_table}       = {};
    $self->{__links_table_order} = [];
    $self->{__search_patterns}   = [];
    $self->{__repl_code}         = [];

    # per-document parsing state
    $self->{__prev_para_action}  = 0;
    $self->{__non_header_anchor} = 0;
    $self->{__mode}              = 0;
    $self->{__listnum}           = 0;
    $self->{__list_indent}       = "";

    $self->{__call_init_done} = 0;

}    # init_our_data

#---------------------------------------------------------------#
# txt2html-related subroutines

#--------------------------------#
# Name: deal_with_options
#   do extra processing related to particular options:
#   - keep only readable link dictionaries (any readable one turns
#     make_links on); drop the dictionaries if make_links is off
#   - blank out append/prepend/head files which cannot be read
#   - default the output file to "-"
#   - clamp the preformat trigger-line counts to 0..2 and work out
#     whether preformat detection is enabled at all
#   - preload the heading styles when use_mosaic_header is set
# Args:
#   $self
sub deal_with_options ($) {
    my ($self) = @_;

    if ($self->links_dictionaries()) {

        # only put into the links dictionaries files which are readable
        my @requested = @{$self->links_dictionaries()};
        $self->args(['--link', 'CLEAR']);

        foreach my $dict (@requested) {
            unless (-r $dict) {
                print STDERR "Can't find or read link-file $dict\n";
                next;
            }
            $self->set('make_links' => 1);
            $self->args(['--link', $dict]);
        }
    }
    unless ($self->make_links()) {
        $self->set('links_dictionaries' => 0);
        $self->set('system_link_dict'   => "");
    }

    # files to splice into the output must be readable
    foreach my $option (qw(append_file prepend_file append_head)) {
        my $path = $self->$option();
        next unless $path;
        next if -r $path;
        print STDERR "Can't find or read ", $path, "\n";
        $self->set($option, '');
    }

    $self->set('outfile' => "-") unless $self->outfile();

    # keep the start trigger within 0..2
    my $start_trigger = $self->preformat_trigger_lines();
    if ($start_trigger < 0) {
        $self->set('preformat_trigger_lines' => 0);
    }
    elsif ($start_trigger > 2) {
        $self->set('preformat_trigger_lines' => 2);
    }

    # a start trigger of 0 forces an end trigger of 1
    if ($self->preformat_trigger_lines() == 0) {
        $self->set('endpreformat_trigger_lines' => 1);
    }

    # keep the end trigger within 0..2
    my $end_trigger = $self->endpreformat_trigger_lines();
    if ($end_trigger < 0) {
        $self->set('endpreformat_trigger_lines' => 0);
    }
    elsif ($end_trigger > 2) {
        $self->set('endpreformat_trigger_lines' => 2);
    }

    $self->{__preformat_enabled} =
      (($self->endpreformat_trigger_lines() != 0)
      || $self->use_preformat_marker());

    if ($self->use_mosaic_header()) {

        # fixed underline characters, numbered 1..6 in this order
        my %heading_styles     = ();
        my $num_heading_styles = 0;
        foreach my $underline_char ("*", "=", "+", "-", "~", ".") {
            $heading_styles{$underline_char} = ++$num_heading_styles;
        }
        $self->{__heading_styles}     = \%heading_styles;
        $self->{__num_heading_styles} = $num_heading_styles;
    }
}

# True if the given string is empty or contains only whitespace.
sub is_blank ($) {
    my ($text) = @_;

    return $text =~ /^\s*$/;
}

# Returns a copy of the text with the HTML special characters &, > and
# < replaced by their entities.  The ampersand is handled first so the
# entities produced for the other two are not escaped again.
sub escape ($) {
    my ($text) = @_;

    for ($text) {
        s/&/&amp;/g;
        s/>/&gt;/g;
        s/</&lt;/g;
    }
    return $text;
}

# Turns a line into a horizontal rule.
# A line consisting of at least hrule_min rule characters (- _ ~ = *,
# optionally separated by whitespace) is replaced by <HR> and a pending
# <P> is removed from the previous line.  Otherwise any form feed
# character in the line is replaced by an <HR> on its own line.
# Args:
#   $self
#   \$line, \$line_action, \$prev (each optional; default to the
#   corresponding field of the object)
sub hrule ($;$$$) {
    my ($self, $line_ref, $line_action_ref, $prev_ref) = @_;

    # references the caller left out fall back to the object's state
    $line_ref        = \$self->{__line}        if @_ < 2;
    $line_action_ref = \$self->{__line_action} if @_ < 3;
    $prev_ref        = \$self->{__prev}        if @_ < 4;

    my $hrmin = $self->hrule_min();
    if (${$line_ref} =~ /^\s*([-_~=\*]\s*){$hrmin,}$/) {
        ${$line_action_ref} |= $HRULE;
        ${$prev_ref} =~ s/<P>//;
        ${$line_ref} = "<HR>\n";
        return;
    }

    if (${$line_ref} =~ /\014/) {
        ${$line_ref} =~ s/\014/\n<HR>\n/g;   # Form feeds become horizontal rules
        ${$line_action_ref} |= $HRULE;
    }
}

# Puts a <BR> at the end of the previous line when that line was
# shorter than short_line_length and both it and the current line are
# ordinary text.
# Args:
#   $self
#   \$mode, \$line, \$line_action, \$prev, \$prev_action, $prev_line_len
#   (each optional; default to the corresponding field of the object)
sub shortline ($;$$$$$$) {
    my ($self, $mode_ref, $line_ref, $line_action_ref, $prev_ref,
        $prev_action_ref, $prev_line_len)
      = @_;

    $mode_ref        = \$self->{__mode}              if @_ < 2;
    $line_ref        = \$self->{__line}              if @_ < 3;
    $line_action_ref = \$self->{__line_action}       if @_ < 4;
    $prev_ref        = \$self->{__prev}              if @_ < 5;
    $prev_action_ref = \$self->{__prev_action}       if @_ < 6;
    $prev_line_len   = $self->{__prev_line_length}   if @_ < 7;

    # Short lines inside lists are deliberately left alone: deciding
    # whether to break there would need to know whether the following
    # line is more text, which is not worked out here.
    return if ${$mode_ref} & ($PRE | $LIST | $TABLE);

    # both lines must carry text
    return if is_blank(${$line_ref});
    return if is_blank(${$prev_ref});

    # only a genuinely short previous line qualifies
    return unless $prev_line_len < $self->short_line_length();

    # neither line may already be structural or already broken
    return
      if ${$line_action_ref}
      & ($END | $HEADER | $HRULE | $LIST | $IND_BREAK | $PAR);
    return if ${$prev_action_ref} & ($HEADER | $HRULE | $BREAK | $IND_BREAK);

    # slip the <BR> in front of the previous line's final character
    my $last_char = chop(${$prev_ref});
    ${$prev_ref} .= "<BR>" . $last_char;
    ${$prev_action_ref} |= $BREAK;
}

# Marks up mail/news style text.
#   - quoted lines ("FF> text", "Igor| text") followed by a non-blank
#     line get a <BR>, and start a paragraph unless the previous line
#     already ended in a break or paragraph
#   - a "From ", "From: " or "Newsgroups: " line at the very start of a
#     line, after a blank line, starts a new message: it is anchored
#     (unless the previous line was itself a mail header) and wrapped
#     in a paragraph
#   - further "Some-Header:" lines and indented continuation lines that
#     follow a mail header get a <BR>
# Args:
#   $self
#   \$line, \$line_action, \$prev, \$prev_action, \$nextline
#   (each optional; default to the corresponding field of the object)
sub mailstuff ($;$$$$$) {
    my $self            = shift;
    my $line_ref        = (@_ ? shift: \$self->{__line});
    my $line_action_ref = (@_ ? shift: \$self->{__line_action});
    my $prev_ref        = (@_ ? shift: \$self->{__prev});
    my $prev_action_ref = (@_ ? shift: \$self->{__prev_action});
    my $next_ref        = (@_ ? shift: \$self->{__nextline});

    if (((${$line_ref} =~ /^\w*&gt/)    # Handle "FF> Werewolves."
        || (${$line_ref} =~ /^\w*\|/))    # Handle "Igor| There wolves."
        && !is_blank(${$next_ref})
      )
    {
        ${$line_ref} =~ s/$/<BR>/;
        ${$line_action_ref} |= ($BREAK | $MAILQUOTE);
        if (!(${$prev_action_ref} & ($BREAK | $PAR))) {
            ${$prev_ref} .= "<P>";
            ${$line_action_ref} |= $PAR;
        }
    }

    # The alternation is grouped so that both the "^" anchor and the
    # trailing space apply to every keyword.  Ungrouped, the pattern
    # matched any line merely starting with "From" (e.g. "Fromage")
    # and "Newsgroups: " anywhere within a line.
    elsif ((${$line_ref} =~ /^(?:From:?|Newsgroups:) /)
        && is_blank(${$prev_ref}))
    {
        $self->anchor_mail($line_ref)
          if !(${$prev_action_ref} & $MAILHEADER);
        chomp ${$line_ref};
        ${$line_ref} = "<!-- New Message -->\n<P>" . ${$line_ref} . "<BR>\n";
        ${$line_action_ref} |= ($BREAK | $MAILHEADER | $PAR);
    }
    elsif ((${$line_ref} =~ /^[\w\-]*:/)    # Handle "Some-Header: blah"
        && (${$prev_action_ref} & $MAILHEADER) && !is_blank(${$next_ref})
      )
    {
        ${$line_ref} =~ s/$/<BR>/;
        ${$line_action_ref} |= ($BREAK | $MAILHEADER);
    }
    elsif ((${$line_ref} =~ /^\s+\S/) &&    # Handle multi-line mail headers
        (${$prev_action_ref} & $MAILHEADER) && !is_blank(${$next_ref})
      )
    {
        ${$line_ref} =~ s/$/<BR>/;
        ${$line_action_ref} |= ($BREAK | $MAILHEADER);
    }
}

# Returns $vector with every mode bit that is set in $mask cleared.
sub subtract_modes ($$) {
    my ($vector, $mask) = @_;

    # the bits present in both are exactly the ones to take away
    my $shared = $vector & $mask;
    return $vector - $shared;
}

# Decides whether the current line starts a new paragraph (or an
# indented continuation) and records the markup on the previous line.
# Args:
#   $self
#   \$mode, \$line, \$line_action, \$prev, \$prev_action,
#   $line_indent, $prev_indent
#   (each optional; default to the corresponding field of the object)
# Effects:
#   appends <P> or <BR> (plus &nbsp; padding where indentation is
#   being preserved) to the previous line, strips the leading spaces
#   that the padding replaces from the current line, and sets $PAR,
#   $BREAK or $IND_BREAK in the action flags.
sub paragraph ($;$$$$$) {
    my $self            = shift;
    my $mode_ref        = (@_ ? shift: \$self->{__mode});
    my $line_ref        = (@_ ? shift: \$self->{__line});
    my $line_action_ref = (@_ ? shift: \$self->{__line_action});
    my $prev_ref        = (@_ ? shift: \$self->{__prev});
    my $prev_action_ref = (@_ ? shift: \$self->{__prev_action});
    my $line_indent     = (@_ ? shift: $self->{__line_indent});
    my $prev_indent     = (@_ ? shift: $self->{__prev_indent});

    # A paragraph can start on a non-blank line outside PRE/TABLE mode
    # whose only actions so far are END, MAILQUOTE, CAPS or BREAK, when
    # the previous line is blank, something has just ended, or the line
    # is indented more than par_indent beyond the previous one.
    if (!is_blank(${$line_ref})
        && !(${$mode_ref} & ($PRE | $TABLE))
        && !subtract_modes(${$line_action_ref},
            $END | $MAILQUOTE | $CAPS | $BREAK)
        && (is_blank(${$prev_ref})
            || (${$line_action_ref} & $END)
            || ($line_indent > $prev_indent + $self->par_indent())))
    {
	# indent_par_break: an indentation-triggered start directly
	# after text becomes a line break with the indent kept as
	# non-breaking spaces, rather than a new paragraph
	if ($self->indent_par_break()
	    && !is_blank(${$prev_ref})
	    && !(${$line_action_ref} & $END)
	    && ($line_indent > $prev_indent + $self->par_indent()))
	{
	    ${$prev_ref} .= "<BR>";
	    ${$prev_ref} .= "&nbsp;" x $line_indent;
	    ${$line_ref} =~ s/^ {$line_indent}//;
	    ${$prev_action_ref} |= $BREAK;
	    ${$line_action_ref} |= $IND_BREAK;
	}
	# preserve_indent: new paragraph, indent kept as non-breaking spaces
	elsif ($self->preserve_indent())
	{
	    ${$prev_ref} .= "<P>";
	    ${$prev_ref} .= "&nbsp;" x $line_indent;
	    ${$line_ref} =~ s/^ {$line_indent}//;
	    ${$line_action_ref} |= $PAR;
	}
	# plain new paragraph
	else
	{
	    ${$prev_ref} .= "<P>";
	    ${$line_action_ref} |= $PAR;
	}
    }
    # detect also a continuing indentation at the same level
    # (previous line was itself an indent break or paragraph start, and
    # this line has the same indent, greater than par_indent)
    elsif ($self->indent_par_break()
        && !(${$mode_ref} & ($PRE | $TABLE | $LIST))
	&& !is_blank(${$prev_ref})
	&& !(${$line_action_ref} & $END)
	&& (${$prev_action_ref} & ($IND_BREAK | $PAR))
        && !subtract_modes(${$line_action_ref},
            $END | $MAILQUOTE | $CAPS)
        && ($line_indent > $self->par_indent())
	&& ($line_indent == $prev_indent)
	)
    {
	${$prev_ref} .= "<BR>";
	${$prev_ref} .= "&nbsp;" x $line_indent;
	${$line_ref} =~ s/^ {$line_indent}//;
	${$prev_action_ref} |= $BREAK;
	${$line_action_ref} |= $IND_BREAK;
    }
}

# If the line is blank, return the second argument.  Otherwise,
# return the number of spaces before any nonspaces on the line.
sub count_indent ($$) {
    my ($text, $fallback) = @_;

    # a blank line has no indent of its own
    return $fallback if is_blank($text);

    # only space characters count towards the indent
    my ($leading) = ($text =~ /^( *)[^ ]/);
    return length $leading;
}

# Examines a line for a list-item prefix.
# Returns ($prefix, $number, $rawprefix):
#   (0, 0, 0) when the line does not look like a list item;
#   for a bulleted item, $number is 0 and $prefix and $rawprefix are
#   both the leading whitespace, the bullet run and the character
#   after it;
#   for a numbered/lettered item, $number is the number or letter,
#   $rawprefix is the leading whitespace, the number and the character
#   after it, and $prefix is $rawprefix with the number taken out.
sub listprefix ($) {
    my ($text) = @_;

    my $looks_bulleted = ($text =~ /^\s*[-=o\*\267]+\s+\S/);
    if (!$looks_bulleted
        && !($text =~ /^\s*(\d+|[^\W\d_])[\.\)\]:]\s+\S/))
    {
        return (0, 0, 0);
    }

    # That slippery exception of "o" as a bullet: a lone "o" is never
    # treated as a letter.  (This ought to be determined using the
    # context of what lists we have in progress, but this will
    # probably work well enough.)
    my ($number) = $text =~ /^\s*(\d+|[^\W\d_])/;
    $number = 0 if !defined($number) || $text =~ /^\s*o\s/;

    my ($prefix, $rawprefix);
    if ($number) {
        ($rawprefix) = $text =~ /^(\s*(\d+|[^\W\d_]).)/;

        # Take the number out
        ($prefix = $rawprefix) =~ s/(\d+|[^\W\d_])//;
    }
    else {
        ($rawprefix) = $text =~ /^(\s*[-=o\*\267]+.)/;
        $prefix = $rawprefix;
    }
    return ($prefix, $number, $rawprefix);
}

# Opens a new list one level deeper than the current one.
# The opening tag (<OL> for numbered items, <UL> otherwise) is appended
# to the previous line, and the list depth, indent and LIST flags are
# updated.
# Args:
#   $self, $prefix, $number, $rawprefix (as returned by listprefix),
#   \$prev
# Returns:
#   1 if a list was started, 0 if a numbered item does not begin at
#   1, a or A (nothing is opened in that case)
sub startlist ($$$$$) {
    my ($self, $prefix, $number, $rawprefix, $prev_ref) = @_;

    my $depth = $self->{__listnum};
    $self->{__listprefix}->[$depth] = $prefix;

    my ($open_tag, $list_type);
    if ($number) {

        # It doesn't start with 1,a,A.  Let's not screw with it.
        return 0
          unless ($number eq "1") || ($number eq "a") || ($number eq "A");
        ($open_tag, $list_type) = ("<OL>\n", $OL);
    }
    else {
        ($open_tag, $list_type) = ("<UL>\n", $UL);
    }
    ${$prev_ref} .= $self->{__list_indent} . $open_tag;
    $self->{__list}->[$depth] = $list_type;

    # one level deeper from here on
    $self->{__listnum}++;
    $self->{__list_indent} = " " x $self->{__listnum} x $self->indent_width();
    $self->{__mode} |= $LIST;
    $self->{__line_action} |= $LIST;
    return 1;
}

# End N lists
# Closes the $n innermost open lists, appending each closing tag to the
# previous line, then flags the current line with END.  The LIST mode
# bit is toggled once no list remains open.
# Args:
#   $self, $n, \$prev
#   \$line_action (optional; defaults to the object's own)
sub endlist ($$$;$) {
    my ($self, $n, $prev_ref, $line_action_ref) = @_;
    $line_action_ref = \$self->{__line_action} if @_ < 4;

    while ($n > 0) {
        my $innermost = $self->{__listnum} - 1;
        $self->{__list_indent} = " " x $innermost x $self->indent_width();

        my $list_type = $self->{__list}->[$innermost];
        if ($list_type == $UL) {
            ${$prev_ref} .= $self->{__list_indent} . "</UL>\n";
        }
        elsif ($list_type == $OL) {
            ${$prev_ref} .= $self->{__list_indent} . "</OL>\n";
        }
        else {
            print STDERR "Encountered list of unknown type\n";
        }

        $n--;
        $self->{__listnum}--;
    }
    ${$line_action_ref} |= $END;
    $self->{__mode} ^= $LIST if (!$self->{__listnum});
}

# Replaces the bullet or number at the start of the line with an <LI>
# tag at the current list indent, according to the type of the
# innermost open list, and flags the line as a list line.
# Args:
#   $self, \$line, \$line_action
sub continuelist ($$$) {
    my ($self, $line_ref, $line_action_ref) = @_;

    my $list_indent = $self->{__list_indent};
    my $list_type   = $self->{__list}->[$self->{__listnum} - 1];

    if ($list_type == $UL) {
        ${$line_ref} =~ s/^\s*[-=o\*\267]+\s*/$list_indent<LI>/;
    }
    if ($list_type == $OL) {
        ${$line_ref} =~ s/^\s*(\d+|[^\W\d_]).\s*/$list_indent<LI>/;
    }
    ${$line_action_ref} |= $LIST;
}

# Handles list structure for the current line: starts a new (possibly
# nested) list, continues the current one, drops back to an outer
# list, or closes all open lists.
# Args:
#   $self
#   \$line, \$line_action, \$line_indent, \$prev, \$prev_action
#   (each optional; default to the corresponding field of the object)
sub liststuff ($;$$$$$) {
    my $self            = shift;
    my $line_ref        = (@_ ? shift: \$self->{__line});
    my $line_action_ref = (@_ ? shift: \$self->{__line_action});
    my $line_indent_ref = (@_ ? shift: \$self->{__line_indent});
    my $prev_ref        = (@_ ? shift: \$self->{__prev});
    my $prev_action_ref = (@_ ? shift: \$self->{__prev_action});

    my $i;

    my ($prefix, $number, $rawprefix) = listprefix(${$line_ref});

    # No list prefix on this line
    if (!$prefix) {
        return if !is_blank(${$prev_ref});    # inside a list item
             # This ain't no list.  We'll want to end all of them.
        if ($self->{__listnum}) {
            $self->endlist($self->{__listnum}, $prev_ref, $line_action_ref);
        }
        return;
    }

    # If numbers with more than one digit grow to the left instead of
    # to the right, the prefix will shrink and we'll fail to match the
    # right list.  We need to account for this.
    my $prefix_alternate;
    if (length("" . $number) > 1) {
        $prefix_alternate = (" " x (length("" . $number) - 1)) . $prefix;
    }

    # Maybe we're going back up to a previous list
    # (search from the innermost list outwards for one with the same
    # prefix; $i ends at its index, or at -1 if none matches)
    for ($i = $self->{__listnum} - 1 ;
        ($i >= 0) && ($prefix ne $self->{__listprefix}->[$i]) ; $i--
      )
    {
        if (length("" . $number) > 1) {
            last if $prefix_alternate eq $self->{__listprefix}->[$i];
        }
    }

    my $islist;

    # Measure the indent from where the text starts, not where the
    # prefix starts.  This won't screw anything up, and if we don't do
    # it, the next line might appear to be indented relative to this
    # line, and get tagged as a new paragraph.
    my ($total_prefix) = ${$line_ref} =~ /^(\s*[\w=o\*-]+.\s*)/;

    # Of course, we only use it if it really turns out to be a list.

    $islist = 1;
    $i++;    # now the number of lists that should stay open
    if (($i > 0) && ($i != $self->{__listnum})) {
        # the prefix belongs to an outer list: close the inner ones
        $self->endlist($self->{__listnum} - $i, $prev_ref, $line_action_ref);
        $islist = 0;
    }
    elsif (!$self->{__listnum} || ($i != $self->{__listnum})) {
        # the prefix matches no open list: perhaps a new list starts here
        if ((${$line_indent_ref} > 0)
            || is_blank(${$prev_ref})
            || (${$prev_action_ref} & ($BREAK | $HEADER | $CAPS)))
        {
            $islist = $self->startlist($prefix, $number, $rawprefix, $prev_ref);
        }
        else {

            # We have something like this: "- foo" which usually
            # turns out not to be a list.
            return;
        }
    }

    # Turn the prefix into <LI> whenever a list is open
    $self->continuelist($line_ref, $line_action_ref)
      if ($self->{__mode} & $LIST);
    ${$line_indent_ref} = length($total_prefix) if $islist;
}

# Detects whether a paragraph is a table and, if so, marks it up.
# Args:
#   $self
#   \@rows -- the lines of the paragraph; rewritten in place with
#             <TABLE>/<TR>/<TD> markup when a table is recognised
#   $para_len -- starting value for the shortest-row search
# Returns:
#   1 if the paragraph was marked up as a table (TABLE mode is then
#   set on the object), 0 otherwise
sub tablestuff ($$$) {
    my $self     = shift;
    my $rows_ref = shift;
    my $para_len = shift;

    # TABLES: spot and mark up tables.  We combine the lines of the
    # paragraph using the string bitwise or (|) operator, the result
    # being in $spaces.  A character in $spaces is a space only if
    # there was a space at that position in every line of the
    # paragraph.  $space can be used to search for contiguous spaces
    # that occur on all lines of the paragraph.  If this results in at
    # least two columns, the paragraph is identified as a table.

    # Note that this sub must be called before checking for preformatted
    # lines because a table may well have whitespace to the left, in
    # which case it must not be incorrectly recognised as a preformat.
    my @rows = @{$rows_ref};
    my @starts;    # start offset of each column
    my @ends;      # end offset of each column
    my $spaces;
    my $max = 0;            # length of the longest row
    my $min = $para_len;    # length of the shortest row
    foreach my $row (@rows) {
        # or the row in, then turn every non-space character into \xff
        ($spaces |= $row) =~ tr/ /\xff/c;
        $min = length $row if length $row < $min;
        $max = length $row if $max < length $row;
    }
    # only positions present in every row can separate columns
    $spaces = substr $spaces, 0, $min;
    push (@starts, 0) unless $spaces =~ /^ /;
    # each run of shared spaces followed by text ends one column and
    # starts the next
    while ($spaces =~ /((?:^| ) +)(?=[^ ])/g) {
        push @ends,   pos($spaces) - length $1;
        push @starts, pos($spaces);
    }
    # leading shared space is margin, not the end of a column
    shift (@ends) if $spaces =~ /^ /;
    push (@ends, $max);

    # Two or more rows and two or more columns indicate a table.
    if (2 <= @rows and 2 <= @starts) {
        $self->{__mode} |= $TABLE;

        # For each column, guess whether it should be left, centre or
        # right aligned by examining all cells in that column for space
        # to the left or the right.  A simple majority among those cells
        # that actually have space to one side or another decides (if no
        # alignment gets a majority, left alignment wins by default).
        my @align;
        my $cell = '';
        foreach my $col (0 .. $#starts) {
            # tallies indexed by alignment code: +2 for space on the
            # left, +1 for space on the right (or a cell cut short by
            # the end of its row)
            my @count = (0, 0, 0, 0);
            foreach my $row (@rows) {
                my $width = $ends[$col] - $starts[$col];
                $cell = substr $row, $starts[$col], $width;
                ++$count[($cell =~ /^ / ? 2 : 0) +
                  ($cell =~ / $/ || length($cell) < $width ? 1 : 0)];
            }
            $align[$col] = 0;
            my $population = $count[1] + $count[2] + $count[3];
            foreach (1 .. 3) {
                if ($count[$_] * 2 > $population) {
                    $align[$col] = $_;
                    last;
                }
            }
        }

        # rebuild every row as a <TR> of trimmed (and, if requested,
        # HTML-escaped) cells
        foreach my $row (@rows) {
            $row = join '', '<TR>', (
              map {
                  $cell = substr $row, $starts[$_], $ends[$_] - $starts[$_];
                  $cell =~ s/^ +//;
                  $cell =~ s/ +$//;

                  if ($self->escape_HTML_chars()) {
                      $cell = escape($cell);
                  }

                  ('<TD', $alignments[$align[$_]], '>', $cell, '</TD>');
              } 0 .. $#starts),
              '</TR>';
        }

        # put the <TABLE> around the rows
        $rows[0] = "<TABLE>\n" . $rows[0];
        $rows[$#rows] .= "\n</TABLE>";
        @{$rows_ref} = @rows;
        return 1;
    }
    else {
        return 0;
    }
}

# Returns true if the passed string is considered to be preformatted,
# i.e. it contains a run of at least preformat_whitespace_min
# whitespace characters, or of that many dots, followed by
# non-whitespace.
# Args:
#   $self
#   $line -- the text to examine
sub is_preformatted ($$) {
    my $self = shift;
    my $line = shift;

    my $pre_white_min = $self->preformat_whitespace_min();

    # No /o modifier here: the patterns must follow the current value
    # of the option rather than whichever value was seen first in the
    # lifetime of the process.
    my $result = (($line =~ /\s{$pre_white_min,}\S+/)    # whitespaces
      || ($line =~ /\.{$pre_white_min,}\S+/));    # dots
    return $result;
}

# Ends a preformatted section when appropriate.
# An explicitly marked section (PRE_EXPLICIT mode) ends only at a line
# matching preformat_end_marker; that line is consumed.  An implicit
# section ends when the current line is not preformatted and either
# endpreformat_trigger_lines is 1 or the next line is not preformatted
# either.  In both cases </PRE> is appended to the previous line, the
# PRE mode bits are cleared and the line is flagged with END.
# Args:
#   $self
#   \$mode, \$line, \$line_action, \$prev, \$nextline
#   (each optional; default to the corresponding field of the object)
sub endpreformat ($;$$$$) {
    my $self            = shift;
    my $mode_ref        = (@_ ? shift: \$self->{__mode});
    my $line_ref        = (@_ ? shift: \$self->{__line});
    my $line_action_ref = (@_ ? shift: \$self->{__line_action});
    my $prev_ref        = (@_ ? shift: \$self->{__prev});
    my $next_ref        = (@_ ? shift: \$self->{__nextline});

    if (${$mode_ref} & $PRE_EXPLICIT) {

        # Fetch the marker into a variable first: a method call is not
        # interpolated inside a pattern, so writing the call directly
        # between the slashes would match against the stringified
        # object instead of the marker.
        my $pend = $self->preformat_end_marker();
        if (${$line_ref} =~ /$pend/i) {
            ${$prev_ref} .= "</PRE>\n";
            ${$line_ref} = "";
            ${$mode_ref} ^= (($PRE | $PRE_EXPLICIT) & ${$mode_ref});
            ${$line_action_ref} |= $END;
        }
        return;
    }

    if (!$self->is_preformatted(${$line_ref})
        && ($self->endpreformat_trigger_lines() == 1
            || !$self->is_preformatted(${$next_ref})))
    {
        ${$prev_ref} .= "</PRE>\n";
        ${$mode_ref} ^= ($PRE & ${$mode_ref});
        ${$line_action_ref} |= $END;
    }
}

# Starts a preformatted section when appropriate.
# With use_preformat_marker set, a line matching preformat_start_marker
# is replaced by <PRE> and the section is marked explicit
# (PRE_EXPLICIT).  Otherwise a section starts when
# preformat_trigger_lines is 0, or when the current line is
# preformatted and either one trigger line suffices or the next line
# is preformatted too; <PRE> is then put in front of the line.  In both
# cases a pending <P> is removed from the previous line.
# Args:
#   $self
#   \$mode, \$line, \$line_action, \$prev, \$nextline
#   (each optional; default to the corresponding field of the object)
sub preformat ($;$$$$$) {
    my $self            = shift;
    my $mode_ref        = (@_ ? shift: \$self->{__mode});
    my $line_ref        = (@_ ? shift: \$self->{__line});
    my $line_action_ref = (@_ ? shift: \$self->{__line_action});
    my $prev_ref        = (@_ ? shift: \$self->{__prev});
    my $next_ref        = (@_ ? shift: \$self->{__nextline});

    if ($self->use_preformat_marker()) {
        my $pstart = $self->preformat_start_marker();

        # No /o modifier: the pattern must follow the marker currently
        # configured, not the first one seen in this process.
        if (${$line_ref} =~ /$pstart/i) {
            ${$line_ref} = "<PRE>\n";
            ${$prev_ref} =~ s/<P>//;
            ${$mode_ref} |= $PRE | $PRE_EXPLICIT;
            ${$line_action_ref} |= $PRE;
            return;
        }
    }

    if ($self->preformat_trigger_lines() == 0
        || ($self->is_preformatted(${$line_ref})
            && ($self->preformat_trigger_lines() == 1
                || $self->is_preformatted(${$next_ref}))))
    {
        ${$line_ref} =~ s/^/<PRE>\n/;
        ${$prev_ref} =~ s/<P>//;
        ${$mode_ref} |= $PRE;
        ${$line_action_ref} |= $PRE;
    }
}

# Produces the name for a new anchor.
# With a false $heading_level the next value of a running counter is
# returned.  Otherwise the counter for that heading level is advanced,
# the counters of all deeper levels are reset, and the result is
# "section-" followed by the counter of every level from the top down
# to $heading_level, each followed by a dot.
# Args:
#   $self
#   $heading_level
sub make_new_anchor ($$) {
    my ($self, $heading_level) = @_;

    if (!$heading_level) {
        return sprintf("%d", $self->{__non_header_anchor}++);
    }

    $self->{__heading_count}->[$heading_level - 1]++;
    my $counts = $self->{__heading_count};

    # Reset lower order counters
    foreach my $deeper ($heading_level .. $#{$counts}) {
        $counts->[$deeper] = 0;
    }

    my $anchor = "section-";
    foreach my $level (0 .. $heading_level - 1) {
        $counts->[$level] ||= 1;    # In case they skip any
        $anchor .= sprintf("%d.", $counts->[$level]);
    }

    # NOTE(review): chomp only strips a trailing input record
    # separator, so the final "." stays in the name -- confirm whether
    # chop was intended before changing it, since existing links may
    # rely on the current names.
    chomp($anchor);
    return $anchor;
}

# Wraps the first space-delimited word of a mail header line in a
# named anchor, when make_anchors is enabled.
# Args:
#   $self
#   \$line
sub anchor_mail ($$) {
    my ($self, $line_ref) = @_;

    return unless $self->make_anchors();

    # level 0 requests a plain, non-heading anchor name
    my ($anchor) = $self->make_new_anchor(0);
    ${$line_ref} =~ s/([^ ]*)/<A NAME="$anchor">$1<\/A>/;
}

# Wraps the text of an already tagged heading (<Hn>...</Hn>) in a
# named anchor for the given level, when make_anchors is enabled.
# Args:
#   $self
#   $level
#   \$line
sub anchor_heading ($$$) {
    my ($self, $level, $line_ref) = @_;

    return unless $self->make_anchors();

    my ($anchor) = $self->make_new_anchor($level);
    ${$line_ref} =~ s/(<H.>)(.*)(<\/H.>)/$1<A NAME="$anchor">$2<\/A>$3/;
}

# Returns the heading level assigned to an underline style.  A style
# not seen before is given the next unused level.
# Args:
#   $self
#   $style
sub heading_level ($$) {
    my ($self, $style) = @_;

    if (!$self->{__heading_styles}->{$style}) {
        $self->{__num_heading_styles}++;
        $self->{__heading_styles}->{$style} = $self->{__num_heading_styles};
    }
    return $self->{__heading_styles}->{$style};
}

# Recognises an underlined heading: the current line is the heading
# text and the next line is its underline.  The two must agree in
# length and in left offset to within underline_length_tolerance and
# underline_offset_tolerance.  On a match the underline is consumed,
# the line is tagged and anchored as a heading and flagged with HEADER.
# Args:
#   $self
#   \$line, \$line_action, \$nextline
sub heading ($$$$) {
    my ($self, $line_ref, $line_action_ref, $next_ref) = @_;

    my ($hoffset, $heading) = ${$line_ref} =~ /^(\s*)(.+)$/;
    my ($uoffset, $underline) = ${$next_ref} =~ /^(\s*)(\S+)\s*$/;
    foreach my $piece ($hoffset, $heading, $uoffset, $underline) {
        $piece = "" unless defined($piece);
    }
    $heading =~ s/&[^;]+;/X/g;    # Unescape chars so we get an accurate length

    my $lendiff    = abs(length($heading) - length($underline));
    my $offsetdiff = abs(length($hoffset) - length($uoffset));

    return if is_blank(${$line_ref});
    return if $lendiff > $self->underline_length_tolerance();
    return if $offsetdiff > $self->underline_offset_tolerance();

    # The style is named after the first underline character ...
    my $style = substr($underline, 0, 1);

    # ... and counts as a different style if the heading is in all caps.
    $style .= "C" if $self->iscaps(${$line_ref});

    ${$next_ref} = " ";    # Eat the underline
    $self->{__heading_level} = $self->heading_level($style);
    $self->tagline("H" . $self->{__heading_level}, $line_ref);
    $self->anchor_heading($self->{__heading_level}, $line_ref);
    ${$line_action_ref} |= $HEADER;
}

# custom_heading
#   Test the line against each custom heading regexp in turn; the
#   first one that matches turns the line into a heading.
# Args:
#   $self
#   $line_ref          reference to the line, modified in place
#   $line_action_ref   reference to the line's action flags
# With explicit_headings set, the level is the position of the regexp
# in the list (first regexp gives level 1); otherwise the level comes
# from heading_level() using the style name "Cust<index>".
sub custom_heading ($$$) {
    my ($self, $line_ref, $line_action_ref) = @_;

    foreach my $idx (0 .. $#{$self->custom_heading_regexp()}) {
        my $reg = $self->custom_heading_regexp()->[$idx];
        next unless $$line_ref =~ /$reg/;

        my $level =
            $self->explicit_headings()
          ? $idx + 1
          : $self->heading_level("Cust" . $idx);

        $self->tagline("H" . $level, $line_ref);
        $self->anchor_heading($level, $line_ref);
        $$line_action_ref |= $HEADER;
        last;
    }
}

# unhyphenate_para
#   Join words that were split with a hyphen at the end of a line.
# Args:
#   $self
#   $para_ref   reference to the paragraph string, modified in place
# Return:
#   the result of the substitution (number of joins made, or false)
sub unhyphenate_para ($$) {
    my $self     = shift;
    my $para_ref = shift;

    # Treating this whole paragraph as one string, look for
    # 1 - whitespace
    # 2 - a word (ending in a hyphen, followed by a newline)
    # 3 - whitespace (starting on the next line)
    # 4 - a word with its punctuation
    # Substitute this with
    # 1-whitespace 2-word 4-word newline 3-whitespace
    # We preserve the 3-whitespace because we don't want to mess up
    # our existing indentation.
    #
    # The substitution is the only regex needed here; an earlier
    # stand-alone match with the same pattern had its result thrown
    # away and has been dropped.
    ${$para_ref} =~
s/(\s*)([^\W\d_]*)\-\n(\s*)([^\W\d_]+[\)\}\]\.,:;\'\"\>]*\s*)/$1$2$4\n$3/gs;
}

# untabify
#   Replace each tab with enough spaces to reach the next tab stop.
# Args:
#   $self
#   $line   the text to expand
# Return:
#   the expanded copy of $line
sub untabify ($$) {
    my ($self, $line) = @_;

    # Tabs are expanded one at a time, leftmost first, so that the
    # offset of each tab ($-[0], the start of the match) already
    # reflects the expansion of the tabs before it.
    1 while $line =~
      s/\011/" " x ($self->tab_width() - ($-[0] % $self->tab_width()))/e;

    return $line;
}

# tagline
#   Wrap a line in an opening and closing tag.
# Args:
#   $self
#   $tag        tag name without angle brackets (e.g. "H1")
#   $line_ref   reference to the line, modified in place
# The line is chomped, its leading whitespace is removed, and a
# single newline is put back after the closing tag.
sub tagline ($$$) {
    my ($self, $tag, $line_ref) = @_;

    chomp $$line_ref;    # Drop newline
    $$line_ref =~ s{^\s*(.*)$}{<$tag>$1</$tag>\n};
}

# iscaps
#   Test whether a line counts as "all caps".
# Args:
#   $self
#   the line to test
# Return:
#   true if the line contains no lower-case letter and no "<", and
#   has a run of at least min_caps_length() consecutive upper-case
#   letters.
sub iscaps {
    my ($self, $text) = @_;

    my $min_caps_len = $self->min_caps_length();

    # The character classes are spelled with numeric (octal) codes
    # rather than literal characters so that this file stays 8-bit
    # clean, which will save someone a big headache when they
    # transfer it via ASCII ftp.
    # $lower also contains "<", so a line holding a tag never matches.
    my $lower =
        'a-z\341\343\344\352\353\354\363\370\337\373\375\342\345\347\350'
      . '\355\357\364\365\376\371\377\340\346\351\360\356\361\362\366\372'
      . '\374<';
    my $upper =
        'A-Z\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316'
      . '\317\320\321\322\323\324\325\326\330\331\332\333\334\335\336';

    return $text =~ /^[^${lower}]*[${upper}]{$min_caps_len,}[^${lower}]*$/;
}

# caps
#   Wrap an all-caps line in the configured caps tag.
# Args:
#   $self
#   $line_ref          reference to the line, modified in place
#   $line_action_ref   reference to the line's action flags; $CAPS is
#                      set in it when the line was tagged
sub caps {
    my ($self, $line_ref, $line_action_ref) = @_;

    if ($self->iscaps($$line_ref)) {
        my $tag = $self->caps_tag();
        $self->tagline($tag, $line_ref);
        $$line_action_ref |= $CAPS;
    }
}

# glob2regexp
#   Convert a very simple glob to a regexp string.
# Args:
#   $glob   the glob pattern
# Return:
#   the regexp as a string, wrapped in \b ... \b
# Translation rules:
#   ?            -> .   (any single character)
#   *            -> .*  (any run of characters)
#   [ ] |        are passed through, so character classes and
#                alternation keep working
#   \x           -> x taken literally (backslash-escaped in the regexp
#                when x is not a word character)
#   word chars   -> unchanged
#   anything else is backslash-escaped so it is matched literally
#
# The escaping matters: previously the backslashes were dropped on the
# way out, so "." matched any character, "\*" turned into a
# quantifier, and "/" or "(" could break the substitution that is
# later built from this regexp.
sub glob2regexp {
    my ($glob) = @_;

    my $regexp  = "";
    my $escaped = 0;

    foreach my $char (split(//, $glob)) {
        if ($escaped) {

            # The character after a backslash is always literal.
            $escaped = 0;
            $regexp .= ($char =~ /\w/) ? $char : "\\" . $char;
            next;
        }
        if ($char eq "\\") {
            $escaped = 1;
            next;
        }
        if ($char eq "?") {
            $regexp .= ".";
            next;
        }
        if ($char eq "*") {
            $regexp .= ".*";
            next;
        }
        if ($char =~ /[\w\[\]\|]/) {
            $regexp .= $char;    # Normal character or glob syntax
            next;
        }
        $regexp .= "\\" . $char;    # Escape funky chars
    }
    "\\b" . $regexp . "\\b";
}

# add_regexp_to_links_table
#   Record one link rule, keyed by its regexp.
# Args:
#   $self
#   $key        the regexp (as a string)
#   $URL        the value stored for the rule
#   $switches   bit flags stored for the rule
# A key that already has a true value in the table is skipped: a
# second entry would never get used.  Bit 1 of dict_debug() turns on
# a trace to STDERR.
sub add_regexp_to_links_table ($$$$) {
    my ($self, $key, $URL, $switches) = @_;

    if ($self->{__links_table}->{$key}) {
        if ($self->dict_debug() & 1) {
            print STDERR " Skipping entry.  Key already in table.\n";
            print STDERR "\tKEY: $key\n\tVALUE: $URL\n\n";
        }
    }
    else {

        # Remember insertion order: rules are later tried in the
        # order in which they were added.
        push @{$self->{__links_table_order}}, $key;

        $self->{__links_table}->{$key}        = $URL;
        $self->{__links_switch_table}->{$key} = $switches;

        if ($self->dict_debug() & 1) {
            print STDERR " (", @{$self->{__links_table_order}},
              ")\tKEY: $key\n\tVALUE: $URL\n\tSWITCHES: $switches\n\n";
        }
    }
}

# add_literal_to_links_table
#   Record a link rule whose key is literal text.
# Args:
#   $self
#   $key        the literal text
#   $URL        the value stored for the rule
#   $switches   bit flags stored for the rule
# Every non-word character of the key is backslash-escaped and the
# result is wrapped in \b ... \b before being stored as a regexp rule.
sub add_literal_to_links_table ($$$$) {
    my ($self, $key, $URL, $switches) = @_;

    (my $quoted = $key) =~ s/(\W)/\\$1/g;
    my $regexp = "\\b" . $quoted . "\\b";

    $self->add_regexp_to_links_table($regexp, $URL, $switches);
}

# add_glob_to_links_table
#   Record a link rule whose key is a glob: the key is converted with
#   glob2regexp() and stored as a regexp rule.
# Args:
#   $self
#   $key        the glob pattern
#   $URL        the value stored for the rule
#   $switches   bit flags stored for the rule
sub add_glob_to_links_table ($$$$) {
    my ($self, $key, $URL, $switches) = @_;

    my $regexp = glob2regexp($key);
    $self->add_regexp_to_links_table($regexp, $URL, $switches);
}

# This is the only function you should need to change if you want to
# use a different dictionary file format.
#
# parse_dict
#   Parse the text of a links dictionary and add its rules to the
#   links table.
# Args:
#   $self
#   $dictfile   name of the dictionary (used in messages only)
#   $dict       the full text of the dictionary
# Each rule has the form
#   KEY -> URL
# where the arrow may carry option letters between its dashes
# (e.g. "-i->"):
#   i  case insensitive           (switch bit 1)
#   e  evaluate as Perl code      (switch bit 2)
#   h  value is HTML, not a URL   (switch bit 4)
#   o  only make this link once   (switch bit 8)
# The form of KEY selects how it is stored:
#   /regexp/    regular expression
#   |regexp|    regular expression, slashes escaped for you
#   "literal"   literal text
#   anything else is treated as a glob
# Dies if two consecutive "->" are found.
sub parse_dict ($$$) {
    my $self = shift;

    my ($dictfile, $dict) = @_;

    print STDERR "Parsing dictionary file $dictfile\n"
      if ($self->dict_debug() & 1);

    $dict =~ s/^\#.*$//mg;           # Strip lines that start with '#'
    $dict =~ s/^.*[^\\]:\s*$//mg;    # Strip lines that end with unescaped ':'

    if ($dict =~ /->\s*->/) {
        my $message = "Two consecutive '->'s found in $dictfile\n";
        my $near;

        # Print out any useful context so they can find it.
        ($near) = $dict =~ /([\S ]*\s*->\s*->\s*\S*)/;
        $message .= "\n$near\n" if $near =~ /\S/;
        die $message;
    }

    my ($key, $URL, $switches, $options);
    while ($dict =~ /\s*(.+)\s+\-+([ieho]+\-+)?\>\s*(.*\S+)\s*\n/ig) {
        $key      = $1;
        $options  = $2;
        $options  = "" unless defined($options);
        $URL      = $3;
        $switches = 0;
        $switches += 1 if $options =~ /i/i;    # Case insensitivity
        $switches += 2 if $options =~ /e/i;    # Evaluate as Perl code
        $switches += 4 if $options =~ /h/i;    # provides HTML, not just URL
        $switches += 8 if $options =~ /o/i;    # Only do this link once

        $key =~ s/\s*$//;                      # Chop trailing whitespace

        if ($key =~ m|^/|)                     # Regexp
        {
            $key = substr($key, 1);
            $key =~ s|/$||;    # Allow them to forget the closing /
            $self->add_regexp_to_links_table($key, $URL, $switches);
        }
        elsif ($key =~ /^\|/)    # alternate regexp format
        {
            $key = substr($key, 1);
            $key =~ s/\|$//;      # Allow them to forget the closing |
            $key =~ s|/|\\/|g;    # Escape all slashes
            $self->add_regexp_to_links_table($key, $URL, $switches);
        }

        # A literal must START with the quote.  The test is anchored
        # because the first character is about to be stripped as the
        # opening quote; a key that merely contains a quote somewhere
        # is an ordinary glob.
        elsif ($key =~ /^\"/) {
            $key = substr($key, 1);
            $key =~ s/\"$//;    # Allow them to forget the closing "
            $self->add_literal_to_links_table($key, $URL, $switches);
        }
        else {
            $self->add_glob_to_links_table($key, $URL, $switches);
        }
    }

}

# setup_dict_checking
#   For every rule in the links table, build the replacement
#   subroutine (stored in __repl_code) and precompile the search
#   pattern (stored in __search_patterns).  Both arrays use the same
#   index as __links_table_order; index 0 is not a rule and is skipped.
# Args:
#   $self
#
# NOTE: the replacement subroutines are generated with a string eval
# from the dictionary's pattern and value, so a dictionary is
# effectively executable code.  Only load dictionaries you trust.
sub setup_dict_checking ($) {
    my $self = shift;

    # now create the replace funcs and precompile the regexes
    my ($pattern, $switches, $href, $i, $r_sw, $code);
    for ($i = 1 ; $i < @{$self->{__links_table_order}} ; $i++) {
        $pattern  = $self->{__links_table_order}->[$i];
        $switches = $self->{__links_switch_table}->{$pattern};

        $href = $self->{__links_table}->{$pattern};

        # The value ends up inside s/.../.../, so its slashes must be
        # escaped.
        $href =~ s@/@\\/@g;

        # Unless the rule supplies its own HTML (switch 4), wrap the
        # matched text ($&) in a link to the value.
        $href = '<A HREF="' . $href . '">$&<\\/A>'
          if !($switches & 4);

        $r_sw = "s";    # Options for replacing
        $r_sw .= "i" if ($switches & 1);
        $r_sw .= "e" if ($switches & 2);

        # Generate code for replacements.
        # Create an anonymous subroutine for each replacement,
        # and store its reference in an array.
        # We need to do an "eval" to create these because we need to
        # be able to treat the *contents* of the $href variable
        # as if it were perl code, because sometimes the $href
        # contains things which need to be evaluated, such as $& or $1,
        # not just those cases where we have a "e" switch.
        $code =
"\$self->{__repl_code}->[$i] = sub {\nmy \$al = shift;\n\$al =~ s/$pattern/$href/$r_sw;\nreturn \$al; }\n";
        print STDERR "$code" if ($self->dict_debug() & 2);
        eval "$code";
        if ($@) {

            # A rule that does not compile used to be dropped silently,
            # leaving no subroutine to call when the pattern later
            # matched.  Report it and install a pass-through so the
            # rest of the dictionary keeps working.
            warn "Link dictionary rule $i ($pattern) could not be compiled: $@";
            $self->{__repl_code}->[$i] = sub { return $_[0]; };
        }

        # compile searching pattern
        if ($switches & 1)    # i
        {
            $self->{__search_patterns}->[$i] = qr/$pattern/si;
        }
        else {
            $self->{__search_patterns}->[$i] = qr/$pattern/s;
        }
    }
}

# in_link_context
#   Decide whether a match must NOT be turned into a link.
# Args:
#   $match    the matched text
#   $before   everything on the line that precedes the match
# Return:
#   true if the match itself contains <A> or </A>, if the match lies
#   inside a link that is still open, or if it lies inside a tag
#   (tag name, attribute name or value); false otherwise.
#
# rindex() returns -1 when the substring is absent.  The results used
# to be adjusted by the array-base variable $[, which is deprecated
# and fixed at 0 in current perls, so the adjustment has been removed.
sub in_link_context ($$) {
    my ($match, $before) = @_;
    return 1 if $match =~ m@</?A>@i;    # No links allowed inside match

    my ($final_open, $final_close);
    $final_open  = rindex($before, "<A ");
    $final_close = rindex($before, "</A>");

    return 1 if ($final_open >= 0)      # Link opened
      && (($final_close < 0)            # and not closed    or
        || ($final_open > $final_close)
    );    # one opened after last close

    # Now check to see if we're inside a tag, matching a tag name,
    # or attribute name or value
    $final_open  = rindex($before, "<");
    $final_close = rindex($before, ">");
    ($final_open >= 0)    # Tag opened
      && (($final_close < 0)    # and not closed    or
        || ($final_open > $final_close)
    );    # one opened after last close
}

# Check (and alter if need be) the bits in this line matching
# the patterns in the link dictionary.
#
# check_dictionary_links
# Args:
#   $self
#   $line_ref          reference to the text, modified in place
#   $line_action_ref   reference to the action flags of the text
# Rules are tried in the order they were added (index 0 of
# __links_table_order is not a rule).  For each rule the text is
# consumed match by match: the part before a match and the (possibly
# replaced) match are moved to $line_with_links, and searching
# continues in the remainder.  A rule with switch 8 ("only once") is
# marked done at its first match and is never tried again.
sub check_dictionary_links ($$$) {
    my $self            = shift;
    my $line_ref        = shift;
    my $line_action_ref = shift;

    # NOTE(review): this ORs $LINK in unconditionally, so the "if
    # (!$line_link)" tests below look like they can never fire -
    # presumably "&" was intended.  Left as is; confirm before changing.
    my ($line_link) = (${$line_action_ref} | $LINK);

    # for each pattern, check and alter the line
    for (my $i = 1 ; $i < @{$self->{__links_table_order}} ; $i++) {
        my $key       = $self->{__links_table_order}->[$i];
        my $link_once = $self->{__links_switch_table}->{$key} & 8;

        my $line_with_links = "";
        while (!($link_once && $self->{__done_with_link}->[$i])
            && ${$line_ref} =~ $self->{__search_patterns}->[$i])
        {

            # Grab the match variables before anything else can
            # disturb them.
            my $before   = $`;
            my $linkme   = $&;
            my $consumed = length($before) + length($linkme);

            $self->{__done_with_link}->[$i] = 1 if $link_once;
            $line_link = $LINK if (!$line_link);

            ${$line_ref} = substr(${$line_ref}, $consumed);
            if (!in_link_context($linkme, $line_with_links . $before)) {
                print STDERR "Link rule $i matches $linkme\n"
                  if ($self->dict_debug() & 4);

                # call the special subroutine already created to do
                # this replacement
                my $repl_func = $self->{__repl_code}->[$i];
                $linkme = $repl_func->($linkme);
            }
            $line_with_links .= $before . $linkme;

            # An empty match at the very start of the remaining text
            # consumes nothing, so the same match would be found again
            # forever.  Stop working on this rule instead.
            last if $consumed == 0;
        }
        ${$line_ref} = $line_with_links . ${$line_ref};
    }
    ${$line_action_ref} |= $line_link;    # Cheaper only to do bitwise OR once.
}

# load_dictionary_links
#   Reset the links table, read every file listed in
#   links_dictionaries(), hand each one to parse_dict(), and finally
#   build the replacement code with setup_dict_checking().
# Args:
#   $self
# Dies if a dictionary file cannot be opened.  Empty entries in the
# dictionary list are skipped.
sub load_dictionary_links ($) {
    my $self = shift;

    # Slot 0 of the order list is a placeholder; real rules start at
    # index 1 (the loops over this list start there).
    @{$self->{__links_table_order}} = 0;
    %{$self->{__links_table}}       = ();

    foreach my $dict (@{$self->links_dictionaries()}) {
        next unless $dict;

        # Three-argument open: the name is always taken as a plain
        # file name, never as a mode or a command.
        open(my $dict_fh, '<', $dict)
          || die "Can't open Dictionary file $dict: $!\n";

        # Read the whole file regardless of the caller's $/ setting.
        my $contents = do { local $/; <$dict_fh> };
        $contents = "" unless defined($contents);
        close($dict_fh);

        $self->parse_dict($dict, $contents);
    }
    $self->setup_dict_checking();
}

# make_dictionary_links
#   Apply the link dictionary to a piece of text by calling
#   check_dictionary_links(), then emit whatever is in $@ as a warning.
# Args:
#   $self
#   $line_ref          reference to the text, modified in place
#   $line_action_ref   reference to the action flags of the text
sub make_dictionary_links ($$$) {
    my ($self, $line_ref, $line_action_ref) = @_;

    $self->check_dictionary_links($line_ref, $line_action_ref);
    if ($@) {
        warn $@;
    }
}

# process_para
#   Convert one paragraph of plain text to HTML.
# Args:
#   $self
#   $para   the paragraph text (may contain several lines)
# Return:
#   processed $para string
# Outline:
#   1. do_init_call() performs the one-time setup if it has not been
#      done yet.
#   2. Unless link_only() is set, the paragraph is split into lines and
#      each line is run through the structural passes (preformat,
#      rules, headings, lists, mail, paragraph/shortline, caps), then
#      the lines are joined again and optionally unhyphenated.
#   3. If make_links() is set and rules are loaded, the link
#      dictionary is applied to the whole paragraph.
#   4. Unless eight_bit_clean() is set, characters found in
#      %char_entities are replaced by their entities.
# The action flags of the paragraph's last line are stored in
# $self->{__prev_para_action} for the next call.
sub process_para ($$) {
    my $self = shift;
    my $para = shift;

    # if this is an external call, do certain initializations
    $self->do_init_call();

    my $para_action = $NONE;

    # tables don't carry over from one para to the next
    if ($self->{__mode} & $TABLE) {
        $self->{__mode} ^= $TABLE;
    }
    if (!$self->link_only()) {

        # Per-line bookkeeping, all indexed like @para_lines.
        my $para_len         = length($para);
        my @para_lines       = split (/^/, $para);
        my @para_line_len    = ();
        my @para_line_indent = ();
        my @para_line_action = ();
        my $line;
        for (my $i = 0 ; $i < @para_lines ; $i++) {
            $line = $para_lines[$i];
	    my $ind;

            # Chop trailing whitespace and DOS CRs
            # (the pattern only matches when a CR is actually present)
            $line =~ s/[ \011]*\015$//;
            $line = $self->untabify($line);    # Change all tabs to spaces
            push @para_line_len, length($line);

            # count_indent is given the previous line's indent (0 for
            # the first line of the paragraph).
            if ($i > 0) {
                $ind = count_indent($line, $para_line_indent[$i - 1]);
                push @para_line_indent, $ind;
            }
            else {
                $ind = count_indent($line, 0);
                push @para_line_indent, $ind;
            }
            push @para_line_action, 0;
            $para_lines[$i] = $line;
        }

        # do the table stuff on the array of lines
        if ($self->make_tables()) {
            $self->tablestuff(\@para_lines, $para_len);
        }

        # $prev and $next stand in for the lines before the first and
        # after the last line of the paragraph; the passes below may
        # write into them through $prev_ref / $next_ref.
        my $prev        = '';
        my $next        = '';
        my $prev_action = $self->{__prev_para_action};
        for (my $i = 0 ; $i < @para_lines ; $i++) {
            my $prev_ref;
            my $prev_action_ref;
            my $prev_line_indent;
            my $prev_line_len;
            if ($i == 0) {
                $prev_ref         = \$prev;
                $prev_action_ref  = \$prev_action;
                $prev_line_indent = 0;
                $prev_line_len    = 0;
            }
            else {
                $prev_ref         = \$para_lines[$i - 1];
                $prev_action_ref  = \$para_line_action[$i - 1];
                $prev_line_indent = $para_line_indent[$i - 1];
                $prev_line_len    = $para_line_len[$i - 1];
            }
            my $next_ref;
            if ($i == @para_lines - 1) {
                $next_ref = \$next;
            }
            else {
                $next_ref = \$para_lines[$i + 1];
            }

            # Don't escape HTML chars if we're in a table, because
            # it's already been done in tablestuff above
            # and we don't actually want to escape the table code!
            if ($self->escape_HTML_chars() && !($self->{__mode} & $TABLE)) {
                $para_lines[$i] = escape($para_lines[$i]);
            }

            # Already in PRE mode (and not "everything is
            # preformatted"): see whether the block ends here.
            if (($self->{__mode} & $PRE)
                && ($self->preformat_trigger_lines() != 0))
            {
                $self->endpreformat(
                    \$self->{__mode},       \$para_lines[$i],
                    \$para_line_action[$i], $prev_ref,
                    $next_ref
                );
            }
            if (!($self->{__mode} & $PRE)) {
                $self->hrule(\$para_lines[$i], \$para_line_action[$i],
                    $prev_ref);
            }
            if (@{$self->custom_heading_regexp()} && !($self->{__mode} & $PRE))
            {
                $self->custom_heading(\$para_lines[$i], \$para_line_action[$i]);
            }
            if (!($self->{__mode} & ($PRE | $TABLE))
                && !is_blank($para_lines[$i]))
            {
                $self->liststuff(
                    \$para_lines[$i],       \$para_line_action[$i],
                    \$para_line_indent[$i], $prev_ref,
                    $prev_action_ref
                );
            }

            # Underlined headings: only tried when the next line
            # consists solely of underline characters.
            if (!$self->explicit_headings()
                && !($self->{__mode} & ($PRE | $HEADER | $TABLE))
                && ${$next_ref} =~ /^\s*[=\-\*\.~\+]+\s*$/)
            {
                $self->heading(\$para_lines[$i], \$para_line_action[$i],
                    $next_ref);
            }
            if ($self->mailmode()
                && !($self->{__mode} & ($PRE | $TABLE))
                && !($para_line_action[$i] & $HEADER))
            {
                $self->mailstuff(
                    \$para_lines[$i], \$para_line_action[$i],
                    $prev_ref,        $prev_action_ref,
                    $next_ref
                );
            }

            # A line that none of the passes above claimed may start
            # a preformatted block.
            if (
                !($para_line_action[$i] &
                    ($HEADER | $LIST | $MAILHEADER | $TABLE))
                && !($self->{__mode} & ($LIST | $PRE))
                && $self->{__preformat_enabled})
            {
                $self->preformat(
                    \$self->{__mode},       \$para_lines[$i],
                    \$para_line_action[$i], $prev_ref,
                    $next_ref
                );
            }
            $self->paragraph(
                \$self->{__mode},       \$para_lines[$i],
                \$para_line_action[$i], $prev_ref, $prev_action_ref,
                $para_line_indent[$i],  $prev_line_indent
            );
            $self->shortline(
                \$self->{__mode},       \$para_lines[$i],
                \$para_line_action[$i], $prev_ref,
                $prev_action_ref,       $prev_line_len
            );
            if (!($self->{__mode} & ($PRE | $TABLE))) {
                $self->caps(\$para_lines[$i], \$para_line_action[$i]);
            }

            if ($i == 0 && !is_blank($prev))

              # put the "prev" line in front of the first line
            {
                $line = $para_lines[$i];
                $para_lines[$i] = $prev . $line;
            }
            if ($i == @para_lines - 1 && !is_blank($next))

              # put the "next" at the end of the last line
            {
                $para_lines[$i] .= $next;
            }
        }

        # para action is the action of the last line of the para
        $para_action = $para_line_action[$#para_line_action];

        # now put the para back together as one string
        $para = join ("", @para_lines);

        if ($self->unhyphenation()

            # ends in hyphen & next line starts w/letters
            && ($para =~ /[^\W\d_]\-\n\s*[^\W\d_]/s)
            && !($self->{__mode} &
                ($PRE | $HEADER | $MAILHEADER | $TABLE | $BREAK))
          )
        {
            $self->unhyphenate_para(\$para);
        }

    }

    # The link dictionary is applied to the paragraph as a whole, and
    # also when link_only() skipped the structural passes above.
    if ($self->make_links()
        && !is_blank($para)
        && @{$self->{__links_table_order}})
    {
        $self->make_dictionary_links(\$para, \$para_action);
    }

    # All the matching and formatting is done.  Now we can 
    # replace non-ASCII characters with character entities.
    if (!$self->eight_bit_clean()) {
        my @chars = split (//, $para);
        foreach $_ (@chars) {
            $_ = $char_entities{$_} if defined($char_entities{$_});
        }
        $para = join ("", @chars);
    }

    # Remembered so the next paragraph's first line can see what
    # preceded it.
    $self->{__prev_para_action} = $para_action;

    return $para;
}

# do_file_start
#    extra stuff needed for the beginning
# Args:
#   $self
#   $outhandle  filehandle the HTML is written to
#   $para       the first paragraph of the input (its first line is
#               used as the title when titlefirst is set and no title
#               was given)
# Output:
#   Unless extract() is set, prints the DOCTYPE (if configured), the
#   <HTML>/<HEAD> section including <TITLE>, the contents of the
#   append_head file, the generator <META> tag, and the opening <BODY>.
#   Then, if prepend_file is set, prints the contents of that file.
# Dies if the append_head file cannot be opened; an unreadable
# prepend file only produces a message on STDERR.
# Nothing meaningful is returned.
sub do_file_start ($$$) {
    my $self      = shift;
    my $outhandle = shift;
    my $para      = shift;

    if (!$self->extract()) {
        my @para_lines = split (/\n/, $para);

        # An empty paragraph has no first line.
        my $first_line = @para_lines ? $para_lines[0] : "";

        print $outhandle '<!DOCTYPE HTML PUBLIC "' . $self->doctype() . "\">\n"
          if $self->doctype();
        print $outhandle "<HTML>\n";
        print $outhandle "<HEAD>\n";

        # if --titlefirst is set and --title isn't, use the first line
        # as the title.
        if ($self->titlefirst() && !$self->title()) {
            my ($tit) = $first_line =~ /^ *(.*)/;    # grab first line
            $tit =~ s/ *$//;                         # strip trailing whitespace
            $tit = escape($tit) if $self->escape_HTML_chars();
            $self->set('title' => $tit);
        }
        if (!$self->title()) {
            $self->set('title' => "");
        }
        print $outhandle "<TITLE>", $self->title(), "</TITLE>\n";

        if ($self->append_head()) {

            # The file name is fetched into a variable first: a method
            # call written inside a double-quoted string is not
            # executed, so the old message showed the object instead
            # of the file name.
            my $head_file = $self->append_head();
            open(my $head_fh, '<', $head_file)
              || die "Failed to open $head_file: $!\n";
            while (my $head_line = <$head_fh>) {
                print $outhandle $head_line;
            }
            close($head_fh);
        }

        print $outhandle
          "<META NAME=\"generator\" CONTENT=\"$PROG v$VERSION\">\n";
        print $outhandle "</HEAD>\n";
        print $outhandle "<BODY>\n";
    }

    if ($self->prepend_file()) {
        my $prepend_file = $self->prepend_file();
        my $prepend_fh;

        # The open itself is checked as well, so a file that passes
        # the -r test but still cannot be opened is reported too.
        if (-r $prepend_file && open($prepend_fh, '<', $prepend_file)) {
            while (my $prepend_line = <$prepend_fh>) {
                print $outhandle $prepend_line;
            }
            close($prepend_fh);
        }
        else {
            print STDERR "Can't find or read file ", $prepend_file,
              " to prepend.\n";
        }
    }
}

# do_init_call
# certain things, like reading link dictionaries, need to be
# done once
# Args:
#   $self
# The first call (detected through $self->{__call_init_done}) adds the
# default and system link dictionaries to the dictionary list when
# those files exist, processes the options, loads the dictionaries
# and resets the conversion state.  Later calls do nothing.
sub do_init_call ($) {
    my ($self) = @_;

    if (!$self->{__call_init_done}) {

        # The default dictionary is queued before the options are
        # dealt with ...
        if ($self->make_links()) {
            my $default_dict = $self->default_link_dict();
            if (-f $default_dict) {
                push @{$self->links_dictionaries()}, $default_dict;
            }
        }

        $self->deal_with_options();

        # ... and the system dictionary after, just before loading.
        if ($self->make_links()) {
            my $system_dict = $self->system_link_dict();
            if (-f $system_dict) {
                push @{$self->links_dictionaries()}, $system_dict;
            }
            $self->load_dictionary_links();
        }

        # various initializations
        @{$self}{qw(__non_header_anchor __mode __listnum)} = (0, 0, 0);
        $self->{__list_indent} = "";

        $self->{__call_init_done} = 1;
    }
}

# txt2html
#   Convert the configured input files to HTML and write the result
#   to the configured output.
# Args:
#   $self
#   $args_ref   (optional) reference to an array of option arguments,
#               passed to args() before anything else is done
# Return:
#   1 on completion
# Exits with status 1 (after a message on STDERR) if the arguments
# are not recognised; dies if the output file cannot be opened.
# Input is read a paragraph at a time.  The document start is written
# when the first paragraph is seen.  Entries of infile() that are not
# plain files are skipped; a plain file that cannot be opened is
# skipped with a message on STDERR.
sub txt2html ($;$) {
    my $self     = shift;
    my $args_ref = (@_ ? shift: 0);

    # and set with the passed-in args
    if ($args_ref && @{$args_ref}) {
        if (!$self->args($args_ref)) {
            print STDERR "Unrecognised option, try --help\n";
            exit(1);
        }
    }

    # check for help messages
    $self->do_help();

    $self->do_init_call();

    my $outhandle;
    my $not_to_stdout;

    # open the output
    if ($self->outfile() eq "-") {
        $outhandle     = *STDOUT;
        $not_to_stdout = 0;
    }
    else {

        # Three-argument open: the name is used exactly as given.
        open(my $hout, '>', $self->outfile())
          || die "Error: unable to open ", $self->outfile(), ": $!\n";
        $outhandle     = $hout;
        $not_to_stdout = 1;
    }

    # slurp up a paragraph at a time
    # (this setting is also what chomp sees in the routines called
    # from here)
    local $/ = "";
    my $para  = '';
    my $count = 0;
    foreach my $file (@{$self->infile()}) {
        next unless -f $file;

        my $in;
        if (!open($in, '<', $file)) {
            print STDERR "Can't open input file $file: $!\n";
            next;
        }
        while (defined(my $chunk = <$in>)) {
            $para = $chunk;
            $para =~ s/\n$//;    # trim the endline
            if ($count == 0) {
                $self->do_file_start($outhandle, $para);
            }
            $para = $self->process_para($para, 0);
            print $outhandle $para, "\n";
            $count++;
        }
        close($in);    # each input file is closed once it has been read
    }

    $self->{__prev} = "";
    $self->endlist($self->{__listnum}, \$self->{__prev})
      if ($self->{__mode} & $LIST);    # End all lists
    print $outhandle $self->{__prev};

    #print $outhandle "\n";

    print $outhandle "</PRE>\n" if ($self->{__mode} & $PRE);

    if ($self->append_file()) {
        my $append_file = $self->append_file();
        my $append_fh;
        if (-r $append_file && open($append_fh, '<', $append_file)) {
            while (defined(my $append_line = <$append_fh>)) {
                print $outhandle $append_line;
            }
            close($append_fh);
        }
        else {
            print STDERR "Can't find or read file ", $append_file,
              " to append.\n";
        }
    }

    if (!$self->extract()) {
        print $outhandle "</BODY>\n";
        print $outhandle "</HTML>\n";
    }
    if ($not_to_stdout) {

        # Buffered write errors only show up at close time.
        close($outhandle)
          || print STDERR "Error: unable to close ", $self->outfile(),
          ": $!\n";
    }
    return 1;
}

# run this from the command line
#
# run_txt2html
#   Build a converter from @ARGV, show help if it was asked for, and
#   convert whatever is left in @ARGV as input files.
# Any arguments passed to this function are ignored, because it
# should only look at ARGV.
sub run_txt2html {
    my $conv = HTML::TextToHTML->new(\@ARGV);

    # check for help messages
    $conv->do_help();

    # now the remainder must be input-files
    my @args = map { ("--infile", $_) } @ARGV;

    $conv->txt2html(\@args);
}

#------------------------------------------------------------------------
1;

# These are the default settings
__DATA__
system_link_dict = /usr/share/txt2html/txt2html.dict
