﻿#!/usr/bin/perl -w
use strict;
use HTML::Toc;
use HTML::TocInsertor;
use HTML::Entities;
use Encode;

use Test::More tests => 7;
use Test::Differences;


# http://search.cpan.org/dist/HTML-Toc/Toc.pod#templateAnchorName
# Build an anchor name for a heading out of the heading's own text.
#
# A reference to this sub is handed to HTML::Toc through the
# 'templateAnchorName' option elsewhere in this file. It is deliberately
# declared WITHOUT a prototype: the sub takes arguments, and an empty "()"
# prototype would make any direct call that passes them a compile error.
#
# Arguments, in order: $aFile, $aGroupId, $aLevel, $aNode, $text, $children.
# Only $text is used.
#
# Returns a string of the form [A-Za-z][A-Za-z0-9_:.-]*:
#   * 'id' when $text is undefined, empty or whitespace-only;
#   * otherwise $text with whitespace turned into underscores, HTML entities
#     decoded, and every byte of the UTF-8 encoding that falls outside
#     [A-Za-z0-9_:.-] written as ".XX" (uppercase hex), prefixed with 'L'
#     when the result would not start with an ASCII letter.
#
# The returned name is NOT guaranteed to be unique within a document; see the
# note near the end of the sub.
sub assembleAnchorName {
    my ($aFile, $aGroupId, $aLevel, $aNode, $text, $children) = @_;

    # Nothing usable to derive a name from: fall back to a fixed, valid name.
    # (Checking definedness first also avoids "uninitialized" warnings.)
    return 'id' if !defined $text || $text =~ /^\s*$/;

    # generate a SEO-friendly anchor right from the token content
    # The allowed character set is limited first by the URI specification for fragments, http://tools.ietf.org/html/rfc3986#section-2,
    # then by the limitations of the values of 'id' and 'name' attributes: http://www.w3.org/TR/REC-html40/types.html#type-name
    # Eventually, the only punctuation allowed in id values is [_.:-]
    # Unicode characters with code points > 0x7E (e.g. Chinese characters) are allowed (test "<h1 id="行政区域">header</h1>" at http://validator.w3.org/#validate_by_input+with_options), except for smart quotes (!), see http://www.w3.org/Search/Mail/Public/search?type-index=www-validator&index-type=t&keywords=[VE][122]+smart+quotes&search=Search+Mail+Archives
    # However, that contradicts the HTML 4.01 spec: "Anchor names should be restricted to ASCII characters." - http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
    # ...and the [A-Za-z] class of letters mentioned at http://www.w3.org/TR/REC-html40/types.html#type-name
    # Finally, note that pod2html fails miserably to generate XHTML-compliant anchor links. See http://validator.w3.org/check?uri=http%3A%2F%2Fsearch.cpan.org%2Fdist%2FCatalyst-Runtime%2Flib%2FCatalyst%2FRequest.pm&charset=(detect+automatically)&doctype=XHTML+1.0+Transitional&group=0&user-agent=W3C_Validator%2F1.606

    # Whitespace is replaced BEFORE entities are decoded, so whitespace that
    # only appears after decoding is hex-escaped below rather than turned
    # into an underscore.
    $text =~ s/\s/_/g;

    # we need to replace [#&;] only when they are NOT part of an HTML entity.
    # decode_entities saves us from crafting a nasty regexp
    decode_entities($text);

    # convert to UTF-8 because we need to output the UTF-8 bytes
    $text = encode('utf-8', $text);

    # MediaWiki also uses the period as escape character, see
    # http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity
    $text =~ s/([^A-Za-z0-9_:.-])/sprintf('.%02X', ord($1))/eg;

    # "ID and NAME tokens must begin with a letter ([A-Za-z])" -- http://www.w3.org/TR/html4/types.html#type-name
    # Test for a leading letter explicitly: \W would let names that start
    # with a digit or an underscore slip through unprefixed.
    $text = 'L' . $text if $text !~ /\A[A-Za-z]/;

    # check if the anchor already exists; if so, add a number
    # NOTE: there is no way currently to do this easily in HTML-Toc-1.10.

    #my $anch_num = 1;
    #my $word_name = $name;
    ## Reference: http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
    ## Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.
    #while (grep {lc $_ eq lc $name} keys %{$args{anchors}}) {
    #    # FIXME (in caller sub): to avoid the grep above, the $args{anchors} hash
    #    # should have as key the lowercased anchor name, and as value its actual value (instead of '1')
    #    $name = $word_name . "_$anch_num";
    #    $anch_num++;
    #}

    return $text;
}


# Fixtures shared by every test in this file.
my $output;  # each insert() call below is given a reference to this scalar
my $toc = HTML::Toc->new;
my $tocInsertor = HTML::TocInsertor->new;

# Baseline options. 'header' is left out on purpose, so the first test still
# sees the default header:
#   \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
$toc->setOptions(
    {
        templateAnchorName => \&assembleAnchorName,
        levelToToc         => '[1-6]',
        doLinkToId         => 0,
        insertionPoint     => 'replace {{toc}}',
    }
);



# ------------------------------------------------------------------------
# --- Basic functionality
# --- Two plain <h1> headings, default header. Everything else in this file
# --- builds on this, so the whole run is aborted if it fails.
# ------------------------------------------------------------------------
my $content = <<'HTML';
{{toc}}<br />
<h1>Chapter 1</h1>
Some text here
<h1>Chapter 2</h1>
Second chapter
HTML

# http://search.cpan.org/dist/HTML-Toc/Toc.pod#HTML::TocInsertor::insert()
$tocInsertor->insert($toc, $content, {output => \$output});
{
    my $expected = <<'EOT';

<!-- Table of Contents generated by Perl - HTML::Toc -->
<ul>
   <li><a href="#Chapter_1">Chapter 1</a></li>
   <li><a href="#Chapter_2">Chapter 2</a></li>
</ul>
<!-- End of generated Table of Contents -->
<br />
<h1><a name="Chapter_1"></a>Chapter 1</h1>
Some text here
<h1><a name="Chapter_2"></a>Chapter 2</h1>
Second chapter
EOT

    BAIL_OUT("Basic functionality failure")
        unless eq_or_diff($output, $expected, 'basic functionality');
}


# ------------------------------------------------------------------------------
# --- Short test of character set in anchor names - must be [A-Za-z0-9_:.-] only
# --- A heading made of non-ASCII characters only must come out as an
# --- 'L'-prefixed sequence of .XX escapes of its UTF-8 bytes.
# ------------------------------------------------------------------------------
$toc->setOptions(
    {
        templateAnchorName => \&assembleAnchorName,
        # by default the header is
        # \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        header => '',
    }
);

$content = <<'HTML';
{{toc}}
  <h1>&#x884C;&#x653F;&#x533A;&#x57DF;</h1>

  Per http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1,
  &#8220;Anchor names should be restricted to ASCII characters.&#8221;,
  and MediaWiki does that too (see http://zh.wikipedia.org/wiki/&#x521A;&#x679C;&#x6C11;&#x4E3B;&#x5171;&#x548C;&#x56FD;)
HTML

$tocInsertor->insert($toc, $content, {output => \$output});
{
    my $expected = <<'EXPECTED';
<ul>
   <li><a href="#L.E8.A1.8C.E6.94.BF.E5.8C.BA.E5.9F.9F">&#x884C;&#x653F;&#x533A;&#x57DF;</a></li>
</ul>
<!-- End of generated Table of Contents -->

  <h1><a name="L.E8.A1.8C.E6.94.BF.E5.8C.BA.E5.9F.9F"></a>&#x884C;&#x653F;&#x533A;&#x57DF;</h1>

  Per http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1,
  &#8220;Anchor names should be restricted to ASCII characters.&#8221;,
  and MediaWiki does that too (see http://zh.wikipedia.org/wiki/&#x521A;&#x679C;&#x6C11;&#x4E3B;&#x5171;&#x548C;&#x56FD;)
EXPECTED

    eq_or_diff($output, $expected, 'short test of character set in anchor names', {max_width => 120});
}


TODO: {
    # Both tests in this block are expected to fail for now: producing unique
    # anchor names requires knowing which names were already handed out, and
    # the anchor-name callback has no access to that information.
    # The expected documents below describe the output these inputs should
    # produce once that is possible.
    local $TODO = 'HTML::Toc needs to support a way of checking for existing anchor names when generating a new one';

    # ------------------------------------------------------------------------
    # --- Anchor names must be unique -----------------------------------------
    # --- Reference: http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1
    # --- "Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document."
    # ------------------------------------------------------------------------
    $toc->setOptions({
        header => '',  # by default, \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        templateAnchorName => \&assembleAnchorName,
    });
    $content = <<'HTML';
{{toc}}<br />
<h1>Chapter 1</h1>
<h2>Notes</h2>
Notes that belong to Chapter 1
<h1>Chapter 2</h1>
<h2>Notes</h2>
Notes that belong to Chapter 2
HTML

    $tocInsertor->insert($toc, $content, {output => \$output});
    # The header option is '' here, so the expected document starts directly
    # with the <ul>, like the other header => '' tests in this file.
    eq_or_diff($output, <<'HTML', 'unique anchor names');
<ul>
   <li><a href="#Chapter_1">Chapter 1</a>
      <ul>
         <li><a href="#Notes">Notes</a></li>
      </ul>
   </li>
   <li><a href="#Chapter_2">Chapter 2</a>
      <ul>
         <li><a href="#Notes_2">Notes</a></li>
      </ul>
   </li>
</ul>
<!-- End of generated Table of Contents -->
<br />
<h1><a name="Chapter_1"></a>Chapter 1</h1>
<h2><a name="Notes"></a>Notes</h2>
Notes that belong to Chapter 1
<h1><a name="Chapter_2"></a>Chapter 2</h1>
<h2><a name="Notes_2"></a>Notes</h2>
Notes that belong to Chapter 2
HTML

    # ------------------------------------------------------------------------
    # --- Conflicting anchor names due to encoding of forbidden characters
    # --- Different headings can collapse into the same name after escaping
    # --- (".25%" and "%.25" both give "L.25.25"), and names that differ only
    # --- in case count as conflicting too.
    # ------------------------------------------------------------------------
    $toc->setOptions({
        header => '',  # by default, \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        templateAnchorName => \&assembleAnchorName,
    });
    $content = <<'HTML';
{{toc}}
  <h1>.25%</h1>
  <h1>%.25</h1>
  <h1>.25</h1>
  <h1>%</h1>
  <h1>Yes...</h1>
  <h1>%</h1>
  Per http://www.w3.org/TR/REC-html40/types.html#type-name,
  &#8220;ID and NAME tokens must begin with a letter ([A-Za-z]) and may be followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"), and periods (".").&#8221;,
  and MediaWiki does that too (see http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity)

  <h1>The Big Step</h1>
  <h1>The big Step</h1>
  Per http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1, <br />
  &#8220;Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.&#8221;<br />
  <h1>The Big Step 2</h1>
  MediaWiki fails here, see http://en.wikipedia.org/w/index.php?title=User:Dandv/Sandbox&oldid=274553709#The_Big_Step_2

HTML

    $tocInsertor->insert($toc, $content, {output => \$output});
    # Link texts and heading texts must repeat the input headings verbatim,
    # and the text following {{toc}} (a newline plus the two-space
    # indentation) is kept after the generated table of contents.
    eq_or_diff($output, <<'HTML', 'conflicting anchor names due to encoding of forbidden characters', {max_width => 120});
<ul>
   <li><a href="#L.25.25">.25%</a></li>
   <li><a href="#L.25.25_2">%.25</a></li>
   <li><a href="#L.25">.25</a></li>
   <li><a href="#L.25_2">%</a></li>
   <li><a href="#Yes...">Yes...</a></li>
   <li><a href="#L.25_3">%</a></li>
   <li><a href="#The_Big_Step">The Big Step</a></li>
   <li><a href="#The_big_Step_2">The big Step</a></li>
   <li><a href="#The_Big_Step_2_2">The Big Step 2</a></li>
</ul>
<!-- End of generated Table of Contents -->

  <h1><a name="L.25.25"></a>.25%</h1>
  <h1><a name="L.25.25_2"></a>%.25</h1>
  <h1><a name="L.25"></a>.25</h1>
  <h1><a name="L.25_2"></a>%</h1>
  <h1><a name="Yes..."></a>Yes...</h1>
  <h1><a name="L.25_3"></a>%</h1>
  Per http://www.w3.org/TR/REC-html40/types.html#type-name,
  &#8220;ID and NAME tokens must begin with a letter ([A-Za-z]) and may be followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"), and periods (".").&#8221;,
  and MediaWiki does that too (see http://en.wikipedia.org/wiki/Hierarchies#Ethics.2C_behavioral_psychology.2C_philosophies_of_identity)

  <h1><a name="The_Big_Step"></a>The Big Step</h1>
  <h1><a name="The_big_Step_2"></a>The big Step</h1>
  Per http://www.w3.org/TR/REC-html40/struct/links.html#h-12.2.1, <br />
  &#8220;Anchor names must be unique within a document. Anchor names that differ only in case may not appear in the same document.&#8221;<br />
  <h1><a name="The_Big_Step_2_2"></a>The Big Step 2</h1>
  MediaWiki fails here, see http://en.wikipedia.org/w/index.php?title=User:Dandv/Sandbox&oldid=274553709#The_Big_Step_2

HTML

}  # TODO tests



# ------------------------------------------------------------------------
# --- Comprehensive test of character set in anchor names
# --- Nested headings plus headings full of punctuation: every character
# --- outside [A-Za-z0-9_:.-] has to show up as a .XX escape in the name.
# ------------------------------------------------------------------------
$toc->setOptions(
    {
        templateAnchorName => \&assembleAnchorName,
        # by default the header is
        # \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        header => '',
    }
);

$content = <<'HTML';
{{toc}}<br />
  <h1>The Big Step 1</h1>
  The first heading text goes here<br />
  <h1>The Big Step 2</h1>
  This is the second heading text<br />
    <h2>second header, first subheader</h2>
    Some subheader text here<br />
    <h2>second header, second subheader</h2>
    Another piece of subheader text here<br />
  <h1>The Big Step</h1>
  Third text for heading h1 #3<br />
  <h1>The Big Step #6</h1>
  Per the XHTML 1.0 spec, the number/hash sign is NOT allowed in fragments; in practice, the fragment starts with the first hash.<br />
  Such anchors also work in Firefox 3 and IE 6.<br />
  <h1>Calculation #7: 7/5&gt;3 or &lt;2?</h1>
  Hail the spec, http://www.w3.org/TR/REC-html40/types.html#type-name:
  ID and NAME tokens must begin with a letter ([A-Za-z]) and may be followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"), and periods (".").
  <h1>#8: start with a number (hash) [pound] {comment} sign</h1>
  <h1>Lots of gibberish here: &#8220;!&#8221;#$%&amp;&#39;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</h1>
  Note how the straight quotes were replaced by smart quotes, which are invalid in id attributes for <span class="caps">XHTML</span> 1.0 (!)
HTML

$tocInsertor->insert($toc, $content, {output => \$output});
{
    my $expected = <<'EOT';
<ul>
   <li><a href="#The_Big_Step_1">The Big Step 1</a></li>
   <li><a href="#The_Big_Step_2">The Big Step 2</a>
      <ul>
         <li><a href="#second_header.2C_first_subheader">second header, first subheader</a></li>
         <li><a href="#second_header.2C_second_subheader">second header, second subheader</a></li>
      </ul>
   </li>
   <li><a href="#The_Big_Step">The Big Step</a></li>
   <li><a href="#The_Big_Step_.236">The Big Step #6</a></li>
   <li><a href="#Calculation_.237:_7.2F5.3E3_or_.3C2.3F">Calculation #7: 7/5&gt;3 or &lt;2?</a></li>
   <li><a href="#L.238:_start_with_a_number_.28hash.29_.5Bpound.5D_.7Bcomment.7D_sign">#8: start with a number (hash) [pound] {comment} sign</a></li>
   <li><a href="#Lots_of_gibberish_here:_.E2.80.9C.21.E2.80.9D.23.24.25.26.27.28.29.2A.2B.2C-..2F:.3B.3C.3D.3E.3F.40.5B.5C.5D.5E_.60.7B.7C.7D.7E">Lots of gibberish here: &#8220;!&#8221;#$%&amp;&#39;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</a></li>
</ul>
<!-- End of generated Table of Contents -->
<br />
  <h1><a name="The_Big_Step_1"></a>The Big Step 1</h1>
  The first heading text goes here<br />
  <h1><a name="The_Big_Step_2"></a>The Big Step 2</h1>
  This is the second heading text<br />
    <h2><a name="second_header.2C_first_subheader"></a>second header, first subheader</h2>
    Some subheader text here<br />
    <h2><a name="second_header.2C_second_subheader"></a>second header, second subheader</h2>
    Another piece of subheader text here<br />
  <h1><a name="The_Big_Step"></a>The Big Step</h1>
  Third text for heading h1 #3<br />
  <h1><a name="The_Big_Step_.236"></a>The Big Step #6</h1>
  Per the XHTML 1.0 spec, the number/hash sign is NOT allowed in fragments; in practice, the fragment starts with the first hash.<br />
  Such anchors also work in Firefox 3 and IE 6.<br />
  <h1><a name="Calculation_.237:_7.2F5.3E3_or_.3C2.3F"></a>Calculation #7: 7/5&gt;3 or &lt;2?</h1>
  Hail the spec, http://www.w3.org/TR/REC-html40/types.html#type-name:
  ID and NAME tokens must begin with a letter ([A-Za-z]) and may be followed by any number of letters, digits ([0-9]), hyphens ("-"), underscores ("_"), colons (":"), and periods (".").
  <h1><a name="L.238:_start_with_a_number_.28hash.29_.5Bpound.5D_.7Bcomment.7D_sign"></a>#8: start with a number (hash) [pound] {comment} sign</h1>
  <h1><a name="Lots_of_gibberish_here:_.E2.80.9C.21.E2.80.9D.23.24.25.26.27.28.29.2A.2B.2C-..2F:.3B.3C.3D.3E.3F.40.5B.5C.5D.5E_.60.7B.7C.7D.7E"></a>Lots of gibberish here: &#8220;!&#8221;#$%&amp;&#39;()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</h1>
  Note how the straight quotes were replaced by smart quotes, which are invalid in id attributes for <span class="caps">XHTML</span> 1.0 (!)
EOT

    eq_or_diff($output, $expected, 'comprehensive test of character set in anchor names', {max_width => 50});
}


# ------------------------------------------------------------------------
# --- range of header levels to make TOC out of: 1-1
# --- Only <h1> headings may be listed and anchored; deeper headings have to
# --- pass through untouched. The {{toc ...}} placeholder carries a range,
# --- so the insertion point is a pattern here.
# --- NOTE: $content is used again by the "5-" test that follows.
# ------------------------------------------------------------------------
$toc->setOptions(
    {
        insertionPoint     => 'replace {{toc \[?\d*-?\d*\]?}}',
        levelToToc         => '[1-1]',
        templateAnchorName => \&assembleAnchorName,
        # by default the header is
        # \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        header             => '',
    }
);

$content = <<'HTML';
<div class="ToC">{{toc 1-1}}</div>
  <h1>The Big Step 1</h1>
  The first heading text goes here<br />
  <h1>The Big Step 2</h1>
  This is the second heading text<br />
    <h2>second header, first subheader</h2>
    Some subheader text here<br />
    <h2>second header, second subheader</h2>
    Another piece of subheader text here<br />
  <h1>The Big Step #3</h1>
  another h1
    <h2>Second level heading</h2>
      <h3>Third level heading</h3>
        <h4>fourth level heading</h4>
        header text level 4
          <h5>Fifth level heading</h5>
  <h1>Back to level one with an interrobang&#x203D;</h1>
  '&#x203D;' is an interrobang.
</div>
HTML

$tocInsertor->insert($toc, $content, {output => \$output});
{
    my $expected = <<'EXPECTED';
<div class="ToC"><ul>
   <li><a href="#The_Big_Step_1">The Big Step 1</a></li>
   <li><a href="#The_Big_Step_2">The Big Step 2</a></li>
   <li><a href="#The_Big_Step_.233">The Big Step #3</a></li>
   <li><a href="#Back_to_level_one_with_an_interrobang.E2.80.BD">Back to level one with an interrobang&#x203D;</a></li>
</ul>
<!-- End of generated Table of Contents -->
</div>
  <h1><a name="The_Big_Step_1"></a>The Big Step 1</h1>
  The first heading text goes here<br />
  <h1><a name="The_Big_Step_2"></a>The Big Step 2</h1>
  This is the second heading text<br />
    <h2>second header, first subheader</h2>
    Some subheader text here<br />
    <h2>second header, second subheader</h2>
    Another piece of subheader text here<br />
  <h1><a name="The_Big_Step_.233"></a>The Big Step #3</h1>
  another h1
    <h2>Second level heading</h2>
      <h3>Third level heading</h3>
        <h4>fourth level heading</h4>
        header text level 4
          <h5>Fifth level heading</h5>
  <h1><a name="Back_to_level_one_with_an_interrobang.E2.80.BD"></a>Back to level one with an interrobang&#x203D;</h1>
  '&#x203D;' is an interrobang.
</div>
EXPECTED

    eq_or_diff($output, $expected, 'range of header levels to make TOC out of: 1-1', {max_width => 120});
}



# ------------------------------------------------------------------------
# --- range of header levels to make TOC out of: 5-
# --- Runs on the $content left over from the "1-1" test, this time with
# --- levelToToc "[5-8]": only the single <h5> may be listed and anchored.
# ------------------------------------------------------------------------
$toc->setOptions(
    {
        insertionPoint     => 'replace {{toc \[?\d*-?\d*\]?}}',
        levelToToc         => '[5-8]',
        templateAnchorName => \&assembleAnchorName,
        # by default the header is
        # \n<!-- Table of Contents generated by Perl - HTML::Toc -->\n
        header             => '',
    }
);

$tocInsertor->insert($toc, $content, {output => \$output});
{
    my $expected = <<'EXPECTED';
<div class="ToC"><ul>
   <li><a href="#Fifth_level_heading">Fifth level heading</a></li>
</ul>
<!-- End of generated Table of Contents -->
</div>
  <h1>The Big Step 1</h1>
  The first heading text goes here<br />
  <h1>The Big Step 2</h1>
  This is the second heading text<br />
    <h2>second header, first subheader</h2>
    Some subheader text here<br />
    <h2>second header, second subheader</h2>
    Another piece of subheader text here<br />
  <h1>The Big Step #3</h1>
  another h1
    <h2>Second level heading</h2>
      <h3>Third level heading</h3>
        <h4>fourth level heading</h4>
        header text level 4
          <h5><a name="Fifth_level_heading"></a>Fifth level heading</h5>
  <h1>Back to level one with an interrobang&#x203D;</h1>
  '&#x203D;' is an interrobang.
</div>
EXPECTED

    eq_or_diff($output, $expected, 'range of header levels to make TOC out of: 5-', {max_width => 120});
}
