<?xml version="1.0" encoding="US-ASCII"?>
<!-- This template is for creating an Internet Draft using xml2rfc,
    which is available here: http://xml.resource.org. -->

<!DOCTYPE rfc SYSTEM "rfc2629.dtd" [
        <!ENTITY RFC2119 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml">
        <!ENTITY RFC2629 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.2629.xml">
        <!ENTITY RFC3032 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3032.xml">
        <!ENTITY RFC3277 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3277.xml">
        <!ENTITY RFC3719 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.3719.xml">
        <!ENTITY RFC4271 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.4271.xml">
        <!ENTITY RFC5120 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5120.xml">
        <!ENTITY RFC5301 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5301.xml">
        <!ENTITY RFC5303 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5303.xml">
        <!ENTITY RFC5305 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5305.xml">
        <!ENTITY RFC5308 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5308.xml">
        <!ENTITY RFC5309 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5309.xml">
        <!ENTITY RFC5311 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5311.xml">
        <!ENTITY RFC5316 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5316.xml">
        <!ENTITY RFC5440 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5440.xml">
        <!ENTITY RFC5449 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5449.xml">
        <!ENTITY RFC5614 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5614.xml">
        <!ENTITY RFC5837 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5837.xml">
        <!ENTITY RFC5820 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.5820.xml">
        <!ENTITY RFC6232 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.6232.xml">
        <!ENTITY RFC7356 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7356.xml">
        <!ENTITY RFC7921 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.7921.xml">
        <!ENTITY RFC8174 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8174.xml">
        <!ENTITY RFC8126 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8126.xml">
        <!ENTITY RFC8296 SYSTEM "http://xml.resource.org/public/rfc/bibxml/reference.RFC.8296.xml">
        ]>

<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<?rfc strict="yes" ?>
<?rfc toc="yes"?>
<?rfc tocdepth="4"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes" ?>
<?rfc compact="yes" ?>
<?rfc subcompact="no" ?>
<rfc category="exp" docName="draft-prz-lsr-hierarchical-snps-02" ipr="trust200902">

    <!-- ***** FRONT MATTER ***** -->

    <front>

        <title>IS-IS Hierarchical SNPs</title>

        <author initials='T.' surname='Przygienda' fullname='Tony Przygienda'>
            <organization>Juniper Networks</organization>
            <address>
                <email>prz@juniper.net</email>
            </address>
        </author>

        <author initials='T.' surname='Li' fullname='Tony Li'>
            <organization>Juniper Networks</organization>
            <address>
                <email>tli@juniper.net</email>
            </address>
        </author>


        <date/>

        <abstract>
            <t>
                The document presents an optional new type of SNP called a Hierarchical SNP (HSNP). When feasible,
                it compresses
                traditional CSNP exchanges into a Merkle tree-like structure, which speeds up synchronization of large
                databases across large numbers of adjacencies while reducing the load from regular CSNP exchanges during normal
                operation.
            </t>


        </abstract>

    </front>

    <middle>

        <section title="Introduction">

<!--            <t>-->
<!--                The document introduces an optional, new type of SNPs called Hierarchical SNP (HSNP) that can-->
<!--                compress the traditional CSNPs exchange into a variant of merkle tree, hence-->
<!--                allowing to support fast synchronization of large databases and adjacency numbers while-->
<!--                lessening the load of periodic CSNP exchanges during steady state operation.-->

<!--                Mixture of parallel flooding, exchanging of CSNPs and a fitting HSNP compression strategy-->
<!--                should lead to faster database re-synchronization-->
<!--                since only a subset of packets as compared to full scale CSNP exchange-->
<!--                is necessary to reconcile any entropy present. When using HSNPs more of unnecessary flooding can be suppressed-->
<!--                while the mismatched fragments will be detected with less communication overhead as compared to-->
<!--                full CSNP exchanges.-->
<!--            </t>-->

            <t>
                The document introduces an optional new type of SNP called the Hierarchical SNP (HSNP). It compresses
                traditional CSNP exchanges into a Merkle tree-like structure <xref target="MERKLE"/>,
                enabling faster synchronization of large
                databases and adjacency information while reducing the overhead of regular CSNP exchanges during
                steady state
                operation.

                By combining parallel flooding, CSNP exchanges, and HSNP-based compression, database resynchronization
                can be accelerated because fewer packets (not the entire CSNP set) need to be exchanged to fix
                inconsistencies. Using HSNPs also reduces unnecessary flooding and communication overhead, while still
                detecting mismatched fragments more efficiently than full CSNP exchanges.
            </t>

<!--            <t>-->
<!--                To keep the framework uniform we consider-->
<!--                the usual CSNP entries of LSPs simply a "as good as a perfect merkle hash" where we basically use lsp id, seq# and checksum-->
<!--                as "hash" and assume the 16 bits IS-IS fletcher checksum of the content never collides with other fragments. We add the PDU length-->
<!--                of the fragment for further entropy. Subsequently, we build a fletcher checksum on that data as-->
<!--                the fingerprint of the fragment and consider it its hash.-->
<!--                The hash summarizing hashes of all fragments of a node we consider as "node merkle hash". Range of such node merkle-->
<!--                hashes can be again summarized by a hash built over the according hashes.-->
<!--                We will call such hash a "node range merkle hash". For such hashes we do however-->
<!--                move away from fletcher checksums for reasons explained further in <xref target="hashfn"/>.-->

<!--            </t>-->

            <t>
                To maintain a consistent framework, we initially treat
                each CSNP entry for an LSP as equivalent to an "as good as perfect",
                though rather long,
                Merkle hash. In other words, in the first step, the LSP ID, sequence number,
                and checksum act as a "perfect fragment hash". Conceptually speaking, the CSNPs then serve as the
                "bottom" of the Merkle tree. We include the fragment's PDU length to add
                more entropy in the next step.

                In this step we compute a <xref target="SIPHASH"/> over the fragment's description to create a shorter fragment-level Merkle
                hash. Those hashes are never transmitted in packets, since doing so would effectively
                duplicate CSNP functionality.
                However, the hash that combines all fragment hashes of a node becomes a "node Merkle hash". Groups
                of such node hashes can then be summarized again into a "node range Merkle hash", created by hashing the
                individual node hashes together. For the node and node range hashes,
                we switch from <xref target="SIPHASH"/> to a different hashing
                method, as explained in <xref target="hashfn"/>.

                The resulting hierarchy of hashes enables validation of large LSDB synchronizations using far fewer
                packets than relying solely on CSNPs. Although these hashes summarize ranges recursively, the resulting
                exchange, depending on range mismatches, resembles a <xref target="SKIP"/>-like process rather than
                maintaining a fixed
                tree structure in every exchange.
            </t>

            <t>

                For practical purposes, this document limits itself to LSDB sizes
                on the order of 1E6 fragments and to the further considerations necessary to prevent overly
                garrulous exchanges
                of hashes covering smaller and smaller sets of fragments. More details on the
                targeted IS-IS envelope can be found in <xref target="envelope"/> and further
                considerations revolving around these assumptions
                are summarized in
                <xref target="further"/>.
            </t>
        </section>


        <section title="Dynamic Partitioning" anchor="partition">

            <t>
                In practical terms, the most interesting problem
                is figuring out how to divide the database into groups of leaf
                nodes (each representing a set of fragments) that both compress the data effectively compared to
                standard CSNPs and remain as stable as possible. If the fragment boundaries within each hash keep
                changing, neighboring systems will need to recalculate their hashes instead of reusing cached ones from
                their own Merkle tree, which adds unnecessary computation.

                The subdivision should also create enough "bins" to handle any distribution of fragment IDs across the
                network. This helps avoid pathological cases where all fragments might end up in a single hash, such as
                when a hashing function degrades. Ideally, when a hash mismatch occurs, it should take only one or
                two packets, with hashes for smaller fragment groups or regular CSNPs, to fix the difference.
                Reducing I/O and/or computation directly improves how quickly the systems synchronize.
            </t>

<!--            <t>-->
<!--                Practically speaking, the most interesting problem we encounter is the correct subdivision of the database into-->
<!--                collections of leaf nodes (set of fragments contained by such nodes)-->
<!--                that on one hand will provide a good compression vs. standard CSNPs,-->
<!--                on the other hand it changes as little as possible. In case the boundaries of fragments covered by-->
<!--                a hash change all the time, the neighbors receiving the-->
<!--                information-->
<!--                may have to recompute the hashes rather than relying on a cache representing its own merkle tree of such hashes.-->
<!--                The generated subdivision should also produce enough "bins" no matter the distribution of the fragment IDs-->
<!--                in the network. This is important to prevent such things as packing of all the fragments into a single hash when-->
<!--                e.g. a particular hash function degenerates. And ideally, a hash mismatch should produce not more than a-->
<!--                single packet or two with hashes covering less fragments or ultimately, CSNPs.-->
<!--                Rather obviously, minimizing necessary I/O and computation will improve re-convergence performance.-->
<!--            </t>-->


            <t>
                To begin, in IS-IS networks we can fit just under 100 CSNP entries into the typical 1500-byte MTU frame.
                This is the consequence of each CSNP entry including the Node ID, Fragment number, Sequence number, Checksum, and
                Lifetime fields -- totaling 7 + 1 + 4 + 2 + 2 = 16 bytes per entry.
            </t>

<!--            <t>-->
<!--                To start our considerations, in IS-IS networks-->
<!--                we can fit into the prevailing 1500 bytes somewhat less than 100 of-->
<!--                PSNP entries for the foreseeable future. This is the consequence of-->
<!--                CSNP entries consuming Node ID + Fragment# + Seq# + CSUM + Lifetime-->
<!--                length which-->
<!--                amounts to 7 + 1 + 4 + 2 + 2 = 16 bytes each.-->
<!--            </t>-->


<!--            <t> Subsequently, fletcher hashes will occupy (as shown in <xref target="first-order"/>) the length-->
<!--                of 6 + 6 + 6 = 18 bytes-->
<!--                per hash-->
<!--                and hence around 70 of those hashes fit into a packet. Deeper considerations how the-->
<!--                48-bits fletcher checksum and subsequent hashes size has been chosen can be found in-->
<!--                <xref target="collisions"/>.-->
<!--            </t>-->

            <t>
                <xref target="SIPHASH"/> hashes, as shown in <xref target="first-order"/>, take up
                6 + 6 + 8 = 20 bytes each, which means that
                about 70 such hashes fit into a single packet. A more detailed explanation of why a 64-bit
                <xref target="SIPHASH"/> and the resulting hash sizes were chosen is provided in <xref target="collisions"/>.
            </t>

            <t>These limits form the basis for the recommended partitioning and packing strategies discussed later.</t>

            <t>
                At the lowest compression level, it is optimal to generate a single CSNP packet on a mismatch in
                a hash. To achieve this, the first-level hashes should initially group about 80 LSP fragments
                together, with exceptions handled later. There is no need to maximize this initial packing.
                </t>

            <t>

                As the LSDB grows, it is better to leave some flexibility ("slack") in how fragments are grouped. This
                increases the likelihood that both sides of an adjacency will maintain the same leaf-level packing, even
                during flooding transitions, and prevents the ranges from shifting constantly.


<!--                At "lowest" compression level, it is desirable to produce one CSNP packet on a miss on the merkle hash-->
<!--                and hence such "first level" hashes should  pack initially 80-->
<!--                LSP fragments with exceptions following later. There is no need to maximize such "initial packing".-->
<!--                LSDB may grow and to-->
<!--                maximize the chances of the same "leaf packing" ranges on both sides-->
<!--                of an adjacency, even during flooding transitions, some-->
<!--                "slack" is advisable so the ranges don't move all the time.-->
            </t>

            <t>
<!--                The packing will always put all fragments of a system and its pseudonodes-->
<!--                into the same node range (which of course can exceed-->
<!--                the advisable 80 fragments sometimes) and a first order leaf will be considered "full" if addition-->
<!--                of next System ID fragments would exceed this size (except obviously, when the leaf is empty).-->
<!--                At such point the range may be included into a hash of a less specific range which however on-->
<!--                mismatch will force disaggregation and sending of less specific ranges.-->

                The packing process always places all fragments belonging to the same system and its pseudonodes within
                a single node Merkle hash. This hash may occasionally exceed the recommended size of 80 fragments. When
                it comes to node range hashes, a
                hash is considered "full" when adding the next system's fragments would go beyond this
                limit - unless, of course, the leaf is still empty.
                </t>
            <t>

                At that point, any range hash can be folded into a higher-level, less specific range hash. However,
                whenever a mismatch
                occurs at any level, the process must disaggregate the hash and send the corresponding smaller, more
                specific range or node hash instead.
            </t>


            <section title="Denser Packing and Repacking">

                <t>To talk meaningfully about node range hashes, we will refer to hashes that cover a wider range of nodes as
                    less specific, and to those covering only a subset of that range as more specific.

                </t>

                <t>
                    Instead of, or in addition to, the "first level" packing a system can decide to pack "more densely".
                    In such a mode, the HSNP may include hashes that cover a much larger range than the
                    first-level hashes. How this denser packing is implemented is left up to the specific implementation.
                    </t>

                <t>
                    A good general approach is to increase packing density in parts of the database that have not changed,
                    where no hash mismatches have been observed, or when it is reasonable to assume that the
                    neighbor already holds a mostly synchronized database.
                </t>

                <t>



                    As a secondary consideration, it would be useful for efficient, cache-based implementations for both
                    sides to agree on the ranges of Merkle hashes advertised. This would make caching of Merkle nodes much more
                    effective. However, while this idea seems viable in theory, implementing it across a large number of
                    interfaces would effectively require a global synchronization protocol - something impractical in a
                    network where nodes are constantly adding, removing, and updating fragments asynchronously. These
                    ongoing changes continually affect what the "optimal" hash ranges are. And with enough churn, such
                    a range negotiation protocol might never converge at all.

                    </t>



                <t>
                    Alternatively, providing a fast way to reconstruct the internal Merkle hash for a mismatched range
                    could reduce the need for perfect range alignment. For example, in the proposed packing scheme, nodes
                    always agree on system ID boundaries. By maintaining a Merkle hash per system ID, a node can quickly
                    recompute the required hashes whenever received ranges differ from its cached ones - even in
                    networks with a large number of nodes.

                </t>

                <t>
                    It is still highly preferable for advertised Merkle hash ranges to align on system ID boundaries as
                    much as possible - especially at the top level. Under stable conditions, these top-level Merkle hashes
                    significantly reduce the amount of CSNP exchange required, minimizing both packet volume and
                    processing overhead.
                </t>
                <t>Even though a fully stable network could, in theory, be represented by a single hash covering the
                    entire LSDB, doing so is neither desirable nor beneficial. Since an HSNP packet must be sent anyway,
                    it is much better to fill it with around 70 node range hashes. This approach limits the amount of
                    decompression required if a mismatch occurs within one of those ranges and also reduces the risk of
                    hash collisions, as discussed in <xref target="collisions"/>.

                    </t>


                <t>
                    In summary, a node should avoid compressing beyond the point where a single HSNP covers the entire
                    database. Ideally, one HSNP should contain at most about (MTU / Node Range Hash Entry Size)
                    hashes - or fewer - to keep collision probabilities low, as described in detail in <xref target="collisions"/>.

                </t>


                <!--
                <t>
                    To attempt to settle on the same ranges in HSNPs an implementation of the suggested packing should let a
                    leaf that
                    drops under 50% occupancy "start robbing" system IDs from "left" of the next leaf until the current
                    leaf meets the "full condition". This is of course a recursive action that may ultimately generate
                    less leaves, remove some and in a recursive fashion lead to the same "greedy robbery from the left"
                    in the next level up. The "left" is colloquial here for starting with lowest System IDs under
                    normal sorting criteria.
                    On the other end of the spectrum a leaf that holds more than 150% of usual capacity (i.e. 80 * 1.5 LSPs or 60 * 1.5 Hashes)
                    should be preferably split into two leaves unless it holds a single System ID with more than 80 fragments.
                    Splitting the leaves may cause a repacking at a higher
                    level again in a recursive fashion.

                </t>

                <t>
                    The rebalancing of ranges to agree across all nodes and hence reduce hashing load
                    is a trade-off in terms of possibly large recomputation vs. suffering a penalty of
                    recomputing some hashes on disagreeing ranges on every exchange. Other solutions are
                    of course possible such as internal caches that keep the recomputed hashes for the
                    neighbor's ranges.
                </t>

                <t>
                    Precise splitting/merging algorithms agreed upon increase the likelihood of nodes ending up on
                    precisely same ranges. A possibly
                    simpler idea to discuss is to simply "repack" the whole thing on some balance violations or
                    periodically. As example, the optimal dynamic programming algorithm for the problem of
                    optimal partitioning into capacity limited segments is well known.
                    Another idea is to simply use ISIS fragment sliding but this may lead in worst
                    case to first level checksumming a single fragment over time.
</t>
                <t>
                    Overall, different partitioning and packing approaches are possible but if system ID
                    as natural partition is not used, this will likely change the packet
                    format since the partition boundaries will necessarily reflect which of the fragments are
                    covered by the hashes. Although, given that ordering of fragments has to be preserved it is hard to
                    imagine anything else but start and end consisting of fragment IDs.
                </t>

                -->

            </section>



        </section>

        <section title="Hash Function for a Fragment">
            <t>
                Each fragment generates a 64-bit <xref target="SIPHASH">siphash-1-3</xref>. The salt key
                is given in <xref target="refcode"/>.

            </t>

            <t>
                In case the fragment hash generates a zero value, the value MUST be replaced with
                the constant value 1.
            </t>

            <t>
                To validate correctness of an implementation a reference hash is given in
                <xref target="reference-hash"/>.
            </t>

            <figure anchor="reference-hash">
                <artwork align="left" name="" type="" alt=""><![CDATA[
Hash Variant: Siphash-1-3:64Bits
Fragment ID: 0101:0101:0000.01-01
Seq# $0001
Csum: $0001
Len: 0512
Hash $6EB348F808C9AE4E
]]></artwork>
            </figure>

        </section>

        <section title="Fast, Incremental, Self-Inverse Hashing Function for Fragment Ranges" anchor="hashfn">
            <t>
                Since large-scale deployments must compute significant numbers of hashes over sets of frequently changing
                fragments, it is highly desirable to use a specialized hash function that supports fast incremental
                updates when fragments are added, removed, or when their checksums change.
                </t>
            <t>
                Once hashes are built over sets of fragments, it is desirable to support very fast splitting and merging
                of such sets, especially when two hashes differ in which fragments they contain.
            </t>

            <t>
            Deeper considerations on such hashes can be found in
            <xref target="HASHES"/>
            but our design space
            is simplified since the security aspects of such hashes are not relevant here.
            </t>


            <t>
                The hash for a set of fragments is computed using a very fast XOR over their fragment hashes. This makes
                it straightforward to update the hash when a leaf is added, removed, or its checksum changes. As a
                result, less specific ranges can quickly derive their hash by XOR'ing the hashes of all included, more
                specific ranges, when those are available.

<!--                The hash function for sets of fragments is a very fast XOR operation of their fletcher hashes.-->
<!--                This allows to update the hash when adding, removing a leaf or-->
<!--                changing its checksum-->
<!--                in a very fast and simple manner. This also implies that less specific ranges can very quickly-->
<!--                compute their hash by XOR'ing all included more specific ranges if such hashes are available.-->
            </t>

            <!--
            <t>
                Ultimately, and fairly obviously, third order hash uses second order hash logic to keep its hash.
                This all means that every time a first order leaf changes the
                contained system IDs for
                some reason the merkle hashes will have to be readjusted recursively in according 2nd and third
                order leaves. This is in itself nothing
                particular since
                <em>any</em>
                change on first order leaf hash forces change on second order and consequently third order leaf hash.
                This is how Merkle
                trees work after all.
            </t>
            -->

        </section>

        <section title="Procedures">
            <section title="HSNP Support Negotiation">
                <t>
                    IIHs of nodes supporting this extension MUST include a new TLV that indicates
                    support for reception of HSNPs.
                    All nodes on the adjacency MUST advertise
                    the TLV on their IIHs, otherwise HSNPs are not used.
                    Observe that a node may very well just receive and process the HSNPs and always answer
                    them with the necessary CSNPs, although this is obviously less beneficial than fully supporting
                    sending and receiving of HSNPs.
                </t>
            </section>

            <section title="Advertising and Receiving HSNPs">
                <t>
                    Advertising standard CSNPs is extended with HSNP advertisements when this feature is supported. Since
                    both CSNPs and HSNPs carry range information in their headers, they can be freely mixed, depending
                    on which level of fragment "compression" best fits the situation. In practice, sufficiently specific
                    range mismatches will naturally fall back to CSNP exchanges or flooding to resolve remaining
                    differences.
<!--                    -->
<!--                    Advertising normal CSNPs is augmented with advertisement of HSNPs when this feature is supported.-->
<!--                    Since both CSNP and HSNPs advertise "range" in their headers they can even be-->
<!--                    randomly mixed depending what level of "compression" of fragments is best suited to the situation-->
<!--                    and range mismatches specific enough will end up to CSNP exchanges or flooding.-->
                </t>

                <t>
                    The ranges MUST be sorted based on Range Start System ID. The ranges MAY overlap, albeit this is
                    highly undesirable.
                </t>

                <t>
                    Any node IDs that are not covered by the ranges in a packet - either because there are gaps between
                    the advertised ranges, or between those ranges and the HSNP's Start and End System IDs - MUST be
                    treated as missing. Consequently, if a node detects that it holds Merkle hashes for LSPs that are
                    not covered by a received HSNP, it MUST behave as it would in the same situation with a CSNP, namely
                    by flooding the missing LSPs.

                    <!-- Alternately, instead of leaving a "hole" in the packet range, a
                    HSNP can be included with Merkle hash being set to 0 (which will generate a guaranteed miss) and
                    interpreted as nodes in the range being present but not covered by HSNP compression anymore.
                    The node sending such an entry SHOULD send according CSNPs before the according HSNP packet.
                    Observe that in the case the receiving node computed the "real" XOR'ed HSNP hash of the fragments of such
                    range as resulting in a 0 value, it MUST still treat the received HSNP hash as "mismatch" to prevent
                    a stable "hole" in the database on the peer.

                    -->


<!--                    Any node IDs not covered by ranges in the packet either due to "holes" between the advertised-->
<!--                    ranges or holes between contained ranges and the Start and End System ID of the HSNP MUST be considered-->
<!--                    as missing.-->
<!--                    Consequently, in case a node detects that it holds Merkle hashes for LSPs that are not covered by the-->
<!--                    received HSNP, it MUST trigger the same behavior as triggered by CSNP with this condition,-->
<!--                    i.e. flood the missing LSPs.-->
                </t>


                <t>
                    As with CSNPs, an HSNP whose first range covers the first node in the database MUST use
                    0000.0000.0000 as the start system ID in its packet range so that missing nodes can be detected. The
                    same rule applies at the other end of the database: an HSNP whose range covers the last node MUST
                    indicate this in a way that allows detection of any missing trailing nodes.


<!--                    -->
<!--                    As is the case with CSNPs the HSNP containing in its first range the first node of the-->
<!--                    database MUST indicate in its packet range 0000.0000.0000 as start system ID to-->
<!--                    allow detection of missing nodes. Analogous behavior applies to HSNP carrying hash-->
<!--                    covering the last node of the database.-->
                </t>

<!--                <t>-->
<!--                    When a node receives HSNPs with Merkle hash ranges, it MUST either compute-->
<!--                    and verify the hashes for the indicated ranges and in case of mismatch -->
<!--                    send more specific hashes for that range - in other-->
<!--                    words, "disaggregate". Disaggregation is less preferable, however, because if both sides repeatedly-->
<!--                    do this for wide mismatched ranges, it can lead to a ping-pong effect that ultimately falls back to-->
<!--                    full CSNP exchanges.-->
<!--                    -->
<!--                    In case a node receives HSNPs where the merkle hash ranges are not the same, the node MUST-->
<!--                    either compute and verify the hashes over the ranges indicated or-->
<!--                    send more specific hashes for the range or in other words "disaggregate". The disaggregation is-->
<!--                    less preferred since in case of  mismatches over wide ranges both sides-->
<!--                    using this strategy end up ultimately in a 'ping-pong' ending with CSNPs.-->
<!--                </t>-->

                <t>
                    When a node receives an HSNP where any of the contained hashes
                    does not match after recomputation or comparison, it MUST
                    immediately send HSNPs with Merkle hashes covering the mismatched ranges. These new hashes MUST be
                    more specific than the range where the mismatch occurred.
                    </t>
                <t>

                    Alternatively, instead of more specific HSNP hashes, 
                    a node MAY choose to send corresponding CSNPs, PSNPs, or flood the mismatched LSPs
                    directly. Sending CSNPs or flooding immediately may be preferable when the mismatch affects only a
                    small number of LSPs.
                    </t>

<!--                <t>-->
<!--                    -->
<!--                    A node receiving an HSNP where any of the hashes received does not match on recomputation or-->
<!--                    comparison the result of its own hash-->
<!--                    MUST send immediately HSNPs with Merkle hashes covering the-->
<!--                    ranges where the hash mismatch was detected. The sent hashes MUST be more specific than-->
<!--                    the range where the collison occurred.-->
<!--                    Alternately, a node MAY choose to-->
<!--                    immediately send according CSNPs, PSNPs or flood the LSPs that have been detected-->
<!--                    as not matching the merkle hashes. Sending CSNPs or immediate flooding-->
<!--                    may be preferable if the mismatch-->
<!--                    covers relatively few LSPs. </t>-->

                    <t>
                        If there is a mismatch - or no computation available - for a hash covering just a single node (with
                        its pseudonodes), or for a hash spanning fewer fragments than a full CSNP PDU, then CSNPs covering
                        all fragments of the node and its pseudonodes MUST
                        be sent. Alternatively, the node MAY choose to flood those specific fragments directly instead.

<!--                        In case of mismatch or no computation available for a hash covering only a single node-->
<!--                        (with its pnodes) or mismatch on a hash covering less than a CSNP PDU full of fragments,-->
<!--                        a CSNP MUST be sent or alternately the node MAY choose to flood those fragments.-->
                    </t>


            </section>

        </section>

        <section title="HSNP PDU Format" anchor="format">
            <t>
<!--                HSNP PDU Format follows closely CSNP format where instead of CSNP entries the according merkle-->
<!--                hashes are propagated, i.e. the hashes incorporate strictly the same-->
<!--                fragments that are being incorporated in-->
<!--                CSNP packets. Start and End System IDs do not include pseudo node bytes as those are subsumed-->
<!--                implicitly.-->

                The HSNP PDU format closely follows the CSNP format. Instead of CSNP entries, it carries the
                corresponding Merkle hashes - which cover exactly the same fragments that would appear in CSNP packets.
                The Start and End System IDs exclude pseudonode bytes, as those are implicitly included within the
                ranges.
            </t>

            <figure>
            <artwork align="center" name="" type="" alt=""><![CDATA[
        ...

+--------------------------------------------+
|              PDU Length                    |
+--------------------------------------------+
|              Source ID                     |
+--------------------------------------------+
|              Start System ID               |
+--------------------------------------------+
|              End System ID                 |
+--------------------------------------------+
|           Variable Length Fields           |
+--------------------------------------------+

]]></artwork>
            </figure>

            <t>
                The Start and End System IDs use the standard ID length and indicate the range of fragments covered by
                the HSNP, just like CSNPs do. The key difference is that all pseudonodes of the systems within this
                range are implicitly included (implying as well that all fragments of the range are included).
                Both the Start and End System IDs are inclusive, meaning fragments from
                both endpoints are part of the range.


<!--                Start and End System IDs are or the usual ID Length and indicate, just like CSNP do,-->
<!--                the range of fragments that the HSNP covers with the difference that all pnodes of-->
<!--                the systems are implied in the range. Start and End System ID are inclusive, i.e.-->
<!--                the fragments of both the start and the end system ID are included.-->
            </t>

            <t>
                The variable length fields are a sorted sequence of Node Range Hash Entries
                in the following format.
            </t>

        <figure>
            <artwork align="center" name="" type="" alt=""><![CDATA[

+--------------------------------------------+
|              Range Start System ID         |
+--------------------------------------------+
|              Range End System ID           |
+--------------------------------------------+
|              Merkle Hash                   |
+--------------------------------------------+
]]></artwork>
        </figure>

            <t>The Range Start and Range End System IDs delimit the range of LSPs included in the hash. Both are
                inclusive, i.e. the fragments of both the start and the end system ID are contained within the range.</t>
            <t>
                Merkle hash consists of 8 bytes of the 64-bit computed hash of all fragments covered
                by the range.
            </t>

            <t>
                This makes an entry in typical deployment scenarios 6 + 6 + 8 = 20 bytes long, and hence about 70
                hashes fit into a typical MTU.
            </t>

            <t>Ranges MUST be sorted by Range Start System ID.</t>

        </section>

        <section title="Example">


            <t>An example will clarify things further. Consider an LSDB with 512 nodes, each having a system ID of
                1000.0000.00&lt;2 digits node-id&gt; and holding 32 fragments numbered 0-31. We skip odd node
                identifiers to create intentional "holes" in the numbering.
                The pseudonode byte is treated simply as part of the system ID, since it doesn't affect the
                scheme itself.

<!--                An example will serve well here. We limit ourselves in the examples to consideration of a-->
<!--                LSDB with 512 nodes with system identifiers of 1000.0000.00 &lt;2 digits node-id&gt; each holding-->
<!--                32 fragments-->
<!--                numbered 0 to 31.-->
<!--                We leave the uneven node identifiers out to have some "holes" in the numbering to hit some corner-->
<!--                cases in further examples.-->
<!--                We disregard the pseudo node byte as simply another byte of system identifier since it does not-->
<!--                contribute-->
<!--                further details to the scheme.-->

            </t>

            <t>
                In a stable state, reasonable compression can deliver 128 "first-order" leaves - each containing fragments
                from 2 systems (64 fragments total) - requiring roughly 512 / (2 * 70) ~ 4 packets. The first of these
                "first-order" packets would look approximately like this:


<!--                In a stable state we could expect a reasonable compression-->
<!--                with the following 128 "first order" leaves (each holding 2 systems worth-->
<!--                of fragments, hence 64 fragments) and thus generating 512 / (2 * 70) ~ 4 packets.-->
<!--                First of the  "first order" packets-->
<!--                will look roughly like this-->
            </t>

            <figure anchor="first-order">
                <artwork align="center" name="" type="" alt=""><![CDATA[
        ...

+--------------------------------------------+
|  Start System ID: 0000.0000.0000           |
+--------------------------------------------+
|  End System ID:   0000.0000.00A0           | // 80 ranges covering 160 nodes
+--------------------------------------------+
+--------------------------------------------+
|  Start System ID: 1000.0000.0000           |
+--------------------------------------------+
|  End System ID:   1000.0000.0002           | // 64 fragments over 2 systems
+--------------------------------------------+
|              Merkle Hash                   |
+--------------------------------------------+
..
|  Start System ID: 1000.0000.008E           |
+--------------------------------------------+
|  End System ID:   1000.0000.00A0           |
+--------------------------------------------+
|              Merkle Hash                   | // 64 fragments over 2 systems
+--------------------------------------------+
]]></artwork>
            </figure>

            <t>
<!--                Based on a local decision a node can start to further compress the HSNPs until-->
<!--                in most extreme case it will send just one packet full of hashes. This will divide-->
<!--                in our case 512 nodes across 70 hashes (since all of those have same amount of fragments-->
<!--                for simplicity reason). This ends up being about 8 nodes per hash (equal to 8 * 32 fragments)-->
<!--                and the packet will look as following.-->

                Based on local decisions, a node can further compress HSNPs until - in the most extreme case - it sends just
                one packet full of hashes. In our example with 512 nodes, this divides them across 70 hashes (assuming
                equal fragment counts for simplicity), resulting in about 8 nodes per hash (equivalent to 8 * 32
                fragments). The resulting packet would look like this:
            </t>

            <figure anchor="second-order">
                <artwork align="center" name="" type="" alt=""><![CDATA[
        ...

+--------------------------------------------+
|  Start System ID: 0000.0000.0000           |
+--------------------------------------------+
|  End System ID:   FFFF.FFFF.FFFF           |
+--------------------------------------------+
|  Start System ID: 1000.0000.0000           |
+--------------------------------------------+
|  End System ID:   1000.0000.0010           |
+--------------------------------------------+
|              Merkle Hash                   |
...
+--------------------------------------------+
|  Start System ID: 1000.0000.01F0           |
+--------------------------------------------+
|  End System ID:   1000.0000.0200           |
+--------------------------------------------+
|              Merkle Hash                   |
+--------------------------------------------+
]]></artwork>
            </figure>

        </section>


        <section title="IS-IS Scale Envelope Considerations" toc="default" anchor="envelope">
<!--            <t>-->
<!--                As first, obvious observation, HSNPs are of negligible value on small networks of tens of nodes-->
<!--                and hundreds of fragments. Arguably, perfectly correctly implemented flooding is enough at any size (which is-->
<!--                a very optimistic assumption historically speaking) and CSNPs are unnecessary overhead built into-->
<!--                the protocol although practically speaking they contributed largely to the stability of the protocol-->
<!--                deployments over years.-->
<!--                The larger the network becomes the higher the cost of link flap or node restart and the higher-->
<!--                the steady state cost of issuing periodic CSNPs and hence HSNP feature can stretch the scale of IS-IS-->
<!--                by a significant degree.-->
<!--                </t>-->

            <t>
                HSNPs provide negligible benefit in small networks with only tens of nodes and hundreds of fragments.
                Perfect flooding would theoretically suffice at any scale (though history shows this is too
                optimistic), and under such an assumption even CSNPs would represent mere protocol overhead - yet in
                practice they have contributed significantly to the stability of IS-IS deployments.
                </t>
            <t>
                As networks grow larger, the costs of link flaps, node restarts, and periodic CSNP exchanges increase
                substantially. HSNPs can significantly extend IS-IS scalability in these scenarios.
            </t>



<!--            <t>-->
<!--                If we start from the assumption that we want to push IS-IS to its practical limit we realize-->
<!--                that in deployment we have to contend with the limit of what comprises practically viable-->
<!--                flooding rate generated by fragment refreshes and database synchronization.-->
<!--                Path computations still play a role but-->
<!--                they can be much better deferred and take advantage of techniques like dampening and parallelization so-->
<!--                we disregard their impact here unless we deal with pathological cases of 1E6 nodes in a chain-->
<!--                with 1 fragment-->
<!--                each. -->
<!--                Hence, with more realistic assumption of something like 50E3 nodes and 1E6 fragments-->
<!--                we are at maximum configured lifetime in the realms of-->
<!--                1E6/2^16 fragments being refreshed per second and thus generating about 15 packets per-->
<!--                second load per interface (disregarding any flood reduction attempts using techniques like-->
<!--                <xref target="ID.draft-ietf-lsr-distoptflood-11"/>). Additionally, 1E6 fragments generate-->
<!--                about 10E3 CSNP packets on prevalent interface MTUs and thus during LSDB synchronization, to-->
<!--                achieve something in the order of relatively lame 120 seconds LSDB sync up we would have to contend with a-->
<!--                flooding rate of roughly 80 CSNPs per seconds. Multiplied that by an envelope of 16E3 desirable interfaces-->
<!--                we look at a sustained peak flooding rate to the control plane of somewhere around 1.5E6 packets-->
<!--                per second in the system, something that is at least an order of magnitude outside the envelope of any fast flooding-->
<!--                technique and power available for a on-system distributed control plane. Assuming maximum compression-->
<!--                using HSNP we are however in the order of one additional packet on top of the flooding caused by-->
<!--                refreshes. This leaves the 15 * 16E3 ~ 2.4E5 flooding rate to be shouldered by reduction and-->
<!--                fast flooding techniques. Not an easy task but within the realm of the possible.-->

            <t>
                To push IS-IS to its practical limits, we must account for flooding rates driven by fragment refreshes
                and LSDB synchronization. Path computation impacts can be deferred - barring pathological scenarios -
                by using dampening and parallelization,
                and thus we focus on realistic scenarios in the order of 50,000 nodes and 1 million fragments.
            </t>
            <t>
                At maximum configured lifetime, this generates ~15 packets/second per interface from refreshes (1M/65K
                fragments/second), plus ~10,000 CSNP packets for full LSDB sync.
            </t>
            <t>
                Achieving a modest 120-second sync
                requires ~80 CSNPs/second, and across 16,000 interfaces, that represents a peak of 1.5 million
                packets/second - far
                beyond current fast-flooding capabilities.
                We disregard here further techniques like <xref target="ID.draft-ietf-lsr-distoptflood-11"/>, 
                especially since they do not improve CSNP scale.
            </t>

            <t>

                With maximum HSNP compression, however, sync overhead drops to roughly one additional packet beyond
                refresh flooding, leaving ~250,000 packets/second (15 * 16K) to be handled by fast flooding and
                flood reduction
                techniques - a challenging but feasible target.
            </t>


            <t>
                The considerations above make it clear that combining fast flooding, flood reduction, and HSNP features
                will be essential in extending IS-IS scalability as deployments continue to grow larger.

<!--                With the above considerations it should become clear that a mixture of fast flooding, flood reduction-->
<!--                and HSNP features will be critical to extend the deployable envelope of IS-IS given the ever-increasing-->
<!--                deployment of the protocol.-->
            </t>

        </section>

        <section title="Further Considerations" toc="default" anchor="further">

            <section title="Maximum Advisable Hash Coverage">
                <t>
<!--                Although the mechanism can be applied to the point where a single merkle hash represents-->
<!--                an arbitrarily large database, such a single hash is not advisable. Limiting hash coverage-->
<!--                to the point where at maximum a single full HSNP packet is far more preferable.-->

                    Although the mechanism can theoretically use a single Merkle hash to represent an arbitrarily large
                    database, such an approach is not advisable. Instead, it is far preferable to limit hash coverage so
                    that at minimum one full HSNP packet is required.
                </t>

                <t>
                    In practice, limiting compression so that a maximum of about a dozen HSNP packets covers the entire
                    database is usually sufficient. For example, a single maximally compressed HSNP packet for a
                    10,000-fragment database covers ~140 fragments per hash. Allowing for more HSNP packets (e.g., 10
                    HSNP packets instead of 100 CSNPs) still provides a 10x compression factor, reduces disaggregation
                    needs during LSDB changes, and further lowers the already negligible collision risk (which in a
                    10K-sized LSDB is vanishingly small even with a single hash).


<!--                    Generally, limiting compression to the point where maximum of a dozen HSNP packets cover the database-->
<!--                    can be practically speaking better than enough. As practical example a single HSNP packet on-->
<!--                    a 1E3 fragments database compressed to maximum extent-->
<!--                    covers with each hash ~140 fragments. Allowing more HSNP packets will lead to less disaggregation-->
<!--                    on LSDB changes obviously, lower collision possibility further (which in 1E3 sized LSDB is however-->
<!--                    vanishingly small with a single hash already) and e.g.-->
<!--                    10 HSNP packets compared to 100 CSNP packets covering the LSDB normally-->
<!--                    is a compression factor of 10x already.-->
                </t>
            </section>

            <section title="Hash Collision Probabilities" anchor="collisions">
<!--                <t>-->
<!--                    As first observation it is worth noting that even with CSNPs or PSNPs IS-IS does-->
<!--                    harbor a corner case where LSDB may not end up synchronized, especially on a node restart.-->
<!--                    In simple terms, issuing a fragment with the same sequence number and checksum'ed content that-->
<!--                    collides with a previously sent fragment with a dissimilar content that generates however-->
<!--                    the same 16 bit fletcher checksum will go undetected until the fragment lifetime expires.-->
<!--                    The likelihood of such an event is largely determined by the assumptions as to likelihood-->
<!--                    of a fletcher checksum colliding. A simple assumption we will work from is that values of-->
<!--                    fletcher-->
<!--                    checksums are uniformly likely, even if the content varies by relatively few bytes.-->
<!--                    With that assumption, the-->
<!--                    likelihood of such event is simply 1/2^16 or 0.001%. This will allow for further-->
<!--                    comparisons with HSNP fletcher collision likelihood.-->
<!--                    </t>-->
                <t>


                    Even with CSNPs or PSNPs, IS-IS has a corner case where LSDB synchronization can fail - particularly
                    during node restarts. In simple terms, if a new fragment has the same sequence number and
                    different content but an identical 16-bit Fletcher
                    checksum, the collision goes undetected until the fragment expires.
                </t>
                <t>
                    Assuming Fletcher checksums are uniformly distributed (even with minor content changes), the
                    collision probability for that case is 1/2^16 ~ 0.0015%. This baseline enables meaningful
                    comparisons with HSNP hash collision probabilities.
                </t>
                <t>
<!--                    HSNP generates 48 bit fletcher checksums over what is basically PSNP data of a fragment-->
<!--                    and ISIS length of the fragment.-->
<!--                    We will be concerned about the likelihood of two fragments being at the same time generating-->
<!--                    the same fletcher checksum while they are covered by the same HSNP since such a constellation-->
<!--                    will make both fragments "disappear" due to the nature of the XOR checksum. Any collision-->
<!--                    that is not the in the node range hash is irrelevant.-->

                    HSNP uses 64-bit <xref target="SIPHASH"/> over what is essentially PSNP data for a fragment plus the
                    fragment's IS-IS length. The key concern is the probability that two fragments - covered by the same
                    hash - generate the same hash simultaneously. This would cause both fragments to
                    "disappear" due to the XOR checksum nature. Collisions occurring for fragments in 
                    different node range hashes are irrelevant.

                </t>
                <t>
                    One might argue that XORing different sets of hashes could produce the same result, but the
                    probability of two distinct sets having identical modulo-2 sums across all 64 bits is vanishingly
                    small. This scenario is not considered further.
                    </t>
<!--                <t>-->
<!--                    One could further argue that XOR'ing several hashes can produce the same value as another-->
<!--                set of XOR'ed hashes but the likelihood of the sets having the same combination of modulo-->
<!--                1 sums on all of the 48 bits is so small (somewhere in the order of 1E-15)-->
<!--                    that we don't consider it further.-->
<!--                </t>-->

                <t>
                    Collision probability analysis is complex for the general case, though the birthday paradox gives a
                    rough estimate of 0.18% collision likelihood for 48-bit hashes in a 1M-sized set. For 64-bit
                    hashes this intuitively reduces to ~0.000,002,7%. To reflect reality better than
                    simply relying on statistical assumptions, we instead rely on extensive simulations that
                    mirror real-world conditions.
                </t>
                <t>
                    These simulations model 50,000-node networks with 1M fragments, assuming node IDs differ by only 3
                    bytes, maximum fragment lifetimes, random protocol checksums on fragment refresh, and packet length changes
                    in just 5% of refreshes to reflect network stability. Results are derived from 32 networks running
                    for 2 years each.
                   </t>

<!--                <t>-->
<!--                    Probability statistics in not trivial to deal with the generic case of collisions (though it delivers based on-->
<!--                    birthday paradox first ballpark number of 0.18% collision likelihood in-->
<!--                    48 bit numbers colliding in 1E6 sized set, i.e. a single hash). Intuitively we would expect that using-->
<!--                    64 bit numbers with resulting birthday paradox probability of 0.0000027% would be necessary.-->
<!--                    But with intention of mirroring operational realities as close as possible, we fall on extensive-->
<!--                    simulations of refreshes on a vast network of 50E3 nodes with 1E6 fragments under assumptions-->
<!--                    of node IDs differing in 3 bytes only, maximum fragment lifetime and protocol checksums-->
<!--                    on each sequence number refresh being a random value. We change the length of the packet in-->
<!--                    5% of all refreshes only to mirror a rather stable network. 32 networks, each running for 2 years-->
<!--                    are used to derive the numbers.-->
<!--                </t>-->
                <t>
                    Across 64 years of simulation and resulting
                    36E9 refreshes of all fragments, we observed a total of 142 collisions for 48 bits variant of
                    <xref target="SIPHASH"/> on the whole set or roughly ~0.000,000,4%, much lower than birthday paradox
                    prediction.
                    However, assuming a single HSNP packet covering the whole database only 3 collisions of those matter
                    - a probability of ~0.000,000,008%,
                    or roughly 1 occurrence per 20 years. These collisions have an average lifetime of about 10 hours. These
                    rates are orders of magnitude lower than the birthday paradox predictions, likely because node
                    IDs act as a consistent "salt," effectively pre-partitioning the probability space. This would be
                    arguably "good enough" by a long stretch.

<!--                    The absolute number of collisions in 36E9 refreshes over set of all fragments-->
<!--                    is in the order of 200-300 collisions or-->
<!--                    6E-9 probability or expressed otherwise about 3 occurrences a year. Lifetime of such a-->
<!--                    collision shows on simulations as average of about 10 hours. The 6E-9 being significantly-->
<!--                    lower than the 1E-3 predicted by birthday paradox is likely to be attributed to the fact-->
<!--                    that node id can be seen as an always differing "salt" to the checksums and hence "pre-partitioning"-->
<!--                    the space of probabilities.-->
                </t>

                <t>
                    Nevertheless, on further investigation of <xref target="SIPHASH"/> using the standard 64-bit 1-3
                    variant, simulations over the same scenario generate *no* detectable collisions. Measuring the CPU
                    cost of the 48-bit variant vs. the 64-bit variant - or even a 64-bit variant of the traditional
                    Fletcher checksum - shows negligible differences of low single-digit percent when using modern
                    implementation techniques. Thus, the 64-bit <xref target="SIPHASH"/> 1-3 variant has been chosen
                    and should provide a very safe margin even for much larger databases.
                </t>

<!--                <t>-->
<!--                    The situation becomes more practically relevant when considering collisions within a single-->
<!--                    highest-compression packet containing 70 hashes. This roughly halves the collision rate, leading to-->
<!--                    about 1 collision per year in such a large network. If compression is instead limited to 70 HSNP-->
<!--                    packets (rather than maximum compression), the rate drops further to approximately 1 collision per-->
<!--                    10 years.-->
<!--                    </t>-->

<!--                <t>-->
<!--                    The situation becomes more interesting and practically relevant-->
<!--                    once we start to consider how many collision occur in-->
<!--                    the highest compression packet with 70 hashes. This roughly halves the amount of collisions-->
<!--                    so we can expect in such a large network about 1 collision per year. Once we go to the assumption-->
<!--                    that we don't use maximum compression but limit it to 70 HSNP packets we end up with about 1 collision-->
<!--                    per 10 years.-->
<!--                </t>-->

<!--                <t>-->
<!--                    Surprisingly, switching to 64-bit hashes reduces total collisions by only about 50%, and under-->
<!--                    maximum compression (single full HSNP packet), the results are actually measurably worse. This-->
<!--                    appears to be because 64-bit collisions tend to cluster more closely together in the database - a-->
<!--                    phenomenon we currently lack an explanation for.-->

<!--&lt;!&ndash;                Surprisingly enough, going to 64 bit drops the total number of collisions by about 50% but under the&ndash;&gt;-->
<!--&lt;!&ndash;                assumptions of a single full HSNP packet the outcome is measurably worse. This is based on the fact&ndash;&gt;-->
<!--&lt;!&ndash;                taht 64 bit collision hashes occur much "closer together" on the database, something for which we lack an explanation.&ndash;&gt;-->
<!--                </t>-->

                <t>
                    Ultimately, a highly conservative (not to say paranoid) implementation can simply monitor the LSDB for
                    fragments whose hashes collide and prevent such fragments from being covered by the same HSNP hash.
                    This forces receiving nodes to use separate, collision-free hashes instead. Such an approach
                    completely eliminates any risk of synchronization misses when using HSNPs.
                </t>

<!--                <t>Obviously, a very, very conservative, not to say paranoid implementation can easily-->
<!--                monitor the LSDB for presence of colliding fletcher hashes on the fragments and simply prevent-->
<!--                such fragments being included in a single HSNP hash which will force the receiving node-->
<!--                to consider separate hashes without a collision in them. This is sufficient to prevent-->
<!--                any kind of "misses" when using HSNP to synchronize the databases.</t>-->
                <t>
                    Other techniques are possible, such as slowly walking the database and sending CSNPs. However, for a
                    1M-fragment database that generates 10,000 such CSNP packets, the chance of this detecting a collision
                    during its 10-hour window is likely extremely small.
<!--                    -->
<!--                    Other techniques are obviously possible like walking the database at slow speed and sending-->
<!--                    CSNPs. Given a 1E6 fragments database generates 10E3 such packets the chance of this preventing a-->
<!--                    collision during its 10 hours time is probably extremely small.-->
                </t>
            </section>

            <section title="Impact of Packet Losses">
                <t>
                    Hashes covering large numbers of fragments are more vulnerable to packet losses, as each lost packet
                    affects a much larger portion of the LSDB during synchronization. Implementations can choose HSNP
                    node ranges freely, but should balance maximum compression against "good enough" compression that
                    reduces both collision risk and vulnerability to unavoidable packet drops.

<!--                    Hashes covering large numbers of fragments will be more susceptible to packet losses since each loss-->
<!--                    covers a much larger part of the LSDB during synchronization. An implementation can choose the-->
<!--                    node ranges covered by HSNPs in any way it desires but a consideration should be a balance-->
<!--                    between maximum reduction and a compression "good enough" that is less prone to collisions and-->
<!--                    unavoidable packet losses.-->
                </t>
            </section>

            <section title="Decompression and Caching/Comparison Optimizations">
                <t>
<!--                    As mentioned above a node may apply many strategies to speed up decompression.-->
<!--                    E.g. LSPs missing in HSNPs as not covered by ranges are clearly "missing in action" and can-->
<!--                    be reflooded. As another example,-->
<!--                    small ranges where merkle mismatched can generate CSNPs, PSNPs or lead to flooding immediately.-->

                    As mentioned earlier, nodes can use various strategies to accelerate decompression. For example,
                    LSPs missing from HSNPs (those not covered by any ranges) are clearly absent and can be immediately
                    reflooded. Similarly, small mismatched Merkle ranges can trigger immediate CSNPs, PSNPs, or direct
                    flooding.

                </t>
                <t>
<!--                    Caching of hashes can be applied at many levels since the merkle hashes suggested here are-->
<!--                    easily computed. Obviously keeping a hash on all fragments of a node and its pnodes is the-->
<!--                    simplest and most relevant candidate but other resolutions are easily achievable.-->
<!--                    Even if certain elements must be removed e.g. on receiving a-->
<!--                    range &lt;A &#45;&#45; B&gt; while the node already holds &lt; A &#45;&#45; B &amp; next-after-B&gt; the hash can be simply-->
<!--                    adjusted by removing 'next-after-B' node merkle hash from the cached result.-->

                    Caching of hashes can be applied at many levels. The
                    simplest and most useful approach is maintaining a hash for all fragments of a node and its
                    pseudonodes, though other granularities work equally well. Even when adjusting for changes - such as
                    receiving a range &lt; A - B &gt; while having cached &lt; A - B &amp; next-after-B &gt;
                    - the cached hash can be quickly updated by
                    simply XORing out the next-after-B node Merkle hash.

                </t>
            </section>

        </section> <!-- end of section -->

        <section title="Security Considerations" toc="default">

            <t>TBD
            </t>

        </section> <!-- end of security considerations -->

        <section anchor="IGP_IANA" title="IANA Section">
            <t>TBD
            </t>
        </section>

        <!-- 2 -->
        <section title="Contributors" toc="default">

            <t>TBD</t>

        </section> <!-- end of contributors -->

        <!-- 2 -->
        <section title="Acknowledgement" toc="default">
            <t>

                People have been talking about "compressing CSNPs" for a very long time, reportedly going back to when
                Radia Perlman and an insomniac Dave Katz were walking the halls discussing it. Recent attempts to scale
                the protocol much further have made it worthwhile to turn this idea into a standardized, practical
                engineering solution.
            </t>
            <t>
                Les Ginsberg identified several unresolved issues and contributed alternative ideas to the draft.
            </t>
            <t>
                Job Snijders initiated discussion of <xref target="SIPHASH"/> being likely a better solution than
                traditional Fletcher checksumming of the fragments.
            </t>

        </section> <!-- end of acknowledgement -->

    </middle>

    <back>


        <references title="Normative References">

            <!-- &RFC3032;
            &RFC8296;
            -->

            <reference anchor="MERKLE">
                <front>
                    <title>A Digital Signature Based on a Conventional Encryption Function</title>
                    <author initials="R.C." surname="Merkle">
                    </author>
                    <date year="1988"/>
                    <keyword>Advances in Cryptology &#8211; CRYPTO '87</keyword>
                </front>
            </reference>

            <reference anchor="SIPHASH" target="https://131002.net/siphash/">
                <front>
                    <title>SipHash: A Fast Short-Input PRF</title>
                    <author initials="J.-P." surname="Aumasson" fullname="Jean-Philippe Aumasson"/>
                    <author initials="D. J." surname="Bernstein" fullname="Daniel J. Bernstein"/>
                    <date year="2012"/>
                </front>
                <seriesInfo name="Lecture Notes in Computer Science" value="Vol. 7668, INDOCRYPT 2012, pp. 489-508"/>
            </reference>

            <reference anchor="HASHES">
                <front>
                    <title>Security considerations for incremental hash functions based on pair block chaining</title>
                    <author initials="C.-W." surname="Phan">
                    </author>
                    <date year="2006"/>
                    <keyword>Computers and Security 25</keyword>

                </front>
            </reference>

            <reference anchor="SKIP">
                <front>
                    <title>Skip lists: A probabilistic alternative to balanced trees</title>
                    <author initials="W." surname="Pugh">
                    </author>
                    <date year="1990"/>
                    <keyword>Communications of the ACM</keyword>

                </front>
            </reference>

        </references> <!-- end of normative references -->

        <references title="Informative References">

            <reference anchor="ID.draft-ietf-lsr-distoptflood-11">
                <front>
                    <title>IS-IS Distributed Flooding Reduction</title>
                    <author initials="R." surname="White et al.">
                        <organization/>
                    </author>
                    <date month="Oct" year="2025"/>
                </front>
                <format target="https://www.ietf.org/id/draft-ietf-lsr-distoptflood-11.txt"
                        type="TXT"/>
            </reference>


        </references> <!-- end of informative references -->

        <section title="Reference Implementation of SIP Fragment Hashing" anchor="refcode">

            <figure>
            <artwork align="center" name="" type="" alt=""><![CDATA[
<CODE BEGINS>
pub fn fragment_hash(
    fragmentid: &SharedFragmentID,
    fragmentcontent: &FragmentContent,
    variant: Option<HSNPFragmentHashVariant>,
    size: Option<HSNPSize>,
) -> HSNPHash {
    let nid = fragmentid.node.node_id().0;
    let pnodebe = fragmentid.pnode.0.to_be_bytes();
    let seqnrbe = fragmentcontent.seqnr.0.to_be_bytes();
    let fragmentnrbe = fragmentid.fragmentnr.0.to_be_bytes();
    let csumbe = fragmentcontent.isis_checksum.0.to_be_bytes();
    let lenbe = fragmentcontent.isis_pdu_length.0.to_be_bytes();

    let mut rotate_in_primary = nid.iter().chain(
        csumbe.iter().chain(
            seqnrbe.iter().chain(
                fragmentnrbe
                    .iter()
                    .chain(lenbe.iter().chain(pnodebe.iter())),
            ),
        ),
    );

    let size = size.unwrap_or(HSNPSize::LIBRARY_HSNP_SIZE);
    let variant = variant.unwrap_or(HSNPFragmentHashVariant::LIBRARY_HSNP_FRAGMENT_HASH);

    match variant {
        HSNPFragmentHashVariant::Siphash => {

            let key = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16u8];
            let mut hasher = SipHasher13::new_with_key(&key);
            let mut sl = [0u8; 16];
            rotate_in_primary
                .map(|v| *v)
                .collect_slice_checked(&mut sl[..]);
            hasher.write(&sl);
            let r = hasher.finish();
            match size {
                HSNPSize::_64Bits => {
                    r.into()
                },
                HSNPSize::_48Bits => {
                    let hin = r ^ (r >> 48);
                    (hin & 0xffff_ffff_ffff).into()
                }
<CODE ENDS>
]]></artwork>
            </figure>
        </section>

    </back>

</rfc>
