E1a proved the markdown round-trip unstable (NO-GO). This re-baselines the E0.2 ccHash contract to hash Foundry HTML directly — the E1b-alt fork — which sidesteps all 5 E1a failure reasons (no inverse, no resolver, no blank-line/case/order sensitivity, no parseBody coupling). - src/canonicalize-html.ts: canonicalizeHtml(html) — linkedom DOM walk that absorbs serialization drift (attribute order/quoting, named-vs-numeric entities, inter-tag whitespace, tag case, self-closing) while preserving content (structure, attr values, meaningful text). Two inputs parsing to the same DOM → same canonical string. Mini-gate: tests/canonicalize-html.test.ts (9 tests — serialization variants → same canonical; content change → different). - src/cchash.ts: rewritten to ccHash = contentHash(canonicalizeHtml(data.description) + "\n" + canonicalizeHtml(data.notes ?? "") + "\n" + name + "\n" + folder). The HtmlToMarkdown seam is DROPPED; a CanonicalizeHtml seam (default = canonicalizeHtml) replaces it. CC_HASH_CONTRACT updated + pinned + re-derivation-enforced. CcHashError on missing description kept; direction-invariance kept (name/folder from liveEntry); folder = Foundry folder ID, distinct from Obsidian foundry.folder_path. tests/cchash.test.ts updated (21 tests incl. serialization-drift-absorption + no-false-negative). - src/fromFoundry.ts (the E1a markdown inverse) ships unwired — not consumed by ccHash; remains as the spike artifact's inverse. tsc clean; 67 E0+E1a+E1b-alt tests pass; 112 passing project-wide (18 pre-existing fixture-missing failures unchanged). Co-Authored-By: Claude <noreply@anthropic.com>
95 lines
4.6 KiB
TypeScript
95 lines
4.6 KiB
TypeScript
import { describe, it, expect } from "vitest";
|
|
import { canonicalizeHtml } from "../src/canonicalize-html.js";
|
|
|
|
// Baseline document carrying the features the canonicalizer must normalize: a
// styled container, a paragraph whose ampersand is written as a proper named
// entity (the forward's escapeHtml emits &amp;, and Foundry stores/returns it
// verbatim) followed by an inline child, plus a void element with two
// attributes.
const BASE = '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>';

// Re-serializations of BASE. Each entry changes exactly ONE serialization
// detail and nothing about the content (it parses to the same DOM), so each
// must canonicalize to the SAME string as BASE. Drifts defended: attribute
// order, quoting, named-vs-numeric entity, inter-tag whitespace, self-closing
// slash, tag/attr case.
const VARIANTS = [
  // attribute order swapped on <img>
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img alt="alt" src="x.png"></div>',
  // single-quoted attributes
  "<div style='display:flex'><p>Hello &amp; <b>world</b></p><img src='x.png' alt='alt'></div>",
  // numeric character reference &#38; instead of the named &amp; (both decode
  // to "&"); this literal must NOT be byte-identical to BASE, otherwise the
  // entity drift is never exercised
  '<div style="display:flex"><p>Hello &#38; <b>world</b></p><img src="x.png" alt="alt"></div>',
  // inter-tag whitespace / newlines (indentation the serializer may add or drop)
  '<div style="display:flex">\n <p>Hello &amp; <b>world</b></p>\n <img src="x.png" alt="alt">\n</div>',
  // self-closing slash on the void <img>
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt" /></div>',
  // uppercase tags + attributes (entity names are case-sensitive, so &amp; stays lowercase)
  '<DIV STYLE="display:flex"><P>Hello &amp; <B>world</B></P><IMG SRC="x.png" ALT="alt"></DIV>',
];
|
|
|
|
// Stability gate: inputs that differ only in how the HTML is serialized must
// produce one and the same canonical string.
describe("canonicalizeHtml — serialization-drift stability (E1b-alt mini-gate)", () => {
  it("is deterministic: same input → same canonical across runs", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml(BASE);
    expect(a).toBe(b);
    // Sanity check that the output is real markup starting at the root
    // element, not an empty or error string.
    expect(a).toMatch(/^<div/);
  });

  it("all serialization variants canonicalize to the SAME string (drift absorbed)", () => {
    const baseCanon = canonicalizeHtml(BASE);
    for (const [i, v] of VARIANTS.entries()) {
      // The message pinpoints which variant drifted when the assertion fails.
      expect(canonicalizeHtml(v), `variant ${i}: ${v}`).toBe(baseCanon);
    }
  });

  it("the canonical form is the compact, normalized shape", () => {
    // Sorted attrs (alt before src), double-quoted, lowercased, void <img> with
    // no closing slash, no inter-tag whitespace. The ampersand is decoded to
    // "&" on parse and re-encoded as the entity &amp; on output, so the pinned
    // string must carry the entity, not a bare "&". The space between the
    // ampersand and <b> is a whitespace-only node after the entity decode and
    // is dropped — consistently for every entity-encoded variant, so the hash
    // is stable.
    expect(canonicalizeHtml(BASE)).toBe(
      '<div style="display:flex"><p>Hello &amp;<b>world</b></p><img alt="alt" src="x.png"></div>',
    );
  });

  it("empty / null / undefined → empty string", () => {
    expect(canonicalizeHtml("")).toBe("");
    expect(canonicalizeHtml(null)).toBe("");
    expect(canonicalizeHtml(undefined)).toBe("");
    // Whitespace-only input carries no content either.
    expect(canonicalizeHtml(" \n ")).toBe("");
  });
});
|
|
|
|
// Sensitivity gate: any real content change must change the canonical string.
// Every mutant is DERIVED from BASE by swapping a single fragment, so it
// differs from BASE in exactly the aspect under test no matter how BASE itself
// is spelled. Each test first checks that the swap actually changed the input;
// without that guard a stale fragment would make the test compare BASE to
// itself and fail for a misleading reason.
describe("canonicalizeHtml — content sensitivity (real changes move the hash)", () => {
  it("a one-character text change yields a different canonical form", () => {
    const changed = BASE.replace("<b>world</b>", "<b>World</b>");
    expect(changed).not.toBe(BASE);
    expect(canonicalizeHtml(changed)).not.toBe(canonicalizeHtml(BASE));
  });

  it("an attribute VALUE change yields a different canonical form", () => {
    const changed = BASE.replace('src="x.png"', 'src="y.png"');
    expect(changed).not.toBe(BASE);
    expect(canonicalizeHtml(changed)).not.toBe(canonicalizeHtml(BASE));
  });

  it("a structural change (element removed) yields a different canonical form", () => {
    // Unwrap the <b>: same text, one element fewer.
    const changed = BASE.replace("<b>world</b>", "world");
    expect(changed).not.toBe(BASE);
    expect(canonicalizeHtml(changed)).not.toBe(canonicalizeHtml(BASE));
  });

  it("an added element yields a different canonical form", () => {
    // Append an <hr> as the container's last child.
    const changed = BASE.replace("</div>", "<hr></div>");
    expect(changed).not.toBe(BASE);
    expect(canonicalizeHtml(changed)).not.toBe(canonicalizeHtml(BASE));
  });

  it("a style/class change (layout-bearing attribute) yields a different canonical form", () => {
    // The two-column flex style IS content for the hash (a Foundry layout change
    // is a real change). Attribute-value sensitivity covers it.
    const changed = BASE.replace("display:flex", "display:block");
    expect(changed).not.toBe(BASE);
    expect(canonicalizeHtml(changed)).not.toBe(canonicalizeHtml(BASE));
  });
});