feat(E1b-alt): re-baseline ccHash to canonicalize-HTML contract

E1a proved the markdown round-trip unstable (NO-GO). This re-baselines the E0.2 ccHash contract to hash Foundry HTML directly — the E1b-alt fork — which sidesteps all 5 E1a failure reasons (no inverse, no resolver, no blank-line/case/order sensitivity, no parseBody coupling).

- src/canonicalize-html.ts: canonicalizeHtml(html) — linkedom DOM walk that absorbs serialization drift (attribute order/quoting, named-vs-numeric entities, inter-tag whitespace, tag case, self-closing) while preserving content (structure, attr values, meaningful text). Two inputs parsing to the same DOM → same canonical string. Mini-gate: tests/canonicalize-html.test.ts (9 tests — serialization variants → same canonical; content change → different).
- src/cchash.ts: rewritten to ccHash = contentHash(canonicalizeHtml(data.description) + "\n" + canonicalizeHtml(data.notes ?? "") + "\n" + name + "\n" + folder). The HtmlToMarkdown seam is DROPPED; a CanonicalizeHtml seam (default = canonicalizeHtml) replaces it. CC_HASH_CONTRACT updated + pinned + re-derivation-enforced. CcHashError on missing description kept; direction-invariance kept (name/folder from liveEntry); folder = Foundry folder ID, distinct from Obsidian foundry.folder_path. tests/cchash.test.ts updated (21 tests incl. serialization-drift-absorption + no-false-negative).
- src/fromFoundry.ts (the E1a markdown inverse) ships unwired — not consumed by ccHash; remains as the spike artifact's inverse.

tsc clean; 67 E0+E1a+E1b-alt tests pass; 112 passing project-wide (18 pre-existing fixture-missing failures unchanged).

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-22 22:35:09 +00:00
parent d404929a84
commit 5d96bf1267
4 changed files with 324 additions and 192 deletions
--- a/src/canonicalize-html.ts
+++ b/src/canonicalize-html.ts
@@ -0,0 +1,99 @@
+// E1b-alt — canonicalizeHtml: the Foundry-HTML canonicalizer for the HTML-hash
+// ccHash contract (the NO-GO fork of E1a).
+//
+// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
+// secrets order/case, parseBody bold bug). E1b-alt hashes the Foundry HTML
+// directly: ccHash = contentHash(canonicalizeHtml(data.description) + "\n" +
+// canonicalizeHtml(data.notes) + "\n" + name + "\n" + folder). Both the
+// baseline (foundry.ccHash, stored at push time) and the live ccHash hash the
+// SAME HTML from the live entry → comparable by construction; a Foundry-side
+// content change → different DOM → different canonical HTML → different hash.
+//
+// canonicalizeHtml absorbs incidental serialization drift so the hash is stable
+// across relay /get calls for an UNCHANGED entry, while still moving when
+// content changes. Drift defended against (Foundry's editor re-serializing on a
+// null edit, or the relay normalizing on store/retrieve):
+//   - attribute ORDER  → sorted by name
+//   - attribute QUOTING → double quotes, consistently escaped
+//   - tag CASE         → lowercased
+//   - HTML ENTITIES    → linkedom decodes on parse; we re-encode & < > " consistently
+//   - VOID/self-closing → canonical `<tag …>` (no slash, no closing)
+//   - whitespace-ONLY text nodes (indentation, newlines; block OR inline context) → dropped
+//   - intra-text WHITESPACE runs → collapsed to a single space (matches HTML rendering)
+//
+// What it preserves (so real content changes move the hash):
+//   - tag STRUCTURE (nesting, element types)
+//   - attribute NAMES and VALUES (sorted but content-bearing)
+//   - meaningful TEXT (every text node that is not whitespace-only is kept,
+//     with internal whitespace runs collapsed to single spaces)
+//
+// Whitespace handling: whitespace-only text nodes (inter-tag indentation,
+// blank lines the serializer may add or drop) are DROPPED; meaningful text
+// nodes have internal whitespace runs collapsed to a single space (matches HTML
+// rendering). The parser may split text at entity boundaries, so a space next
+// to an entity can become its own whitespace-only node and be dropped too. That
+// stays stable because the forward transform (`markdownToHtml` + `escapeHtml`)
+// always emits entities (`&amp;`, never bare `&`) and the relay returns Foundry's
+// stored HTML verbatim; linkedom decodes `&amp;` and `&#38;` to the same text.
+//
+// Trade-off (fail-safe direction): a render-invisible reformat can still move
+// the canonical form → a false "Foundry changed" signal. That is SAFE (the guard
+// skips a push / surfaces a conflict rather than clobbering). The dangerous
+// direction (false negative) is limited to whitespace-only edits: adding or
+// removing a space between inline elements, or changing whitespace runs (even
+// inside <pre>), leaves the canonical form unchanged. Other text edits move it.
+
+import { parseHTML } from "linkedom";
+
+const ELEMENT_NODE = 1;
+const TEXT_NODE = 3;
+
+// Void elements (HTML spec): serializeElement emits these as `<tag …>` only — no children, no closing tag.
+const VOID = new Set([
+  "area", "base", "br", "col", "embed", "hr", "img", "input",
+  "link", "meta", "param", "source", "track", "wbr",
+]);
+
+function escapeText(s: string): string { // text-node escaping: & < >
+  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;"); // & goes first so the entities added afterwards are not re-escaped
+}
+function escapeAttr(s: string): string { // attribute-value escaping: & " < > (values are always emitted double-quoted)
+  return s.replace(/&/g, "&amp;").replace(/"/g, "&quot;").replace(/</g, "&lt;").replace(/>/g, "&gt;"); // & goes first so the entities added afterwards are not re-escaped
+}
+
+function serializeElement(el: any): string { // canonical string for one element and its subtree
+  const tag = el.tagName.toLowerCase();
+  const attrs = (Array.from(el.attributes) as any[])
+    .map((a) => `${a.name.toLowerCase()}="${escapeAttr(a.value ?? "")}"`)
+    .sort(); // default string sort of the name="value" pairs → source attribute order is irrelevant
+  const attrStr = attrs.length ? ` ${attrs.join(" ")}` : "";
+  if (VOID.has(tag)) return `<${tag}${attrStr}>`; // void: children (if any) are ignored, no closing tag
+  const children = (Array.from(el.childNodes) as any[]).map(serializeNode).join("");
+  return `<${tag}${attrStr}>${children}</${tag}>`;
+}
+
+function serializeNode(node: any): string { // text → escaped text, element → serializeElement, anything else → ""
+  if (node.nodeType === TEXT_NODE) {
+    const t = node.textContent ?? "";
+    // Drop whitespace-only text nodes (indentation, but also a lone space between
+    // inline elements). Other text is whitespace-collapsed, then escaped.
+    if (/^\s*$/.test(t)) return "";
+    return escapeText(t.replace(/\s+/g, " "));
+  }
+  if (node.nodeType === ELEMENT_NODE) return serializeElement(node);
+  return ""; // comments, processing instructions — not content
+}
+
+/**
+ * Canonicalize an HTML fragment into a deterministic string. Two inputs that
+ * parse to the same DOM tree (modulo the drift sources above) produce the same
+ * canonical string; non-whitespace content changes produce a different one.
+ * Empty/null/undefined/whitespace-only → "". Compact canonical HTML for hashing.
+ */
+export function canonicalizeHtml(html: string | null | undefined): string {
+  if (!html || !html.trim()) return "";
+  const { document } = parseHTML(`<div>${html}</div>`); // wrap so the fragment hangs off one known root
+  const root = document.querySelector("div"); // first <div> in document order — presumably the wrapper
+  if (!root) return "";
+  return (Array.from(root.childNodes) as any[]).map(serializeNode).join(""); // the wrapper itself is not emitted
+}
--- a/src/cchash.ts
+++ b/src/cchash.ts
@@ -1,92 +1,61 @@
-// E0.2 — ccHash compute wrapper with a frozen input contract.
+// E1b-alt — ccHash compute wrapper with the HTML-hash contract (the E1a NO-GO
+// fork).
 //
-// ccHash is a Foundry-side-identity hash: given a relay `/get` response (the
-// full JournalEntry), derive a hash comparable to the Obsidian-side
-// `foundry.ccHash` baseline so E1b's divergence guard (O→F) and E2's deep-pull
-// compare (F→O) can detect "Foundry's stored content actually changed" without
-// each re-deriving the hash input contract and without an extra `/get`.
+// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
+// secrets order/case, parseBody bold bug — see
+// docs/prds/prd-foundry-obsidian-sync-2026-06-22/e1a-spike-findings.md). E1b-alt
+// hashes the Foundry HTML directly instead of round-tripping through markdown.
+// Both the baseline (`foundry.ccHash`, stored at push time) and the live ccHash
+// hash the SAME HTML from the live entry → comparable by construction; a
+// Foundry-side content change → different DOM → different canonical HTML →
+// different hash. No inverse, no resolver, no blank-line/case/order sensitivity.
 //
-// === CONTRACT CORRECTION (grounded in the real code, 2026-06-22) ===
-// The epics' prose said the curated body HTML lives at
-// `flags["campaign-codex"].data` (a string). The real shape (src/types.ts
-// `CcData`, src/toFoundry.ts:185-190 `buildFoundryJson`) is that `data` is an
-// OBJECT whose body content spans TWO HTML fields:
-//   - `data.description` — the two-column body (left = tagline/preface/
-//     sections as HTML; right = sidebar boxes from frontmatter).
-//   - `data.notes` — the `## Secrets` section body HTML ("" when absent).
-// The Obsidian-side `contentHash(body)` (src/server.ts `baselineNote`) hashes
-// the full refined body (tagline + preface + sections + `## Secrets`). For
-// ccHash to be comparable it MUST capture both fields — hashing only
-// `description` would make a Foundry-side edit to `## Secrets` invisible to the
-// divergence guard, a real clobber hole.
+// CONTRACT (frozen):
+//   ccHash = contentHash(
+//     canonicalizeHtml(data.description) + "\n" +
+//     canonicalizeHtml(data.notes ?? "") + "\n" +
+//     name + "\n" + folder
+//   )
+// where `data = flags["campaign-codex"].data` (a CcData object — the body spans
+// `data.description`, the two-column body HTML, and `data.notes`, the ## Secrets
+// body HTML), `name = liveEntry.name`, `folder = liveEntry.folder ?? ""`.
 //
-// === THREE THINGS THE CONTRACT ASSUMES (E1a MUST VALIDATE) ===
-// The forward transform (src/toFoundry.ts:153-179) does three things the frozen
-// hash contract has to reverse, and each is a potential GO/NO-GO lever for the
-// E1a spike. They are documented here so E1a knows what it must hold:
-//   1. SIDEBAR EXCLUSION. `data.description`'s RIGHT column is sidebar
-//      (race/faction/region) sourced from FRONTMATTER, not the body. The
-//      Obsidian `contentHash(body)` excludes frontmatter. So the inverse of
-//      `data.description` must return ONLY the left-column body markdown
-//      (sidebar dropped). If E1a's `htmlToMarkdown` returns the full
-//      description incl. sidebar, ccHash ≠ contentHash(body) → NO-GO.
-//   2. `## Secrets` RE-INSERTION. The forward transform STRIPS the `## Secrets`
-//      heading (src/toFoundry.ts:160 skips it from `description`) and stores
-//      only `secrets.body` in `data.notes` (line 179). ccHash therefore
-//      re-inserts `## Secrets\n\n` before `inverse(data.notes)` (when notes is
-//      non-empty). This assumes the project convention is EXACTLY `## Secrets`
-//      (case-sensitive after `canonicalize`, which does not normalize case).
-//      If the vault uses `## SECRETS` or `## secrets`, the round-trip breaks
-//      → NO-GO → E1b-alt (canonicalize Foundry HTML directly).
-//   3. SECTION ORDER. The forward transform MOVES `## Secrets` to `data.notes`
-//      and concatenates the remaining sections in order. ccHash rejoins them
-//      as `inverse(description) + "\n\n## Secrets\n\n" + inverse(notes)` —
-//      i.e. it assumes `## Secrets` is the LAST section. If a note has sections
-//      AFTER `## Secrets`, the reconstruction reorders them vs. the raw body
-//      → NO-GO. (Project convention: `## Secrets` is last.)
-// These three assumptions are the spike's job to confirm. E0.2 freezes the
-// contract that encodes them; E1a's round-trip suite proves or refutes them.
+// `canonicalizeHtml` (src/canonicalize-html.ts) absorbs incidental serialization
+// drift (attribute order/quoting, entities, inter-tag whitespace, tag case,
+// self-closing) so the hash is stable across relay /get calls for an unchanged
+// entry. The final `contentHash` canonicalizes the whole string (wikilinks +
+// whitespace), so `name`/`folder` whitespace drift is normalized too.
 //
-// === DIRECTION-INVARIANCE ===
-// `name` and `folder` are ALWAYS sourced from the JournalEntry
-// (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian filename or
-// vault-relative folder. A vault rename changes the filename but NOT
+// DIRECTION-INVARIANCE: `name` and `folder` are ALWAYS sourced from the
+// JournalEntry (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian
+// filename or vault-relative folder. A vault rename changes the filename but NOT
 // `foundry.ccHash` until a push updates the live entry's `name` — correct,
 // because a rename is a name-field update routed through `pushNote`'s
 // `updatedName` path, not a content divergence (see E3.5).
 //
-// NOTE on `folder` naming: the contract uses `folder` = `liveEntry.folder`, a
-// Foundry FOLDER ID (e.g. `Folder.gideon`). This is DISTINCT from the Obsidian
-// `foundry.folder_path` field (a cc-type-derived path via
-// `folderPathFromCcType`). Do not conflate them — the hash uses the Foundry
-// folder ID, not the Obsidian path. Both ccHash sides use `liveEntry.folder`,
-// so direction-invariance holds; a Foundry folder MOVE changes `liveEntry.folder`
-// → ccHash changes → detected as F-changed (correct).
+// `folder` is `liveEntry.folder`, a Foundry FOLDER ID (e.g. `Folder.gideon`),
+// DISTINCT from the Obsidian `foundry.folder_path` field (a cc-type-derived
+// path via `folderPathFromCcType`). Do not conflate them — both ccHash sides use
+// `liveEntry.folder`, so direction-invariance holds; a Foundry folder MOVE
+// changes `liveEntry.folder` → ccHash changes → detected as F-changed (correct).
 //
-// This module does NOT depend on E1a's real `htmlToMarkdown` (a stub inverse is
-// fine for tests), does NOT depend on E1b's `flagsSchemaVersion` migration, and
-// does NOT wire itself into `AutoSyncController.process` or
-// `baselineFoundryBlock` — that wiring is E1b's job. E0.2 only delivers the
-// frozen primitive + tests.
+// This module does NOT wire itself into `AutoSyncController.process` or
+// `baselineFoundryBlock` — that wiring is E1b's job. It does NOT depend on
+// `src/fromFoundry.ts` (the E1a markdown inverse, shipped unwired). E1b-alt only
+// delivers the frozen primitive + the canonicalizeHtml seam + tests.

 import type { JournalEntry, CcData } from "./types.js";
 import type { RelayClient } from "./relay/client.js";
-import { contentHash, canonicalize } from "./normalize.js";
+import { contentHash } from "./normalize.js";
+import { canonicalizeHtml } from "./canonicalize-html.js";

 /**
- * The inverse transform seam: Foundry HTML → refined markdown. Typed as an
- * EXPLICIT parameter (not a module-level import) so E0.2 ships with a tested
- * stub inverse and E1a swaps in the real linkedom-based `htmlToMarkdown`
- * (src/fromFoundry.ts, per E1a.1) without touching `ccHash`. This is the
- * contract boundary, frozen on landing.
- *
- * Applied to `data.description` AND `data.notes` separately. Per the contract
- * assumptions above, `inverse(data.description)` must return ONLY the
- * left-column body markdown (sidebar excluded); `inverse(data.notes)` returns
- * the secrets BODY markdown (the `## Secrets` heading is re-inserted by ccHash,
- * not by the inverse, so the same generic html→md function works for both).
+ * The canonicalizer seam: Foundry HTML → canonical HTML string. Typed as an
+ * EXPLICIT parameter (default `canonicalizeHtml` from src/canonicalize-html.ts)
+ * so the contract boundary is frozen and testable. E1b wires the default; tests
+ * may inject a stub for unit isolation.
 */
-export type HtmlToMarkdown = (html: string) => string;
+export type CanonicalizeHtml = (html: string) => string;

 /**
 * The frozen hash input contract, as a canonical string template. Pinned by a
@@ -94,15 +63,9 @@ export type HtmlToMarkdown = (html: string) => string;
 * asserted to compute exactly this) so any drift — to the constant OR to the
 * implementation — is a deliberate, reviewable change. This is the frozen
 * contract E1b and E2 code against.
- *
- * `inverse` is the `HtmlToMarkdown` seam; `data` is `flags["campaign-codex"].data`;
- * `name` is `liveEntry.name`; `folder` is `liveEntry.folder ?? ""`. `canonicalize`
- * (wikilinks + whitespace) is applied to the reconstructed body; the final
- * `contentHash` canonicalizes the WHOLE string (body + name + folder), so
- * `name`/`folder` whitespace drift from relay serialization is normalized too.
 */
 export const CC_HASH_CONTRACT =
-  'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)';
+  'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)';

 /** Typed error so E1b's divergence guard can distinguish "no Foundry-side
 *  content yet" (treat as fresh/seed) from "content changed" / relay errors. */
@@ -122,10 +85,8 @@ export function isCcHashError(e: unknown): e is CcHashError {
 /** Extract and validate `flags["campaign-codex"].data.description`. Throws a
 *  typed CcHashError when the flag, its data, OR its `description` field is
 *  absent/non-string — `description` is the required body field, and silently
- *  coercing a malformed entry to "" would create a stable-but-wrong baseline
- *  (the strictness the typed error exists to provide). `notes` is optional and
- *  defaults to "" at the call site. */
-function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; description: string } {
+ *  coercing a malformed entry to "" would create a stable-but-wrong baseline. */
+function extractCampaignCodexData(entry: JournalEntry): { data: CcData; description: string } {
  const cc = entry.flags?.["campaign-codex"];
  if (!cc || !cc.data) {
    throw new CcHashError('missing campaign-codex data');
@@ -137,25 +98,21 @@ function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; d
 }

 /**
- * Compute the Foundry-side ccHash for a live `/get` entry, given an
- * `HtmlToMarkdown` inverse. See `CC_HASH_CONTRACT` for the frozen input and the
- * three assumptions (sidebar exclusion, `## Secrets` re-insertion, section
- * order) E1a must validate.
+ * Compute the Foundry-side ccHash for a live `/get` entry. See `CC_HASH_CONTRACT`
+ * for the frozen input. `canonicalize` defaults to the built-in
+ * `canonicalizeHtml` (src/canonicalize-html.ts); pass a stub for unit isolation.
 *
 * Throws `CcHashError` when `flags["campaign-codex"].data` (or its
 * `description`) is absent — so callers can distinguish "no Foundry-side
 * content yet" from a real content change. Relay connectivity failures are NOT
 * wrapped here (see `ccHashFromGet`).
 */
-export function ccHash(liveEntry: JournalEntry, inverse: HtmlToMarkdown): string {
-  const { data, description } = extractCampaignCodexDescription(liveEntry);
+export function ccHash(liveEntry: JournalEntry, canonicalize: CanonicalizeHtml = canonicalizeHtml): string {
+  const { data, description } = extractCampaignCodexData(liveEntry);
  const notes = typeof data.notes === "string" ? data.notes : "";
-  const bodyMd = notes
-    ? `${inverse(description)}\n\n## Secrets\n\n${inverse(notes)}`
-    : inverse(description);
  const name = liveEntry.name ?? "";
  const folder = liveEntry.folder ?? "";
-  const text = `${canonicalize(bodyMd)}\n${name}\n${folder}`;
+  const text = `${canonicalize(description)}\n${canonicalize(notes)}\n${name}\n${folder}`;
  return contentHash(text);
 }

@@ -173,23 +130,21 @@ export interface CcHashFromGetResult {
 *
 * Callers that ALREADY have the entry (notably `pushNote`, which fetches via
 * `relay.getEntry` at src/push.ts:142) must NOT use this helper — that would
- * make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule.
- * They should call `ccHash(entry, inverse)` directly on the entry they already
- * hold. (This helper is for the fetch-and-hash case; `ccHash` is the reuse
- * case.)
+ * make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule. They
+ * should call `ccHash(entry)` directly on the entry they already hold.
 *
- * Relay connectivity failures (the relay client's domain — `404 "Invalid client
- * ID"`, `404 "No connected Foundry clients found"`, timeouts, network errors)
- * are surfaced UNCHANGED: this helper does NOT wrap them as `CcHashError`. Only
- * a present-but-malformed entry (missing `flags["campaign-codex"].data` or its
- * `description`) throws `CcHashError`, after the relay call has succeeded.
+ * Relay connectivity failures (`404 "Invalid client ID"`, `404 "No connected
+ * Foundry clients found"`, timeouts, network errors) are surfaced UNCHANGED:
+ * this helper does NOT wrap them as `CcHashError`. Only a present-but-malformed
+ * entry (missing `flags["campaign-codex"].data` or its `description`) throws
+ * `CcHashError`, after the relay call has succeeded.
 */
 export async function ccHashFromGet(
  relay: RelayClient,
  uuid: string,
-  inverse: HtmlToMarkdown,
+  canonicalize: CanonicalizeHtml = canonicalizeHtml,
 ): Promise<CcHashFromGetResult> {
  const entry = await relay.getEntry(uuid); // throws relay errors unchanged
-  const hash = ccHash(entry, inverse); // throws CcHashError on malformed entry
+  const hash = ccHash(entry, canonicalize); // throws CcHashError on malformed entry
  return { hash, entry };
 }
--- a/tests/canonicalize-html.test.ts
+++ b/tests/canonicalize-html.test.ts
@@ -0,0 +1,95 @@
+import { describe, it, expect } from "vitest";
+import { canonicalizeHtml } from "../src/canonicalize-html.js";
+
+// Base HTML carrying the features the canonicalizer must normalize: a styled
+// container, a paragraph with a proper entity (the forward's escapeHtml emits
+// &amp;, and Foundry stores/returns it verbatim) and an inline child, plus a
+// void element with two attributes.
+const BASE = '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>';
+
+// Variants that differ ONLY in serialization (parse to the same DOM) — each
+// must canonicalize to the SAME string as BASE. Drifts defended: attribute
+// order, quoting, named-vs-numeric entity, inter-tag whitespace, self-closing
+// slash, tag/attr case.
+const VARIANTS = [
+  // attribute order swapped on <img>
+  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img alt="alt" src="x.png"></div>',
+  // single-quoted attributes
+  "<div style='display:flex'><p>Hello &amp; <b>world</b></p><img src='x.png' alt='alt'></div>",
+  // numeric entity &#38; instead of named &amp; (both decode to &)
+  '<div style="display:flex"><p>Hello &#38; <b>world</b></p><img src="x.png" alt="alt"></div>',
+  // inter-tag whitespace / newlines (indentation the serializer may add or drop)
+  '<div style="display:flex">\n  <p>Hello &amp; <b>world</b></p>\n  <img src="x.png" alt="alt">\n</div>',
+  // self-closing slash on the void <img>
+  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt" /></div>',
+  // uppercase tags + attributes
+  '<DIV STYLE="display:flex"><P>Hello &amp; <B>world</B></P><IMG SRC="x.png" ALT="alt"></DIV>',
+];
+
+describe("canonicalizeHtml — serialization-drift stability (E1b-alt mini-gate)", () => {
+  it("is deterministic: same input → same canonical across runs", () => {
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml(BASE);
+    expect(a).toBe(b);
+    expect(a).toMatch(/^<div/);
+  });
+
+  it("all serialization variants canonicalize to the SAME string (drift absorbed)", () => {
+    const baseCanon = canonicalizeHtml(BASE);
+    for (const [i, v] of VARIANTS.entries()) {
+      expect(canonicalizeHtml(v), `variant ${i}: ${v}`).toBe(baseCanon);
+    }
+  });
+
+  it("the canonical form is the compact, normalized shape", () => {
+    // Sorted attrs (alt before src), double-quoted, lowercased, void <img> with
+    // no closing slash, no inter-tag whitespace. The entity &amp; decodes to &
+    // and re-encodes to &amp;. NOTE: the space between the entity and <b> parses
+    // as its own whitespace-only node and is dropped — lossy, but consistent for
+    // every entity-encoded variant, so the hash is stable.
+    expect(canonicalizeHtml(BASE)).toBe(
+      '<div style="display:flex"><p>Hello &amp;<b>world</b></p><img alt="alt" src="x.png"></div>',
+    );
+  });
+
+  it("empty / null / undefined → empty string", () => {
+    expect(canonicalizeHtml("")).toBe("");
+    expect(canonicalizeHtml(null)).toBe("");
+    expect(canonicalizeHtml(undefined)).toBe("");
+    expect(canonicalizeHtml("   \n  ")).toBe("");
+  });
+});
+
+describe("canonicalizeHtml — content sensitivity (real changes move the hash)", () => {
+  it("a one-character text change yields a different canonical form", () => {
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>World</b></p><img src="x.png" alt="alt"></div>');
+    expect(a).not.toBe(b);
+  });
+
+  it("an attribute VALUE change yields a different canonical form", () => {
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="y.png" alt="alt"></div>');
+    expect(a).not.toBe(b);
+  });
+
+  it("a structural change (element removed) yields a different canonical form", () => {
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; world</p><img src="x.png" alt="alt"></div>');
+    expect(a).not.toBe(b);
+  });
+
+  it("an added element yields a different canonical form", () => {
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"><hr></div>');
+    expect(a).not.toBe(b);
+  });
+
+  it("a style/class change (layout-bearing attribute) yields a different canonical form", () => {
+    // The two-column flex style IS content for the hash (a Foundry layout change
+    // is a real change). Attribute-value sensitivity covers it.
+    const a = canonicalizeHtml(BASE);
+    const b = canonicalizeHtml('<div style="display:block"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>');
+    expect(a).not.toBe(b);
+  });
+});
--- a/tests/cchash.test.ts
+++ b/tests/cchash.test.ts
@@ -5,22 +5,18 @@ import {
  CC_HASH_CONTRACT,
  CcHashError,
  isCcHashError,
-  type HtmlToMarkdown,
 } from "../src/cchash.js";
-import { contentHash, canonicalize } from "../src/normalize.js";
+import { canonicalizeHtml } from "../src/canonicalize-html.js";
+import { contentHash } from "../src/normalize.js";
 import type { JournalEntry, CcData } from "../src/types.js";
 import type { RelayClient } from "../src/relay/client.js";

-// Tested stub inverse: tag-stripping regex. E1a swaps in the real linkedom
-// htmlToMarkdown via the seam; ccHash itself is unchanged.
-const stubInverse: HtmlToMarkdown = (html: string) => html.replace(/<[^>]+>/g, "");
-
 interface EntryOpts {
  name?: string;
  folder?: string | null;
  description?: string;
  notes?: string;
-  data?: CcData; // exact override (for the missing-data tests)
+  data?: CcData; // exact override (for the missing-field tests)
  noFlag?: boolean;
  noData?: boolean;
 }
@@ -41,152 +37,139 @@ function entry(opts: EntryOpts = {}): JournalEntry {
  };
 }

-describe("ccHash contract + determinism (E0.2)", () => {
+describe("ccHash contract + determinism (E1b-alt)", () => {
  it("CC_HASH_CONTRACT pins the exact bytes of the frozen input contract", () => {
    expect(CC_HASH_CONTRACT).toBe(
-      'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)',
+      'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)',
    );
  });

  it("implementation matches the frozen contract (re-derivation enforces it)", () => {
-    // Re-derive the hash from the contract steps and assert the implementation
-    // agrees — so drift between CC_HASH_CONTRACT and ccHash is caught, not just
-    // drift in the constant's own bytes.
    const e = entry({ notes: "<p>He killed the boy.</p>" });
    const data = e.flags!["campaign-codex"]!.data!;
-    const bodyMd = data.notes
-      ? `${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes)}`
-      : stubInverse(data.description!);
-    const expected = contentHash(`${canonicalize(bodyMd)}\n${e.name}\n${e.folder ?? ""}`);
-    expect(ccHash(e, stubInverse)).toBe(expected);
-  });
-
-  it("the ## Secrets heading is part of the hash input (re-inserted, not just the notes body)", () => {
-    // The forward transform strips the ## Secrets heading when storing
-    // data.notes; ccHash must re-insert it. Prove the heading is in the input:
-    // with-heading vs without-heading recomputes differ, and ccHash matches
-    // the with-heading one.
-    const e = entry({ notes: "<p>He killed the boy.</p>" });
-    const data = e.flags!["campaign-codex"]!.data!;
-    const withHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
-    const withoutHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
-    expect(withHeading).not.toBe(withoutHeading);
-    expect(ccHash(e, stubInverse)).toBe(withHeading);
+    const expected = contentHash(
+      `${canonicalizeHtml(data.description!)}\n${canonicalizeHtml(data.notes!)}\n${e.name}\n${e.folder ?? ""}`,
+    );
+    expect(ccHash(e)).toBe(expected);
  });

  it("is deterministic: same payload → same hash across runs", () => {
-    const a = ccHash(entry(), stubInverse);
-    const b = ccHash(entry(), stubInverse);
+    const a = ccHash(entry());
+    const b = ccHash(entry());
    expect(a).toBe(b);
    expect(a).toMatch(/^[0-9a-f]{64}$/); // sha256 hex
  });

  it("is sensitive: a one-char change to data.description yields a different hash", () => {
-    const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }), stubInverse);
-    const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }), stubInverse);
+    const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }));
+    const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }));
    expect(a).not.toBe(b);
  });

  it("is sensitive: a change to data.notes (## Secrets) yields a different hash", () => {
    // A Foundry-side edit to secrets MUST move ccHash, or the divergence guard
-    // would miss secrets-only edits (the clobber hole the contract correction closes).
-    const a = ccHash(entry({ notes: "" }), stubInverse);
-    const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }), stubInverse);
+    // would miss secrets-only edits (the clobber hole the contract closes).
+    const a = ccHash(entry({ notes: "" }));
+    const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }));
    expect(a).not.toBe(b);
  });

  it("name changing alone yields a different hash (part of the hash input)", () => {
-    const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
-    const b = ccHash(entry({ name: "Roland Deschain of Gilead" }), stubInverse);
+    const a = ccHash(entry({ name: "Roland Deschain" }));
+    const b = ccHash(entry({ name: "Roland Deschain of Gilead" }));
    expect(a).not.toBe(b);
  });

-  it("folder changing alone yields a different hash (part of the hash input — Foundry folder ID)", () => {
-    const a = ccHash(entry({ folder: "Folder.gideon" }), stubInverse);
-    const b = ccHash(entry({ folder: "Folder.gilead" }), stubInverse);
+  it("folder changing alone yields a different hash (Foundry folder ID)", () => {
+    const a = ccHash(entry({ folder: "Folder.gideon" }));
+    const b = ccHash(entry({ folder: "Folder.gilead" }));
    expect(a).not.toBe(b);
  });

  it("absent folder is treated as empty string (matches Obsidian-side absence)", () => {
-    const withEmpty = ccHash(entry({ folder: "" }), stubInverse);
-    const absentFolder = ccHash(entry({ folder: null }), stubInverse);
+    const withEmpty = ccHash(entry({ folder: "" }));
+    const absentFolder = ccHash(entry({ folder: null }));
    expect(withEmpty).toBe(absentFolder);
  });

  it("trailing whitespace in name/folder is normalized (canonicalize via contentHash)", () => {
    // name/folder are concatenated raw but the final contentHash canonicalizes
    // the whole string, so relay serialization whitespace drift does not flap ccHash.
-    const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
-    const b = ccHash(entry({ name: "Roland Deschain   " }), stubInverse); // trailing spaces
+    const a = ccHash(entry({ name: "Roland Deschain" }));
+    const b = ccHash(entry({ name: "Roland Deschain   " })); // trailing spaces
    expect(a).toBe(b);
  });
 });

-describe("ccHash direction-invariance (E0.2)", () => {
+describe("ccHash absorbs HTML serialization drift (the E1b-alt property)", () => {
+  it("two descriptions that differ only in serialization → same ccHash", () => {
+    // Same DOM, different serialization (attribute order + inter-tag whitespace
+    // + self-closing slash + tag case). canonicalizeHtml absorbs it.
+    const a = ccHash(entry({ description: '<p>Hello <b>world</b></p><img src="x.png" alt="alt">' }));
+    const b = ccHash(entry({ description: '<P>Hello <B>world</B></P>\n  <IMG alt="alt" src="x.png" />' }));
+    expect(a).toBe(b);
+  });
+
+  it("two notes that differ only in serialization → same ccHash", () => {
+    // Pure serialization drift (tag case + named-vs-numeric entity), NO text
+    // change. &amp; and &#38; both decode to "&", and tag case is lowercased → same canonical.
+    const a = ccHash(entry({ notes: "<p>Secret &amp; one.</p>" }));
+    const b = ccHash(entry({ notes: "<P>Secret &#38; one.</P>" }));
+    expect(a).toBe(b);
+  });
+
+  it("a real content change in the description → different ccHash (no false negative)", () => {
+    const a = ccHash(entry({ description: "<p>Hello world.</p>" }));
+    const b = ccHash(entry({ description: "<p>Hello World.</p>" })); // capital W
+    expect(a).not.toBe(b);
+  });
+});
+
+describe("ccHash direction-invariance (E1b-alt)", () => {
  it("same Foundry data+name+folder → same hash regardless of caller (E1b push vs E2 pull)", () => {
-    // E1b's push path and E2's pull path both compute the same value for the
-    // same Foundry entry: the hash is a function of the Foundry entry only.
    const e = entry();
-    const fromPush = ccHash(e, stubInverse);
-    const fromPull = ccHash(e, stubInverse);
-    expect(fromPush).toBe(fromPull);
+    expect(ccHash(e)).toBe(ccHash(e)); // hash is a function of the Foundry entry only
  });

  it("renaming the vault file (without changing the live entry) leaves ccHash unchanged", () => {
-    // The vault filename never enters the hash. A rename is a name-field
-    // update routed through pushNote's updatedName path, not a content
-    // divergence — so the stored foundry.ccHash is unaffected until a push
-    // updates liveEntry.name.
+    // The vault filename never enters the hash. A rename is a name-field update
+    // routed through pushNote's updatedName path, not a content divergence — so
+    // the stored foundry.ccHash is unaffected until a push updates liveEntry.name.
    const e = entry();
-    const beforeRename = ccHash(e, stubInverse);
-    const afterVaultRename = ccHash(e, stubInverse); // liveEntry unchanged
-    expect(beforeRename).toBe(afterVaultRename);
+    expect(ccHash(e)).toBe(ccHash(e)); // liveEntry unchanged
  });

  it("a live entry name change (a real push) DOES change ccHash", () => {
-    // Contrast: when the push updates liveEntry.name, ccHash moves — pinning
-    // that name is sourced from the entry, not the vault filename.
-    const before = ccHash(entry({ name: "Roland" }), stubInverse);
-    const after = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
+    const before = ccHash(entry({ name: "Roland" }));
+    const after = ccHash(entry({ name: "Roland Deschain" }));
    expect(before).not.toBe(after);
  });
 });

-describe("ccHash error handling (E0.2)", () => {
+describe("ccHash error handling (E1b-alt)", () => {
  it("throws CcHashError when flags.campaign-codex is absent", () => {
-    try {
-      ccHash(entry({ noFlag: true }), stubInverse);
-      throw new Error("should have thrown");
-    } catch (err) {
-      expect(isCcHashError(err)).toBe(true);
-      expect((err as CcHashError).message).toBe("missing campaign-codex data");
-    }
+    expect(() => ccHash(entry({ noFlag: true }))).toThrow(CcHashError);
+    expect(() => ccHash(entry({ noFlag: true }))).toThrow(/missing campaign-codex data/);
  });

  it("throws CcHashError when flags.campaign-codex.data is absent", () => {
-    try {
-      ccHash(entry({ noData: true }), stubInverse);
-      throw new Error("should have thrown");
-    } catch (err) {
-      expect(isCcHashError(err)).toBe(true);
-      expect((err as CcHashError).message).toBe("missing campaign-codex data");
-    }
+    expect(() => ccHash(entry({ noData: true }))).toThrow(CcHashError);
+    expect(() => ccHash(entry({ noData: true }))).toThrow(/missing campaign-codex data/);
  });

  it("throws CcHashError when data.description is absent/non-string (NOT coerced to empty)", () => {
    // A present-but-description-less entry must not silently hash "" — that
-    // would create a stable-but-wrong baseline, defeating the typed error's
-    // "no Foundry-side content yet" vs "content changed" distinction.
+    // would create a stable-but-wrong baseline.
    const e = entry({ data: { notes: "<p>orphan notes</p>" } as CcData });
-    expect(() => ccHash(e, stubInverse)).toThrow(CcHashError);
-    expect(() => ccHash(e, stubInverse)).toThrow(/description/);
+    expect(() => ccHash(e)).toThrow(CcHashError);
+    expect(() => ccHash(e)).toThrow(/description/);
  });

  it("ccHashFromGet surfaces relay errors unchanged (not wrapped as CcHashError)", async () => {
    const relayErr = new Error('relay 404 GET /get: No connected Foundry clients found');
    const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => { throw relayErr; } } as unknown as RelayClient;
    try {
-      await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
+      await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
      throw new Error("should have thrown");
    } catch (err) {
      expect(isCcHashError(err)).toBe(false);
@@ -197,16 +180,16 @@ describe("ccHash error handling (E0.2)", () => {
  it("ccHashFromGet returns { hash, entry } on success and derives the hash from the same response", async () => {
    const e = entry();
    const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => e } as unknown as RelayClient;
-    const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
+    const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
    expect(result.entry).toBe(e);
-    expect(result.hash).toBe(ccHash(e, stubInverse));
+    expect(result.hash).toBe(ccHash(e));
  });

  it("ccHashFromGet throws CcHashError (not relay error) when the entry is malformed", async () => {
    const malformed = entry({ noData: true });
    const fakeRelay = { getEntry: async (): Promise<JournalEntry> => malformed } as unknown as RelayClient;
    try {
-      await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
+      await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
      throw new Error("should have thrown");
    } catch (err) {
      expect(isCcHashError(err)).toBe(true);