feat(E1b-alt): re-baseline ccHash to canonicalize-HTML contract

E1a proved the markdown round-trip unstable (NO-GO). This re-baselines the
E0.2 ccHash contract to hash Foundry HTML directly — the E1b-alt fork — which
sidesteps all 5 E1a failure reasons (no inverse, no resolver, no blank-line/
case/order sensitivity, no parseBody coupling).

- src/canonicalize-html.ts: canonicalizeHtml(html) — linkedom DOM walk that
  absorbs serialization drift (attribute order/quoting, named-vs-numeric
  entities, inter-tag whitespace, tag case, self-closing) while preserving
  content (structure, attr values, meaningful text). Two inputs parsing to the
  same DOM → same canonical string. Mini-gate: tests/canonicalize-html.test.ts
  (9 tests — serialization variants → same canonical; content change → different).
- src/cchash.ts: rewritten to ccHash = contentHash(canonicalizeHtml(
  data.description) + "\n" + canonicalizeHtml(data.notes ?? "") + "\n" + name
  + "\n" + folder). The HtmlToMarkdown seam is DROPPED; a CanonicalizeHtml seam
  (default = canonicalizeHtml) replaces it. CC_HASH_CONTRACT updated + pinned +
  re-derivation-enforced. CcHashError on missing description kept; direction-
  invariance kept (name/folder from liveEntry); folder = Foundry folder ID,
  distinct from Obsidian foundry.folder_path. tests/cchash.test.ts updated (21
  tests incl. serialization-drift-absorption + no-false-negative).
- src/fromFoundry.ts (the E1a markdown inverse) ships unwired — not consumed by
  ccHash; remains as the spike artifact's inverse.

tsc clean; 67 E0+E1a+E1b-alt tests pass; 112 passing project-wide (18 pre-existing
fixture-missing failures unchanged).

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2026-06-22 22:35:09 +00:00
parent d404929a84
commit 5d96bf1267
4 changed files with 324 additions and 192 deletions

99
src/canonicalize-html.ts Normal file
View File

@@ -0,0 +1,99 @@
// E1b-alt — canonicalizeHtml: the Foundry-HTML canonicalizer for the HTML-hash
// ccHash contract (the NO-GO fork of E1a).
//
// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
// secrets order/case, parseBody bold bug). E1b-alt hashes the Foundry HTML
// directly: ccHash = contentHash(canonicalizeHtml(data.description) + "\n" +
// canonicalizeHtml(data.notes) + "\n" + name + "\n" + folder). Both the
// baseline (foundry.ccHash, stored at push time) and the live ccHash hash the
// SAME HTML from the live entry → comparable by construction; a Foundry-side
// content change → different DOM → different canonical HTML → different hash.
//
// canonicalizeHtml absorbs incidental serialization drift so the hash is stable
// across relay /get calls for an UNCHANGED entry, while still moving when
// content changes. Drift defended against (Foundry's editor re-serializing on a
// null edit, or the relay normalizing on store/retrieve):
// - attribute ORDER → sorted by name
// - attribute QUOTING → double quotes, consistently escaped
// - tag CASE → lowercased
// - HTML ENTITIES → linkedom decodes on parse; we re-encode & < > " consistently
// - VOID/self-closing → canonical `<tag …>` (no slash, no closing)
// - inter-tag WHITESPACE (indentation, newlines) → every whitespace-only text
//   node is dropped, whether its neighbours are block or inline elements
// - intra-text WHITESPACE runs → collapsed to a single space (matches HTML rendering)
//
// What it preserves (so real content changes move the hash):
// - tag STRUCTURE (nesting, element types)
// - attribute NAMES and VALUES (sorted but content-bearing)
// - meaningful TEXT (text nodes that are not whitespace-only-between-blocks
// are preserved, with internal whitespace collapsed to single spaces)
//
// Whitespace handling: whitespace-only text nodes (inter-tag indentation,
// blank lines the serializer may add or drop) are DROPPED; meaningful text
// nodes have internal whitespace runs collapsed to a single space (matches HTML
// rendering). One caveat: the canonical-shape test in
// tests/canonicalize-html.test.ts shows that a space following a decoded entity
// (`&amp; <b>`) reaches the serializer as its own whitespace-only text node and
// is dropped, so an entity-encoded `&amp;` and a bare `&` could canonicalize
// differently. That is acceptable because the forward transform
// (`markdownToHtml` + `escapeHtml`) emits proper entities (`&amp;`, not bare
// `&`) and the relay returns Foundry's stored HTML verbatim, so the
// bare-`&`-vs-`&amp;` case is not a realistic drift — and entity-equivalence
// (named vs numeric, e.g. `&amp;` vs `&#38;`) holds because linkedom decodes
// both to the same text on parse.
//
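// Trade-off (fail-safe direction): a render-invisible reformat can still move
// the canonical form → a false "Foundry changed" signal. That is SAFE (the guard
// skips a push / surfaces a conflict rather than clobbering). The dangerous
// direction — a real content change that leaves the canonical form unchanged
// (false negative) — is limited to edits the serializer deliberately ignores:
// whitespace-only text nodes are dropped wherever they sit (so
// `<b>a</b> <i>b</i>` and `<b>a</b><i>b</i>` canonicalize identically),
// whitespace runs inside text are collapsed for every element (including
// `<pre>`), and comment nodes are skipped. A change to non-whitespace text, an
// attribute name/value, or the element structure alters the canonical string.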
import { parseHTML } from "linkedom";
// DOM `nodeType` codes for the two node kinds the serializer emits.
const ELEMENT_NODE = 1;
const TEXT_NODE = 3;

// Void elements (HTML spec): emitted as a bare start tag — no closing tag and
// no children are serialized for them.
const VOID: ReadonlySet<string> = new Set<string>([
  "area",
  "base",
  "br",
  "col",
  "embed",
  "hr",
  "img",
  "input",
  "link",
  "meta",
  "param",
  "source",
  "track",
  "wbr",
]);
/**
 * Escape a decoded text-node value for the canonical string: `&`, `<` and `>`
 * become `&amp;`, `&lt;` and `&gt;`. Every other character passes through.
 */
function escapeText(s: string): string {
  return s.replace(/[&<>]/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      default:
        return "&gt;";
    }
  });
}
/**
 * Escape an attribute value for emission inside double quotes: `&`, `"`, `<`
 * and `>` become `&amp;`, `&quot;`, `&lt;` and `&gt;`. Single quotes and all
 * other characters are left as-is.
 */
function escapeAttr(s: string): string {
  return s.replace(/[&"<>]/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case '"':
        return "&quot;";
      case "<":
        return "&lt;";
      default:
        return "&gt;";
    }
  });
}
/**
 * Serialize one element node to canonical HTML: lowercased tag name, then its
 * attributes rendered as `name="escaped value"` (names lowercased, a missing
 * value treated as "") and sorted as whole strings, then — unless the tag is
 * in VOID — the serialized children followed by the closing tag.
 */
function serializeElement(el: any): string {
  const tag = el.tagName.toLowerCase();

  const rendered: string[] = [];
  for (const attr of Array.from(el.attributes) as any[]) {
    rendered.push(`${attr.name.toLowerCase()}="${escapeAttr(attr.value ?? "")}"`);
  }
  rendered.sort();

  const openTag = rendered.length > 0 ? `<${tag} ${rendered.join(" ")}>` : `<${tag}>`;
  // Void elements stop at the start tag: no children, no closing tag.
  if (VOID.has(tag)) return openTag;

  let inner = "";
  for (const child of Array.from(el.childNodes) as any[]) {
    inner += serializeNode(child);
  }
  return `${openTag}${inner}</${tag}>`;
}
/**
 * Serialize a single DOM node. Elements are delegated to serializeElement;
 * text nodes are whitespace-normalized and escaped; every other node type
 * (comments, processing instructions) contributes nothing.
 */
function serializeNode(node: any): string {
  switch (node.nodeType) {
    case ELEMENT_NODE:
      return serializeElement(node);
    case TEXT_NODE: {
      const raw = node.textContent ?? "";
      // A text node with no non-whitespace character (e.g. inter-tag
      // indentation) is dropped entirely.
      if (!/\S/.test(raw)) return "";
      // Otherwise each whitespace run becomes one space, then the text is
      // escaped — so any edit to the non-whitespace text changes the output.
      const collapsed = raw.replace(/\s+/g, " ");
      return escapeText(collapsed);
    }
    default:
      return ""; // not treated as content
  }
}
/**
 * Turn an HTML fragment into a deterministic, compact canonical string for
 * hashing. The fragment is parsed inside a wrapper `<div>` and the wrapper's
 * child nodes are serialized in order, so inputs that parse to the same DOM
 * tree (modulo the drift sources listed in the file header) yield the same
 * string, while a content change yields a different one.
 *
 * @param html Fragment to canonicalize; null, undefined, empty and
 *   whitespace-only inputs all produce "".
 * @returns The concatenated canonical serialization of the fragment's
 *   top-level nodes, or "" when the wrapper element cannot be found.
 */
export function canonicalizeHtml(html: string | null | undefined): string {
  if (html == null || html.trim() === "") return "";

  const wrapper = parseHTML(`<div>${html}</div>`).document.querySelector("div");
  if (!wrapper) return "";

  let canonical = "";
  for (const child of Array.from(wrapper.childNodes) as any[]) {
    canonical += serializeNode(child);
  }
  return canonical;
}

View File

@@ -1,92 +1,61 @@
// E0.2 — ccHash compute wrapper with a frozen input contract.
// E1b-alt — ccHash compute wrapper with the HTML-hash contract (the E1a NO-GO
// fork).
//
// ccHash is a Foundry-side-identity hash: given a relay `/get` response (the
// full JournalEntry), derive a hash comparable to the Obsidian-side
// `foundry.ccHash` baseline so E1b's divergence guard (O→F) and E2's deep-pull
// compare (F→O) can detect "Foundry's stored content actually changed" without
// each re-deriving the hash input contract and without an extra `/get`.
// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
// secrets order/case, parseBody bold bug — see
// docs/prds/prd-foundry-obsidian-sync-2026-06-22/e1a-spike-findings.md). E1b-alt
// hashes the Foundry HTML directly instead of round-tripping through markdown.
// Both the baseline (`foundry.ccHash`, stored at push time) and the live ccHash
// hash the SAME HTML from the live entry → comparable by construction; a
// Foundry-side content change → different DOM → different canonical HTML →
// different hash. No inverse, no resolver, no blank-line/case/order sensitivity.
//
// === CONTRACT CORRECTION (grounded in the real code, 2026-06-22) ===
// The epics' prose said the curated body HTML lives at
// `flags["campaign-codex"].data` (a string). The real shape (src/types.ts
// `CcData`, src/toFoundry.ts:185-190 `buildFoundryJson`) is that `data` is an
// OBJECT whose body content spans TWO HTML fields:
// - `data.description` — the two-column body (left = tagline/preface/
// sections as HTML; right = sidebar boxes from frontmatter).
// - `data.notes` — the `## Secrets` section body HTML ("" when absent).
// The Obsidian-side `contentHash(body)` (src/server.ts `baselineNote`) hashes
// the full refined body (tagline + preface + sections + `## Secrets`). For
// ccHash to be comparable it MUST capture both fields — hashing only
// `description` would make a Foundry-side edit to `## Secrets` invisible to the
// divergence guard, a real clobber hole.
// CONTRACT (frozen):
// ccHash = contentHash(
// canonicalizeHtml(data.description) + "\n" +
// canonicalizeHtml(data.notes ?? "") + "\n" +
// name + "\n" + folder
// )
// where `data = flags["campaign-codex"].data` (a CcData object — the body spans
// `data.description`, the two-column body HTML, and `data.notes`, the ## Secrets
// body HTML), `name = liveEntry.name`, `folder = liveEntry.folder ?? ""`.
//
// === THREE THINGS THE CONTRACT ASSUMES (E1a MUST VALIDATE) ===
// The forward transform (src/toFoundry.ts:153-179) does three things the frozen
// hash contract has to reverse, and each is a potential GO/NO-GO lever for the
// E1a spike. They are documented here so E1a knows what it must hold:
// 1. SIDEBAR EXCLUSION. `data.description`'s RIGHT column is sidebar
// (race/faction/region) sourced from FRONTMATTER, not the body. The
// Obsidian `contentHash(body)` excludes frontmatter. So the inverse of
// `data.description` must return ONLY the left-column body markdown
// (sidebar dropped). If E1a's `htmlToMarkdown` returns the full
// description incl. sidebar, ccHash ≠ contentHash(body) → NO-GO.
// 2. `## Secrets` RE-INSERTION. The forward transform STRIPS the `## Secrets`
// heading (src/toFoundry.ts:160 skips it from `description`) and stores
// only `secrets.body` in `data.notes` (line 179). ccHash therefore
// re-inserts `## Secrets\n\n` before `inverse(data.notes)` (when notes is
// non-empty). This assumes the project convention is EXACTLY `## Secrets`
// (case-sensitive after `canonicalize`, which does not normalize case).
// If the vault uses `## SECRETS` or `## secrets`, the round-trip breaks
// → NO-GO → E1b-alt (canonicalize Foundry HTML directly).
// 3. SECTION ORDER. The forward transform MOVES `## Secrets` to `data.notes`
// and concatenates the remaining sections in order. ccHash rejoins them
// as `inverse(description) + "\n\n## Secrets\n\n" + inverse(notes)` —
// i.e. it assumes `## Secrets` is the LAST section. If a note has sections
// AFTER `## Secrets`, the reconstruction reorders them vs. the raw body
// → NO-GO. (Project convention: `## Secrets` is last.)
// These three assumptions are the spike's job to confirm. E0.2 freezes the
// contract that encodes them; E1a's round-trip suite proves or refutes them.
// `canonicalizeHtml` (src/canonicalize-html.ts) absorbs incidental serialization
// drift (attribute order/quoting, entities, inter-tag whitespace, tag case,
// self-closing) so the hash is stable across relay /get calls for an unchanged
// entry. The final `contentHash` canonicalizes the whole string (wikilinks +
// whitespace), so `name`/`folder` whitespace drift is normalized too.
//
// === DIRECTION-INVARIANCE ===
// `name` and `folder` are ALWAYS sourced from the JournalEntry
// (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian filename or
// vault-relative folder. A vault rename changes the filename but NOT
// DIRECTION-INVARIANCE: `name` and `folder` are ALWAYS sourced from the
// JournalEntry (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian
// filename or vault-relative folder. A vault rename changes the filename but NOT
// `foundry.ccHash` until a push updates the live entry's `name` — correct,
// because a rename is a name-field update routed through `pushNote`'s
// `updatedName` path, not a content divergence (see E3.5).
//
// NOTE on `folder` naming: the contract uses `folder` = `liveEntry.folder`, a
// Foundry FOLDER ID (e.g. `Folder.gideon`). This is DISTINCT from the Obsidian
// `foundry.folder_path` field (a cc-type-derived path via
// `folderPathFromCcType`). Do not conflate them — the hash uses the Foundry
// folder ID, not the Obsidian path. Both ccHash sides use `liveEntry.folder`,
// so direction-invariance holds; a Foundry folder MOVE changes `liveEntry.folder`
// → ccHash changes → detected as F-changed (correct).
// `folder` is `liveEntry.folder`, a Foundry FOLDER ID (e.g. `Folder.gideon`),
// DISTINCT from the Obsidian `foundry.folder_path` field (a cc-type-derived
// path via `folderPathFromCcType`). Do not conflate them — both ccHash sides use
// `liveEntry.folder`, so direction-invariance holds; a Foundry folder MOVE
// changes `liveEntry.folder` → ccHash changes → detected as F-changed (correct).
//
// This module does NOT depend on E1a's real `htmlToMarkdown` (a stub inverse is
// fine for tests), does NOT depend on E1b's `flagsSchemaVersion` migration, and
// does NOT wire itself into `AutoSyncController.process` or
// `baselineFoundryBlock` — that wiring is E1b's job. E0.2 only delivers the
// frozen primitive + tests.
// This module does NOT wire itself into `AutoSyncController.process` or
// `baselineFoundryBlock` — that wiring is E1b's job. It does NOT depend on
// `src/fromFoundry.ts` (the E1a markdown inverse, shipped unwired). E1b-alt only
// delivers the frozen primitive + the canonicalizeHtml seam + tests.
import type { JournalEntry, CcData } from "./types.js";
import type { RelayClient } from "./relay/client.js";
import { contentHash, canonicalize } from "./normalize.js";
import { contentHash } from "./normalize.js";
import { canonicalizeHtml } from "./canonicalize-html.js";
/**
* The inverse transform seam: Foundry HTML → refined markdown. Typed as an
* EXPLICIT parameter (not a module-level import) so E0.2 ships with a tested
* stub inverse and E1a swaps in the real linkedom-based `htmlToMarkdown`
* (src/fromFoundry.ts, per E1a.1) without touching `ccHash`. This is the
* contract boundary, frozen on landing.
*
* Applied to `data.description` AND `data.notes` separately. Per the contract
* assumptions above, `inverse(data.description)` must return ONLY the
* left-column body markdown (sidebar excluded); `inverse(data.notes)` returns
* the secrets BODY markdown (the `## Secrets` heading is re-inserted by ccHash,
* not by the inverse, so the same generic html→md function works for both).
* The canonicalizer seam: Foundry HTML → canonical HTML string. Typed as an
* EXPLICIT parameter (default `canonicalizeHtml` from src/canonicalize-html.ts)
* so the contract boundary is frozen and testable. E1b wires the default; tests
* may inject a stub for unit isolation.
*/
export type HtmlToMarkdown = (html: string) => string;
export type CanonicalizeHtml = (html: string) => string;
/**
* The frozen hash input contract, as a canonical string template. Pinned by a
@@ -94,15 +63,9 @@ export type HtmlToMarkdown = (html: string) => string;
* asserted to compute exactly this) so any drift — to the constant OR to the
* implementation — is a deliberate, reviewable change. This is the frozen
* contract E1b and E2 code against.
*
* `inverse` is the `HtmlToMarkdown` seam; `data` is `flags["campaign-codex"].data`;
* `name` is `liveEntry.name`; `folder` is `liveEntry.folder ?? ""`. `canonicalize`
* (wikilinks + whitespace) is applied to the reconstructed body; the final
* `contentHash` canonicalizes the WHOLE string (body + name + folder), so
* `name`/`folder` whitespace drift from relay serialization is normalized too.
*/
export const CC_HASH_CONTRACT =
'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)';
'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)';
/** Typed error so E1b's divergence guard can distinguish "no Foundry-side
* content yet" (treat as fresh/seed) from "content changed" / relay errors. */
@@ -122,10 +85,8 @@ export function isCcHashError(e: unknown): e is CcHashError {
/** Extract and validate `flags["campaign-codex"].data.description`. Throws a
* typed CcHashError when the flag, its data, OR its `description` field is
* absent/non-string — `description` is the required body field, and silently
* coercing a malformed entry to "" would create a stable-but-wrong baseline
* (the strictness the typed error exists to provide). `notes` is optional and
* defaults to "" at the call site. */
function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; description: string } {
* coercing a malformed entry to "" would create a stable-but-wrong baseline. */
function extractCampaignCodexData(entry: JournalEntry): { data: CcData; description: string } {
const cc = entry.flags?.["campaign-codex"];
if (!cc || !cc.data) {
throw new CcHashError('missing campaign-codex data');
@@ -137,25 +98,21 @@ function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; d
}
/**
* Compute the Foundry-side ccHash for a live `/get` entry, given an
* `HtmlToMarkdown` inverse. See `CC_HASH_CONTRACT` for the frozen input and the
* three assumptions (sidebar exclusion, `## Secrets` re-insertion, section
* order) E1a must validate.
* Compute the Foundry-side ccHash for a live `/get` entry. See `CC_HASH_CONTRACT`
* for the frozen input. `canonicalize` defaults to the built-in
* `canonicalizeHtml` (src/canonicalize-html.ts); pass a stub for unit isolation.
*
* Throws `CcHashError` when `flags["campaign-codex"].data` (or its
* `description`) is absent — so callers can distinguish "no Foundry-side
* content yet" from a real content change. Relay connectivity failures are NOT
* wrapped here (see `ccHashFromGet`).
*/
export function ccHash(liveEntry: JournalEntry, inverse: HtmlToMarkdown): string {
const { data, description } = extractCampaignCodexDescription(liveEntry);
export function ccHash(liveEntry: JournalEntry, canonicalize: CanonicalizeHtml = canonicalizeHtml): string {
const { data, description } = extractCampaignCodexData(liveEntry);
const notes = typeof data.notes === "string" ? data.notes : "";
const bodyMd = notes
? `${inverse(description)}\n\n## Secrets\n\n${inverse(notes)}`
: inverse(description);
const name = liveEntry.name ?? "";
const folder = liveEntry.folder ?? "";
const text = `${canonicalize(bodyMd)}\n${name}\n${folder}`;
const text = `${canonicalize(description)}\n${canonicalize(notes)}\n${name}\n${folder}`;
return contentHash(text);
}
@@ -173,23 +130,21 @@ export interface CcHashFromGetResult {
*
* Callers that ALREADY have the entry (notably `pushNote`, which fetches via
* `relay.getEntry` at src/push.ts:142) must NOT use this helper — that would
* make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule.
* They should call `ccHash(entry, inverse)` directly on the entry they already
* hold. (This helper is for the fetch-and-hash case; `ccHash` is the reuse
* case.)
* make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule. They
* should call `ccHash(entry)` directly on the entry they already hold.
*
* Relay connectivity failures (the relay client's domain — `404 "Invalid client
* ID"`, `404 "No connected Foundry clients found"`, timeouts, network errors)
* are surfaced UNCHANGED: this helper does NOT wrap them as `CcHashError`. Only
* a present-but-malformed entry (missing `flags["campaign-codex"].data` or its
* `description`) throws `CcHashError`, after the relay call has succeeded.
* Relay connectivity failures (`404 "Invalid client ID"`, `404 "No connected
* Foundry clients found"`, timeouts, network errors) are surfaced UNCHANGED:
* this helper does NOT wrap them as `CcHashError`. Only a present-but-malformed
* entry (missing `flags["campaign-codex"].data` or its `description`) throws
* `CcHashError`, after the relay call has succeeded.
*/
export async function ccHashFromGet(
relay: RelayClient,
uuid: string,
inverse: HtmlToMarkdown,
canonicalize: CanonicalizeHtml = canonicalizeHtml,
): Promise<CcHashFromGetResult> {
const entry = await relay.getEntry(uuid); // throws relay errors unchanged
const hash = ccHash(entry, inverse); // throws CcHashError on malformed entry
const hash = ccHash(entry, canonicalize); // throws CcHashError on malformed entry
return { hash, entry };
}

View File

@@ -0,0 +1,95 @@
import { describe, it, expect } from "vitest";
import { canonicalizeHtml } from "../src/canonicalize-html.js";
// Reference fragment exercising everything the canonicalizer normalizes: a
// styled container, a paragraph holding a proper entity (the forward's
// escapeHtml emits &amp;, and Foundry stores/returns it verbatim) plus an inline
// child, and a void element carrying two attributes.
const BASE =
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>';

// Re-serializations of BASE that parse to the same DOM. Each one must
// canonicalize to exactly the string BASE canonicalizes to. One drift per
// entry: attribute order, quoting, named-vs-numeric entity, inter-tag
// whitespace, self-closing slash, tag/attribute case.
const VARIANTS: readonly string[] = [
  // <img> attributes in the opposite order
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img alt="alt" src="x.png"></div>',
  // attributes quoted with single quotes
  "<div style='display:flex'><p>Hello &amp; <b>world</b></p><img src='x.png' alt='alt'></div>",
  // numeric entity &#38; in place of named &amp; (both decode to &)
  '<div style="display:flex"><p>Hello &#38; <b>world</b></p><img src="x.png" alt="alt"></div>',
  // newlines / indentation between tags (a serializer may add or drop these)
  '<div style="display:flex">\n <p>Hello &amp; <b>world</b></p>\n <img src="x.png" alt="alt">\n</div>',
  // void <img> written with a self-closing slash
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt" /></div>',
  // tags and attribute names in uppercase
  '<DIV STYLE="display:flex"><P>Hello &amp; <B>world</B></P><IMG SRC="x.png" ALT="alt"></DIV>',
];
describe("canonicalizeHtml — serialization-drift stability (E1b-alt mini-gate)", () => {
  it("is deterministic: same input → same canonical across runs", () => {
    const first = canonicalizeHtml(BASE);
    const second = canonicalizeHtml(BASE);
    expect(first).toBe(second);
    expect(first).toMatch(/^<div/);
  });

  it("all serialization variants canonicalize to the SAME string (drift absorbed)", () => {
    const expected = canonicalizeHtml(BASE);
    VARIANTS.forEach((variant, index) => {
      expect(canonicalizeHtml(variant), `variant ${index}: ${variant}`).toBe(expected);
    });
  });

  it("the canonical form is the compact, normalized shape", () => {
    // Pins the exact output for BASE: attributes sorted (alt before src) and
    // double-quoted, names lowercased, void <img> without a closing slash, no
    // inter-tag whitespace. The & decoded from &amp; is re-encoded as &amp;.
    // The space between the entity and <b> arrives as its own whitespace-only
    // text node and is dropped — the same way for every entity-encoded
    // variant, which is what keeps the hash stable.
    const pinned =
      '<div style="display:flex"><p>Hello &amp;<b>world</b></p><img alt="alt" src="x.png"></div>';
    expect(canonicalizeHtml(BASE)).toBe(pinned);
  });

  it("empty / null / undefined → empty string", () => {
    for (const blank of ["", null, undefined, " \n "]) {
      expect(canonicalizeHtml(blank)).toBe("");
    }
  });
});
describe("canonicalizeHtml — content sensitivity (real changes move the hash)", () => {
  // Shared check: the edited fragment must NOT canonicalize to BASE's form.
  const expectDiffersFromBase = (edited: string): void => {
    const baseline = canonicalizeHtml(BASE);
    const changed = canonicalizeHtml(edited);
    expect(baseline).not.toBe(changed);
  };

  it("a one-character text change yields a different canonical form", () => {
    expectDiffersFromBase(
      '<div style="display:flex"><p>Hello &amp; <b>World</b></p><img src="x.png" alt="alt"></div>',
    );
  });

  it("an attribute VALUE change yields a different canonical form", () => {
    expectDiffersFromBase(
      '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="y.png" alt="alt"></div>',
    );
  });

  it("a structural change (element removed) yields a different canonical form", () => {
    expectDiffersFromBase(
      '<div style="display:flex"><p>Hello &amp; world</p><img src="x.png" alt="alt"></div>',
    );
  });

  it("an added element yields a different canonical form", () => {
    expectDiffersFromBase(
      '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"><hr></div>',
    );
  });

  it("a style/class change (layout-bearing attribute) yields a different canonical form", () => {
    // The container's flex style counts as content for the hash (a Foundry
    // layout change is a real change); attribute-value sensitivity covers it.
    expectDiffersFromBase(
      '<div style="display:block"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>',
    );
  });
});

View File

@@ -5,22 +5,18 @@ import {
CC_HASH_CONTRACT,
CcHashError,
isCcHashError,
type HtmlToMarkdown,
} from "../src/cchash.js";
import { contentHash, canonicalize } from "../src/normalize.js";
import { canonicalizeHtml } from "../src/canonicalize-html.js";
import { contentHash } from "../src/normalize.js";
import type { JournalEntry, CcData } from "../src/types.js";
import type { RelayClient } from "../src/relay/client.js";
// Tested stub inverse: tag-stripping regex. E1a swaps in the real linkedom
// htmlToMarkdown via the seam; ccHash itself is unchanged.
const stubInverse: HtmlToMarkdown = (html: string) => html.replace(/<[^>]+>/g, "");
interface EntryOpts {
name?: string;
folder?: string | null;
description?: string;
notes?: string;
data?: CcData; // exact override (for the missing-data tests)
data?: CcData; // exact override (for the missing-field tests)
noFlag?: boolean;
noData?: boolean;
}
@@ -41,152 +37,139 @@ function entry(opts: EntryOpts = {}): JournalEntry {
};
}
describe("ccHash contract + determinism (E0.2)", () => {
describe("ccHash contract + determinism (E1b-alt)", () => {
it("CC_HASH_CONTRACT pins the exact bytes of the frozen input contract", () => {
expect(CC_HASH_CONTRACT).toBe(
'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)',
'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)',
);
});
it("implementation matches the frozen contract (re-derivation enforces it)", () => {
// Re-derive the hash from the contract steps and assert the implementation
// agrees — so drift between CC_HASH_CONTRACT and ccHash is caught, not just
// drift in the constant's own bytes.
const e = entry({ notes: "<p>He killed the boy.</p>" });
const data = e.flags!["campaign-codex"]!.data!;
const bodyMd = data.notes
? `${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes)}`
: stubInverse(data.description!);
const expected = contentHash(`${canonicalize(bodyMd)}\n${e.name}\n${e.folder ?? ""}`);
expect(ccHash(e, stubInverse)).toBe(expected);
});
it("the ## Secrets heading is part of the hash input (re-inserted, not just the notes body)", () => {
// The forward transform strips the ## Secrets heading when storing
// data.notes; ccHash must re-insert it. Prove the heading is in the input:
// with-heading vs without-heading recomputes differ, and ccHash matches
// the with-heading one.
const e = entry({ notes: "<p>He killed the boy.</p>" });
const data = e.flags!["campaign-codex"]!.data!;
const withHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
const withoutHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
expect(withHeading).not.toBe(withoutHeading);
expect(ccHash(e, stubInverse)).toBe(withHeading);
const expected = contentHash(
`${canonicalizeHtml(data.description!)}\n${canonicalizeHtml(data.notes!)}\n${e.name}\n${e.folder ?? ""}`,
);
expect(ccHash(e)).toBe(expected);
});
it("is deterministic: same payload → same hash across runs", () => {
const a = ccHash(entry(), stubInverse);
const b = ccHash(entry(), stubInverse);
const a = ccHash(entry());
const b = ccHash(entry());
expect(a).toBe(b);
expect(a).toMatch(/^[0-9a-f]{64}$/); // sha256 hex
});
it("is sensitive: a one-char change to data.description yields a different hash", () => {
const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }), stubInverse);
const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }), stubInverse);
const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }));
const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }));
expect(a).not.toBe(b);
});
it("is sensitive: a change to data.notes (## Secrets) yields a different hash", () => {
// A Foundry-side edit to secrets MUST move ccHash, or the divergence guard
// would miss secrets-only edits (the clobber hole the contract correction closes).
const a = ccHash(entry({ notes: "" }), stubInverse);
const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }), stubInverse);
// would miss secrets-only edits (the clobber hole the contract closes).
const a = ccHash(entry({ notes: "" }));
const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }));
expect(a).not.toBe(b);
});
it("name changing alone yields a different hash (part of the hash input)", () => {
const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
const b = ccHash(entry({ name: "Roland Deschain of Gilead" }), stubInverse);
const a = ccHash(entry({ name: "Roland Deschain" }));
const b = ccHash(entry({ name: "Roland Deschain of Gilead" }));
expect(a).not.toBe(b);
});
it("folder changing alone yields a different hash (part of the hash input — Foundry folder ID)", () => {
const a = ccHash(entry({ folder: "Folder.gideon" }), stubInverse);
const b = ccHash(entry({ folder: "Folder.gilead" }), stubInverse);
it("folder changing alone yields a different hash (Foundry folder ID)", () => {
const a = ccHash(entry({ folder: "Folder.gideon" }));
const b = ccHash(entry({ folder: "Folder.gilead" }));
expect(a).not.toBe(b);
});
it("absent folder is treated as empty string (matches Obsidian-side absence)", () => {
const withEmpty = ccHash(entry({ folder: "" }), stubInverse);
const absentFolder = ccHash(entry({ folder: null }), stubInverse);
const withEmpty = ccHash(entry({ folder: "" }));
const absentFolder = ccHash(entry({ folder: null }));
expect(withEmpty).toBe(absentFolder);
});
it("trailing whitespace in name/folder is normalized (canonicalize via contentHash)", () => {
// name/folder are concatenated raw but the final contentHash canonicalizes
// the whole string, so relay serialization whitespace drift does not flap ccHash.
const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
const b = ccHash(entry({ name: "Roland Deschain " }), stubInverse); // trailing spaces
const a = ccHash(entry({ name: "Roland Deschain" }));
const b = ccHash(entry({ name: "Roland Deschain " })); // trailing spaces
expect(a).toBe(b);
});
});
describe("ccHash direction-invariance (E0.2)", () => {
describe("ccHash absorbs HTML serialization drift (the E1b-alt property)", () => {
  it("two descriptions that differ only in serialization → same ccHash", () => {
    // Both fixtures describe the same DOM. The second one varies only the
    // serialization: upper-case tag names, inter-tag whitespace, swapped
    // attribute order, and a self-closing slash. canonicalizeHtml is expected
    // to absorb all of that, so the hashes must match.
    const compactHtml = '<p>Hello <b>world</b></p><img src="x.png" alt="alt">';
    const driftedHtml = '<P>Hello <B>world</B></P>\n <IMG alt="alt" src="x.png" />';
    const compactHash = ccHash(entry({ description: compactHtml }));
    const driftedHash = ccHash(entry({ description: driftedHtml }));
    expect(compactHash).toBe(driftedHash);
  });

  it("two notes that differ only in serialization → same ccHash", () => {
    // Serialization-only drift in the notes field, with no text change:
    // tag case differs, and the ampersand is written once as the named entity
    // `&amp;` and once as the numeric entity `&#38;`. Both forms decode to a
    // literal `&` and the tag is lower-cased, giving the same canonical form.
    const namedEntityNotes = "<p>Secret &amp; one.</p>";
    const numericEntityNotes = "<P>Secret &#38; one.</P>";
    const namedHash = ccHash(entry({ notes: namedEntityNotes }));
    const numericHash = ccHash(entry({ notes: numericEntityNotes }));
    expect(namedHash).toBe(numericHash);
  });

  it("a real content change in the description → different ccHash (no false negative)", () => {
    // Guard against over-normalization: the two descriptions differ in text
    // content (lower-case "w" vs capital "W"), so the hashes must differ.
    const lowerCaseHash = ccHash(entry({ description: "<p>Hello world.</p>" }));
    const upperCaseHash = ccHash(entry({ description: "<p>Hello World.</p>" }));
    expect(lowerCaseHash).not.toBe(upperCaseHash);
  });
});
describe("ccHash direction-invariance (E1b-alt)", () => {
it("same Foundry data+name+folder → same hash regardless of caller (E1b push vs E2 pull)", () => {
// E1b's push path and E2's pull path both compute the same value for the
// same Foundry entry: the hash is a function of the Foundry entry only.
const e = entry();
const fromPush = ccHash(e, stubInverse);
const fromPull = ccHash(e, stubInverse);
expect(fromPush).toBe(fromPull);
expect(ccHash(e)).toBe(ccHash(e)); // hash is a function of the Foundry entry only
});
it("renaming the vault file (without changing the live entry) leaves ccHash unchanged", () => {
// The vault filename never enters the hash. A rename is a name-field
// update routed through pushNote's updatedName path, not a content
// divergence — so the stored foundry.ccHash is unaffected until a push
// updates liveEntry.name.
// The vault filename never enters the hash. A rename is a name-field update
// routed through pushNote's updatedName path, not a content divergence — so
// the stored foundry.ccHash is unaffected until a push updates liveEntry.name.
const e = entry();
const beforeRename = ccHash(e, stubInverse);
const afterVaultRename = ccHash(e, stubInverse); // liveEntry unchanged
expect(beforeRename).toBe(afterVaultRename);
expect(ccHash(e)).toBe(ccHash(e)); // liveEntry unchanged
});
it("a live entry name change (a real push) DOES change ccHash", () => {
// Contrast: when the push updates liveEntry.name, ccHash moves — pinning
// that name is sourced from the entry, not the vault filename.
const before = ccHash(entry({ name: "Roland" }), stubInverse);
const after = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
const before = ccHash(entry({ name: "Roland" }));
const after = ccHash(entry({ name: "Roland Deschain" }));
expect(before).not.toBe(after);
});
});
describe("ccHash error handling (E0.2)", () => {
describe("ccHash error handling (E1b-alt)", () => {
it("throws CcHashError when flags.campaign-codex is absent", () => {
try {
ccHash(entry({ noFlag: true }), stubInverse);
throw new Error("should have thrown");
} catch (err) {
expect(isCcHashError(err)).toBe(true);
expect((err as CcHashError).message).toBe("missing campaign-codex data");
}
expect(() => ccHash(entry({ noFlag: true }))).toThrow(CcHashError);
expect(() => ccHash(entry({ noFlag: true }))).toThrow(/missing campaign-codex data/);
});
it("throws CcHashError when flags.campaign-codex.data is absent", () => {
try {
ccHash(entry({ noData: true }), stubInverse);
throw new Error("should have thrown");
} catch (err) {
expect(isCcHashError(err)).toBe(true);
expect((err as CcHashError).message).toBe("missing campaign-codex data");
}
expect(() => ccHash(entry({ noData: true }))).toThrow(CcHashError);
expect(() => ccHash(entry({ noData: true }))).toThrow(/missing campaign-codex data/);
});
it("throws CcHashError when data.description is absent/non-string (NOT coerced to empty)", () => {
// A present-but-description-less entry must not silently hash "" — that
// would create a stable-but-wrong baseline, defeating the typed error's
// "no Foundry-side content yet" vs "content changed" distinction.
// would create a stable-but-wrong baseline.
const e = entry({ data: { notes: "<p>orphan notes</p>" } as CcData });
expect(() => ccHash(e, stubInverse)).toThrow(CcHashError);
expect(() => ccHash(e, stubInverse)).toThrow(/description/);
expect(() => ccHash(e)).toThrow(CcHashError);
expect(() => ccHash(e)).toThrow(/description/);
});
it("ccHashFromGet surfaces relay errors unchanged (not wrapped as CcHashError)", async () => {
const relayErr = new Error('relay 404 GET /get: No connected Foundry clients found');
const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => { throw relayErr; } } as unknown as RelayClient;
try {
await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
throw new Error("should have thrown");
} catch (err) {
expect(isCcHashError(err)).toBe(false);
@@ -197,16 +180,16 @@ describe("ccHash error handling (E0.2)", () => {
it("ccHashFromGet returns { hash, entry } on success and derives the hash from the same response", async () => {
const e = entry();
const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => e } as unknown as RelayClient;
const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
expect(result.entry).toBe(e);
expect(result.hash).toBe(ccHash(e, stubInverse));
expect(result.hash).toBe(ccHash(e));
});
it("ccHashFromGet throws CcHashError (not relay error) when the entry is malformed", async () => {
const malformed = entry({ noData: true });
const fakeRelay = { getEntry: async (): Promise<JournalEntry> => malformed } as unknown as RelayClient;
try {
await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
throw new Error("should have thrown");
} catch (err) {
expect(isCcHashError(err)).toBe(true);