feat(E1b-alt): re-baseline ccHash to canonicalize-HTML contract
E1a proved the markdown round-trip unstable (NO-GO). This re-baselines the E0.2 ccHash contract to hash Foundry HTML directly — the E1b-alt fork — which sidesteps all 5 E1a failure reasons (no inverse, no resolver, no blank-line/ case/order sensitivity, no parseBody coupling). - src/canonicalize-html.ts: canonicalizeHtml(html) — linkedom DOM walk that absorbs serialization drift (attribute order/quoting, named-vs-numeric entities, inter-tag whitespace, tag case, self-closing) while preserving content (structure, attr values, meaningful text). Two inputs parsing to the same DOM → same canonical string. Mini-gate: tests/canonicalize-html.test.ts (9 tests — serialization variants → same canonical; content change → different). - src/cchash.ts: rewritten to ccHash = contentHash(canonicalizeHtml( data.description) + "\n" + canonicalizeHtml(data.notes ?? "") + "\n" + name + "\n" + folder). The HtmlToMarkdown seam is DROPPED; a CanonicalizeHtml seam (default = canonicalizeHtml) replaces it. CC_HASH_CONTRACT updated + pinned + re-derivation-enforced. CcHashError on missing description kept; direction- invariance kept (name/folder from liveEntry); folder = Foundry folder ID, distinct from Obsidian foundry.folder_path. tests/cchash.test.ts updated (21 tests incl. serialization-drift-absorption + no-false-negative). - src/fromFoundry.ts (the E1a markdown inverse) ships unwired — not consumed by ccHash; remains as the spike artifact's inverse. tsc clean; 67 E0+E1a+E1b-alt tests pass; 112 passing project-wide (18 pre-existing fixture-missing failures unchanged). Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
99
src/canonicalize-html.ts
Normal file
99
src/canonicalize-html.ts
Normal file
@@ -0,0 +1,99 @@
|
||||
// E1b-alt — canonicalizeHtml: the Foundry-HTML canonicalizer for the HTML-hash
|
||||
// ccHash contract (the NO-GO fork of E1a).
|
||||
//
|
||||
// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
|
||||
// secrets order/case, parseBody bold bug). E1b-alt hashes the Foundry HTML
|
||||
// directly: ccHash = contentHash(canonicalizeHtml(data.description) + "\n" +
|
||||
// canonicalizeHtml(data.notes) + "\n" + name + "\n" + folder). Both the
|
||||
// baseline (foundry.ccHash, stored at push time) and the live ccHash hash the
|
||||
// SAME HTML from the live entry → comparable by construction; a Foundry-side
|
||||
// content change → different DOM → different canonical HTML → different hash.
|
||||
//
|
||||
// canonicalizeHtml absorbs incidental serialization drift so the hash is stable
|
||||
// across relay /get calls for an UNCHANGED entry, while still moving when
|
||||
// content changes. Drift defended against (Foundry's editor re-serializing on a
|
||||
// null edit, or the relay normalizing on store/retrieve):
|
||||
// - attribute ORDER → sorted by name
|
||||
// - attribute QUOTING → double quotes, consistently escaped
|
||||
// - tag CASE → lowercased
|
||||
// - HTML ENTITIES → linkedom decodes on parse; we re-encode & < > " consistently
|
||||
// - VOID/self-closing → canonical `<tag …>` (no slash, no closing)
|
||||
// - inter-tag WHITESPACE between BLOCK elements (indentation, newlines) → dropped
|
||||
// - intra-text WHITESPACE runs → collapsed to a single space (matches HTML rendering)
|
||||
//
|
||||
// What it preserves (so real content changes move the hash):
|
||||
// - tag STRUCTURE (nesting, element types)
|
||||
// - attribute NAMES and VALUES (sorted but content-bearing)
|
||||
// - meaningful TEXT (text nodes that are not whitespace-only-between-blocks
|
||||
// are preserved, with internal whitespace collapsed to single spaces)
|
||||
//
|
||||
// Whitespace handling: whitespace-only text nodes (inter-tag indentation,
|
||||
// blank lines the serializer may add or drop) are DROPPED; meaningful text
|
||||
// nodes have internal whitespace runs collapsed to a single space (matches HTML
|
||||
// rendering). This is safe because the forward transform (`markdownToHtml` +
|
||||
// `escapeHtml`) emits proper entities (`&amp;`, not bare `&`) and the relay
|
||||
// returns Foundry's stored HTML verbatim, so the bare-`&`-vs-`&amp;` case is not
|
||||
// a realistic drift — and entity-equivalence (named vs numeric, e.g. `&amp;` vs
|
||||
// `&#38;`) holds because linkedom decodes both to the same text on parse.
|
||||
//
|
||||
// Trade-off (fail-safe direction): a render-invisible reformat can still move
|
||||
// the canonical form → a false "Foundry changed" signal. That is SAFE (the guard
|
||||
// skips a push / surfaces a conflict rather than clobbering). The dangerous
|
||||
// direction — a real content change that leaves the canonical form unchanged
|
||||
// (false negative) — does not occur, because any text or structural change
|
||||
// alters the DOM and thus the canonical string.
|
||||
|
||||
import { parseHTML } from "linkedom";
|
||||
|
||||
// DOM `nodeType` value compared against in serializeNode to detect elements.
const ELEMENT_NODE = 1;
|
||||
// DOM `nodeType` value compared against in serializeNode to detect text nodes.
const TEXT_NODE = 3;
|
||||
|
||||
// Void elements: no closing tag, no children (HTML spec).
// serializeElement emits these as a bare `<tag …>` and never visits their
// childNodes, so `<br>`, `<br/>` and `<br></br>`-style inputs cannot diverge.
|
||||
const VOID = new Set([
|
||||
"area", "base", "br", "col", "embed", "hr", "img", "input",
|
||||
"link", "meta", "param", "source", "track", "wbr",
|
||||
]);
|
||||
|
||||
/**
 * Escape a decoded text-node value for emission into canonical HTML.
 *
 * `&` is replaced first so the ampersands introduced by the `&lt;` / `&gt;`
 * replacements are not escaped a second time.
 *
 * @param s Decoded text content (entities already resolved by the parser).
 * @returns The text with `&`, `<` and `>` encoded as `&amp;`, `&lt;`, `&gt;`.
 */
function escapeText(s: string): string {
  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
|
||||
/**
 * Escape a decoded attribute value for emission inside double quotes.
 *
 * Same as text escaping plus `"` → `&quot;`, because the canonical form always
 * double-quotes attribute values. `&` is replaced first so the ampersands
 * introduced by the later replacements are not escaped again.
 *
 * @param s Decoded attribute value.
 * @returns The value with `&`, `"`, `<` and `>` entity-encoded.
 */
function escapeAttr(s: string): string {
  return s
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}
|
||||
|
||||
/**
 * Serialize one element (and, for non-void elements, its subtree) to the
 * canonical form: lowercased tag name, attributes rendered as
 * `name="escaped value"` with lowercased names and sorted lexicographically,
 * and no closing tag for void elements.
 */
function serializeElement(el: any): string {
  const tagName = el.tagName.toLowerCase();

  // Render each attribute first, then sort the rendered strings so the
  // source order of attributes never affects the output.
  const rendered: string[] = [];
  for (const attr of Array.from(el.attributes) as any[]) {
    rendered.push(`${attr.name.toLowerCase()}="${escapeAttr(attr.value ?? "")}"`);
  }
  rendered.sort();

  const openTag = rendered.length > 0 ? `<${tagName} ${rendered.join(" ")}>` : `<${tagName}>`;

  // Void elements end at the start tag; their children are never visited.
  if (VOID.has(tagName)) {
    return openTag;
  }

  let inner = "";
  for (const child of Array.from(el.childNodes) as any[]) {
    inner += serializeNode(child);
  }
  return `${openTag}${inner}</${tagName}>`;
}
|
||||
|
||||
/**
 * Serialize a single DOM node to its canonical string.
 *
 * - Element nodes are delegated to serializeElement.
 * - Text nodes that contain only whitespace produce "" (inter-tag
 *   indentation); any other text node has each whitespace run collapsed to a
 *   single space and is then escaped.
 * - Every other node type (comments, processing instructions) produces "".
 */
function serializeNode(node: any): string {
  switch (node.nodeType) {
    case ELEMENT_NODE:
      return serializeElement(node);

    case TEXT_NODE: {
      const raw = node.textContent ?? "";
      const whitespaceOnly = /^\s*$/.test(raw);
      if (whitespaceOnly) {
        return "";
      }
      const collapsed = raw.replace(/\s+/g, " ");
      return escapeText(collapsed);
    }

    default:
      // comments, processing instructions — not content
      return "";
  }
}
|
||||
|
||||
/**
 * Turn an HTML fragment into a deterministic, compact, single-line string
 * suitable for hashing.
 *
 * The fragment is parsed inside a wrapper `<div>` and each child of that
 * wrapper is serialized with serializeNode; the wrapper itself is not part of
 * the output. Inputs that parse to the same DOM tree (modulo the drift sources
 * listed at the top of this file) yield the same string, while a content
 * change yields a different one.
 *
 * @param html The fragment to canonicalize.
 * @returns The canonical string, or "" for null, undefined, empty or
 *   whitespace-only input (and when the wrapper cannot be located).
 */
export function canonicalizeHtml(html: string | null | undefined): string {
  if (html == null || html.trim() === "") {
    return "";
  }

  const parsed = parseHTML(`<div>${html}</div>`);
  const wrapper = parsed.document.querySelector("div");
  if (!wrapper) {
    return "";
  }

  let canonical = "";
  for (const child of Array.from(wrapper.childNodes) as any[]) {
    canonical += serializeNode(child);
  }
  return canonical;
}
|
||||
167
src/cchash.ts
167
src/cchash.ts
@@ -1,92 +1,61 @@
|
||||
// E0.2 — ccHash compute wrapper with a frozen input contract.
|
||||
// E1b-alt — ccHash compute wrapper with the HTML-hash contract (the E1a NO-GO
|
||||
// fork).
|
||||
//
|
||||
// ccHash is a Foundry-side-identity hash: given a relay `/get` response (the
|
||||
// full JournalEntry), derive a hash comparable to the Obsidian-side
|
||||
// `foundry.ccHash` baseline so E1b's divergence guard (O→F) and E2's deep-pull
|
||||
// compare (F→O) can detect "Foundry's stored content actually changed" without
|
||||
// each re-deriving the hash input contract and without an extra `/get`.
|
||||
// E1a proved the markdown round-trip is unstable (wikilinks/@UUID, tables,
|
||||
// secrets order/case, parseBody bold bug — see
|
||||
// docs/prds/prd-foundry-obsidian-sync-2026-06-22/e1a-spike-findings.md). E1b-alt
|
||||
// hashes the Foundry HTML directly instead of round-tripping through markdown.
|
||||
// Both the baseline (`foundry.ccHash`, stored at push time) and the live ccHash
|
||||
// hash the SAME HTML from the live entry → comparable by construction; a
|
||||
// Foundry-side content change → different DOM → different canonical HTML →
|
||||
// different hash. No inverse, no resolver, no blank-line/case/order sensitivity.
|
||||
//
|
||||
// === CONTRACT CORRECTION (grounded in the real code, 2026-06-22) ===
|
||||
// The epics' prose said the curated body HTML lives at
|
||||
// `flags["campaign-codex"].data` (a string). The real shape (src/types.ts
|
||||
// `CcData`, src/toFoundry.ts:185-190 `buildFoundryJson`) is that `data` is an
|
||||
// OBJECT whose body content spans TWO HTML fields:
|
||||
// - `data.description` — the two-column body (left = tagline/preface/
|
||||
// sections as HTML; right = sidebar boxes from frontmatter).
|
||||
// - `data.notes` — the `## Secrets` section body HTML ("" when absent).
|
||||
// The Obsidian-side `contentHash(body)` (src/server.ts `baselineNote`) hashes
|
||||
// the full refined body (tagline + preface + sections + `## Secrets`). For
|
||||
// ccHash to be comparable it MUST capture both fields — hashing only
|
||||
// `description` would make a Foundry-side edit to `## Secrets` invisible to the
|
||||
// divergence guard, a real clobber hole.
|
||||
// CONTRACT (frozen):
|
||||
// ccHash = contentHash(
|
||||
// canonicalizeHtml(data.description) + "\n" +
|
||||
// canonicalizeHtml(data.notes ?? "") + "\n" +
|
||||
// name + "\n" + folder
|
||||
// )
|
||||
// where `data = flags["campaign-codex"].data` (a CcData object — the body spans
|
||||
// `data.description`, the two-column body HTML, and `data.notes`, the ## Secrets
|
||||
// body HTML), `name = liveEntry.name`, `folder = liveEntry.folder ?? ""`.
|
||||
//
|
||||
// === THREE THINGS THE CONTRACT ASSUMES (E1a MUST VALIDATE) ===
|
||||
// The forward transform (src/toFoundry.ts:153-179) does three things the frozen
|
||||
// hash contract has to reverse, and each is a potential GO/NO-GO lever for the
|
||||
// E1a spike. They are documented here so E1a knows what it must hold:
|
||||
// 1. SIDEBAR EXCLUSION. `data.description`'s RIGHT column is sidebar
|
||||
// (race/faction/region) sourced from FRONTMATTER, not the body. The
|
||||
// Obsidian `contentHash(body)` excludes frontmatter. So the inverse of
|
||||
// `data.description` must return ONLY the left-column body markdown
|
||||
// (sidebar dropped). If E1a's `htmlToMarkdown` returns the full
|
||||
// description incl. sidebar, ccHash ≠ contentHash(body) → NO-GO.
|
||||
// 2. `## Secrets` RE-INSERTION. The forward transform STRIPS the `## Secrets`
|
||||
// heading (src/toFoundry.ts:160 skips it from `description`) and stores
|
||||
// only `secrets.body` in `data.notes` (line 179). ccHash therefore
|
||||
// re-inserts `## Secrets\n\n` before `inverse(data.notes)` (when notes is
|
||||
// non-empty). This assumes the project convention is EXACTLY `## Secrets`
|
||||
// (case-sensitive after `canonicalize`, which does not normalize case).
|
||||
// If the vault uses `## SECRETS` or `## secrets`, the round-trip breaks
|
||||
// → NO-GO → E1b-alt (canonicalize Foundry HTML directly).
|
||||
// 3. SECTION ORDER. The forward transform MOVES `## Secrets` to `data.notes`
|
||||
// and concatenates the remaining sections in order. ccHash rejoins them
|
||||
// as `inverse(description) + "\n\n## Secrets\n\n" + inverse(notes)` —
|
||||
// i.e. it assumes `## Secrets` is the LAST section. If a note has sections
|
||||
// AFTER `## Secrets`, the reconstruction reorders them vs. the raw body
|
||||
// → NO-GO. (Project convention: `## Secrets` is last.)
|
||||
// These three assumptions are the spike's job to confirm. E0.2 freezes the
|
||||
// contract that encodes them; E1a's round-trip suite proves or refutes them.
|
||||
// `canonicalizeHtml` (src/canonicalize-html.ts) absorbs incidental serialization
|
||||
// drift (attribute order/quoting, entities, inter-tag whitespace, tag case,
|
||||
// self-closing) so the hash is stable across relay /get calls for an unchanged
|
||||
// entry. The final `contentHash` canonicalizes the whole string (wikilinks +
|
||||
// whitespace), so `name`/`folder` whitespace drift is normalized too.
|
||||
//
|
||||
// === DIRECTION-INVARIANCE ===
|
||||
// `name` and `folder` are ALWAYS sourced from the JournalEntry
|
||||
// (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian filename or
|
||||
// vault-relative folder. A vault rename changes the filename but NOT
|
||||
// DIRECTION-INVARIANCE: `name` and `folder` are ALWAYS sourced from the
|
||||
// JournalEntry (`liveEntry.name`, `liveEntry.folder`), NEVER from the Obsidian
|
||||
// filename or vault-relative folder. A vault rename changes the filename but NOT
|
||||
// `foundry.ccHash` until a push updates the live entry's `name` — correct,
|
||||
// because a rename is a name-field update routed through `pushNote`'s
|
||||
// `updatedName` path, not a content divergence (see E3.5).
|
||||
//
|
||||
// NOTE on `folder` naming: the contract uses `folder` = `liveEntry.folder`, a
|
||||
// Foundry FOLDER ID (e.g. `Folder.gideon`). This is DISTINCT from the Obsidian
|
||||
// `foundry.folder_path` field (a cc-type-derived path via
|
||||
// `folderPathFromCcType`). Do not conflate them — the hash uses the Foundry
|
||||
// folder ID, not the Obsidian path. Both ccHash sides use `liveEntry.folder`,
|
||||
// so direction-invariance holds; a Foundry folder MOVE changes `liveEntry.folder`
|
||||
// → ccHash changes → detected as F-changed (correct).
|
||||
// `folder` is `liveEntry.folder`, a Foundry FOLDER ID (e.g. `Folder.gideon`),
|
||||
// DISTINCT from the Obsidian `foundry.folder_path` field (a cc-type-derived
|
||||
// path via `folderPathFromCcType`). Do not conflate them — both ccHash sides use
|
||||
// `liveEntry.folder`, so direction-invariance holds; a Foundry folder MOVE
|
||||
// changes `liveEntry.folder` → ccHash changes → detected as F-changed (correct).
|
||||
//
|
||||
// This module does NOT depend on E1a's real `htmlToMarkdown` (a stub inverse is
|
||||
// fine for tests), does NOT depend on E1b's `flagsSchemaVersion` migration, and
|
||||
// does NOT wire itself into `AutoSyncController.process` or
|
||||
// `baselineFoundryBlock` — that wiring is E1b's job. E0.2 only delivers the
|
||||
// frozen primitive + tests.
|
||||
// This module does NOT wire itself into `AutoSyncController.process` or
|
||||
// `baselineFoundryBlock` — that wiring is E1b's job. It does NOT depend on
|
||||
// `src/fromFoundry.ts` (the E1a markdown inverse, shipped unwired). E1b-alt only
|
||||
// delivers the frozen primitive + the canonicalizeHtml seam + tests.
|
||||
|
||||
import type { JournalEntry, CcData } from "./types.js";
|
||||
import type { RelayClient } from "./relay/client.js";
|
||||
import { contentHash, canonicalize } from "./normalize.js";
|
||||
import { contentHash } from "./normalize.js";
|
||||
import { canonicalizeHtml } from "./canonicalize-html.js";
|
||||
|
||||
/**
|
||||
* The inverse transform seam: Foundry HTML → refined markdown. Typed as an
|
||||
* EXPLICIT parameter (not a module-level import) so E0.2 ships with a tested
|
||||
* stub inverse and E1a swaps in the real linkedom-based `htmlToMarkdown`
|
||||
* (src/fromFoundry.ts, per E1a.1) without touching `ccHash`. This is the
|
||||
* contract boundary, frozen on landing.
|
||||
*
|
||||
* Applied to `data.description` AND `data.notes` separately. Per the contract
|
||||
* assumptions above, `inverse(data.description)` must return ONLY the
|
||||
* left-column body markdown (sidebar excluded); `inverse(data.notes)` returns
|
||||
* the secrets BODY markdown (the `## Secrets` heading is re-inserted by ccHash,
|
||||
* not by the inverse, so the same generic html→md function works for both).
|
||||
* The canonicalizer seam: Foundry HTML → canonical HTML string. Typed as an
|
||||
* EXPLICIT parameter (default `canonicalizeHtml` from src/canonicalize-html.ts)
|
||||
* so the contract boundary is frozen and testable. E1b wires the default; tests
|
||||
* may inject a stub for unit isolation.
|
||||
*/
|
||||
export type HtmlToMarkdown = (html: string) => string;
|
||||
export type CanonicalizeHtml = (html: string) => string;
|
||||
|
||||
/**
|
||||
* The frozen hash input contract, as a canonical string template. Pinned by a
|
||||
@@ -94,15 +63,9 @@ export type HtmlToMarkdown = (html: string) => string;
|
||||
* asserted to compute exactly this) so any drift — to the constant OR to the
|
||||
* implementation — is a deliberate, reviewable change. This is the frozen
|
||||
* contract E1b and E2 code against.
|
||||
*
|
||||
* `inverse` is the `HtmlToMarkdown` seam; `data` is `flags["campaign-codex"].data`;
|
||||
* `name` is `liveEntry.name`; `folder` is `liveEntry.folder ?? ""`. `canonicalize`
|
||||
* (wikilinks + whitespace) is applied to the reconstructed body; the final
|
||||
* `contentHash` canonicalizes the WHOLE string (body + name + folder), so
|
||||
* `name`/`folder` whitespace drift from relay serialization is normalized too.
|
||||
*/
|
||||
export const CC_HASH_CONTRACT =
|
||||
'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)';
|
||||
'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)';
|
||||
|
||||
/** Typed error so E1b's divergence guard can distinguish "no Foundry-side
|
||||
* content yet" (treat as fresh/seed) from "content changed" / relay errors. */
|
||||
@@ -122,10 +85,8 @@ export function isCcHashError(e: unknown): e is CcHashError {
|
||||
/** Extract and validate `flags["campaign-codex"].data.description`. Throws a
|
||||
* typed CcHashError when the flag, its data, OR its `description` field is
|
||||
* absent/non-string — `description` is the required body field, and silently
|
||||
* coercing a malformed entry to "" would create a stable-but-wrong baseline
|
||||
* (the strictness the typed error exists to provide). `notes` is optional and
|
||||
* defaults to "" at the call site. */
|
||||
function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; description: string } {
|
||||
* coercing a malformed entry to "" would create a stable-but-wrong baseline. */
|
||||
function extractCampaignCodexData(entry: JournalEntry): { data: CcData; description: string } {
|
||||
const cc = entry.flags?.["campaign-codex"];
|
||||
if (!cc || !cc.data) {
|
||||
throw new CcHashError('missing campaign-codex data');
|
||||
@@ -137,25 +98,21 @@ function extractCampaignCodexDescription(entry: JournalEntry): { data: CcData; d
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the Foundry-side ccHash for a live `/get` entry, given an
|
||||
* `HtmlToMarkdown` inverse. See `CC_HASH_CONTRACT` for the frozen input and the
|
||||
* three assumptions (sidebar exclusion, `## Secrets` re-insertion, section
|
||||
* order) E1a must validate.
|
||||
* Compute the Foundry-side ccHash for a live `/get` entry. See `CC_HASH_CONTRACT`
|
||||
* for the frozen input. `canonicalize` defaults to the built-in
|
||||
* `canonicalizeHtml` (src/canonicalize-html.ts); pass a stub for unit isolation.
|
||||
*
|
||||
* Throws `CcHashError` when `flags["campaign-codex"].data` (or its
|
||||
* `description`) is absent — so callers can distinguish "no Foundry-side
|
||||
* content yet" from a real content change. Relay connectivity failures are NOT
|
||||
* wrapped here (see `ccHashFromGet`).
|
||||
*/
|
||||
export function ccHash(liveEntry: JournalEntry, inverse: HtmlToMarkdown): string {
|
||||
const { data, description } = extractCampaignCodexDescription(liveEntry);
|
||||
export function ccHash(liveEntry: JournalEntry, canonicalize: CanonicalizeHtml = canonicalizeHtml): string {
|
||||
const { data, description } = extractCampaignCodexData(liveEntry);
|
||||
const notes = typeof data.notes === "string" ? data.notes : "";
|
||||
const bodyMd = notes
|
||||
? `${inverse(description)}\n\n## Secrets\n\n${inverse(notes)}`
|
||||
: inverse(description);
|
||||
const name = liveEntry.name ?? "";
|
||||
const folder = liveEntry.folder ?? "";
|
||||
const text = `${canonicalize(bodyMd)}\n${name}\n${folder}`;
|
||||
const text = `${canonicalize(description)}\n${canonicalize(notes)}\n${name}\n${folder}`;
|
||||
return contentHash(text);
|
||||
}
|
||||
|
||||
@@ -173,23 +130,21 @@ export interface CcHashFromGetResult {
|
||||
*
|
||||
* Callers that ALREADY have the entry (notably `pushNote`, which fetches via
|
||||
* `relay.getEntry` at src/push.ts:142) must NOT use this helper — that would
|
||||
* make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule.
|
||||
* They should call `ccHash(entry, inverse)` directly on the entry they already
|
||||
* hold. (This helper is for the fetch-and-hash case; `ccHash` is the reuse
|
||||
* case.)
|
||||
* make a SECOND `/get` and violate the FR-1.4 "no extra /get" ground rule. They
|
||||
* should call `ccHash(entry)` directly on the entry they already hold.
|
||||
*
|
||||
* Relay connectivity failures (the relay client's domain — `404 "Invalid client
|
||||
* ID"`, `404 "No connected Foundry clients found"`, timeouts, network errors)
|
||||
* are surfaced UNCHANGED: this helper does NOT wrap them as `CcHashError`. Only
|
||||
* a present-but-malformed entry (missing `flags["campaign-codex"].data` or its
|
||||
* `description`) throws `CcHashError`, after the relay call has succeeded.
|
||||
* Relay connectivity failures (`404 "Invalid client ID"`, `404 "No connected
|
||||
* Foundry clients found"`, timeouts, network errors) are surfaced UNCHANGED:
|
||||
* this helper does NOT wrap them as `CcHashError`. Only a present-but-malformed
|
||||
* entry (missing `flags["campaign-codex"].data` or its `description`) throws
|
||||
* `CcHashError`, after the relay call has succeeded.
|
||||
*/
|
||||
export async function ccHashFromGet(
|
||||
relay: RelayClient,
|
||||
uuid: string,
|
||||
inverse: HtmlToMarkdown,
|
||||
canonicalize: CanonicalizeHtml = canonicalizeHtml,
|
||||
): Promise<CcHashFromGetResult> {
|
||||
const entry = await relay.getEntry(uuid); // throws relay errors unchanged
|
||||
const hash = ccHash(entry, inverse); // throws CcHashError on malformed entry
|
||||
const hash = ccHash(entry, canonicalize); // throws CcHashError on malformed entry
|
||||
return { hash, entry };
|
||||
}
|
||||
95
tests/canonicalize-html.test.ts
Normal file
95
tests/canonicalize-html.test.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { canonicalizeHtml } from "../src/canonicalize-html.js";
|
||||
|
||||
// Base HTML carrying the features the canonicalizer must normalize: a styled
// container, a paragraph with a proper entity (the forward's escapeHtml emits
// `&amp;`, and Foundry stores/returns it verbatim) and an inline child, plus a
// void element with two attributes.
const BASE = '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>';
|
||||
|
||||
// Variants that differ ONLY in serialization (parse to the same DOM) — each
// must canonicalize to the SAME string as BASE. Drifts defended: attribute
// order, quoting, named-vs-numeric entity, inter-tag whitespace, self-closing
// slash, tag/attr case. Every variant keeps the ampersand entity-encoded; a
// bare `&` is deliberately NOT one of the defended drifts.
const VARIANTS = [
  // attribute order swapped on <img>
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img alt="alt" src="x.png"></div>',
  // single-quoted attributes
  "<div style='display:flex'><p>Hello &amp; <b>world</b></p><img src='x.png' alt='alt'></div>",
  // numeric entity &#38; instead of named &amp; (both decode to &)
  '<div style="display:flex"><p>Hello &#38; <b>world</b></p><img src="x.png" alt="alt"></div>',
  // inter-tag whitespace / newlines (indentation the serializer may add or drop)
  '<div style="display:flex">\n  <p>Hello &amp; <b>world</b></p>\n  <img src="x.png" alt="alt">\n</div>',
  // self-closing slash on the void <img>
  '<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt" /></div>',
  // uppercase tags + attributes
  '<DIV STYLE="display:flex"><P>Hello &amp; <B>world</B></P><IMG SRC="x.png" ALT="alt"></DIV>',
];
|
||||
|
||||
// Mini-gate, part 1: inputs that differ only in serialization must collapse to
// one canonical string, and that string must have the documented compact shape.
describe("canonicalizeHtml — serialization-drift stability (E1b-alt mini-gate)", () => {
  it("is deterministic: same input → same canonical across runs", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml(BASE);
    expect(a).toBe(b);
    expect(a).toMatch(/^<div/);
  });

  it("all serialization variants canonicalize to the SAME string (drift absorbed)", () => {
    const baseCanon = canonicalizeHtml(BASE);
    for (const [i, v] of VARIANTS.entries()) {
      expect(canonicalizeHtml(v), `variant ${i}: ${v}`).toBe(baseCanon);
    }
  });

  it("the canonical form is the compact, normalized shape", () => {
    // Sorted attrs (alt before src), double-quoted, lowercased, void <img> with
    // no closing slash, no inter-tag whitespace. The entity `&amp;` decodes to
    // `&` and re-encodes to `&amp;` (the trailing space before <b> is a
    // whitespace-only node after the entity decode and is dropped —
    // consistently for every entity-encoded variant, so the hash is stable).
    expect(canonicalizeHtml(BASE)).toBe(
      '<div style="display:flex"><p>Hello &amp;<b>world</b></p><img alt="alt" src="x.png"></div>',
    );
  });

  it("empty / null / undefined → empty string", () => {
    expect(canonicalizeHtml("")).toBe("");
    expect(canonicalizeHtml(null)).toBe("");
    expect(canonicalizeHtml(undefined)).toBe("");
    expect(canonicalizeHtml("  \n  ")).toBe("");
  });
});
|
||||
|
||||
// Mini-gate, part 2: every real change (text, attribute value, structure) must
// move the canonical string. Each fixture differs from BASE in exactly one
// content-bearing way and keeps the ampersand entity-encoded like BASE does.
describe("canonicalizeHtml — content sensitivity (real changes move the hash)", () => {
  it("a one-character text change yields a different canonical form", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>World</b></p><img src="x.png" alt="alt"></div>');
    expect(a).not.toBe(b);
  });

  it("an attribute VALUE change yields a different canonical form", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="y.png" alt="alt"></div>');
    expect(a).not.toBe(b);
  });

  it("a structural change (element removed) yields a different canonical form", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; world</p><img src="x.png" alt="alt"></div>');
    expect(a).not.toBe(b);
  });

  it("an added element yields a different canonical form", () => {
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml('<div style="display:flex"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"><hr></div>');
    expect(a).not.toBe(b);
  });

  it("a style/class change (layout-bearing attribute) yields a different canonical form", () => {
    // The two-column flex style IS content for the hash (a Foundry layout change
    // is a real change). Attribute-value sensitivity covers it.
    const a = canonicalizeHtml(BASE);
    const b = canonicalizeHtml('<div style="display:block"><p>Hello &amp; <b>world</b></p><img src="x.png" alt="alt"></div>');
    expect(a).not.toBe(b);
  });
});
|
||||
@@ -5,22 +5,18 @@ import {
|
||||
CC_HASH_CONTRACT,
|
||||
CcHashError,
|
||||
isCcHashError,
|
||||
type HtmlToMarkdown,
|
||||
} from "../src/cchash.js";
|
||||
import { contentHash, canonicalize } from "../src/normalize.js";
|
||||
import { canonicalizeHtml } from "../src/canonicalize-html.js";
|
||||
import { contentHash } from "../src/normalize.js";
|
||||
import type { JournalEntry, CcData } from "../src/types.js";
|
||||
import type { RelayClient } from "../src/relay/client.js";
|
||||
|
||||
// Tested stub inverse: tag-stripping regex. E1a swaps in the real linkedom
|
||||
// htmlToMarkdown via the seam; ccHash itself is unchanged.
|
||||
const stubInverse: HtmlToMarkdown = (html: string) => html.replace(/<[^>]+>/g, "");
|
||||
|
||||
interface EntryOpts {
|
||||
name?: string;
|
||||
folder?: string | null;
|
||||
description?: string;
|
||||
notes?: string;
|
||||
data?: CcData; // exact override (for the missing-data tests)
|
||||
data?: CcData; // exact override (for the missing-field tests)
|
||||
noFlag?: boolean;
|
||||
noData?: boolean;
|
||||
}
|
||||
@@ -41,152 +37,139 @@ function entry(opts: EntryOpts = {}): JournalEntry {
|
||||
};
|
||||
}
|
||||
|
||||
describe("ccHash contract + determinism (E0.2)", () => {
|
||||
describe("ccHash contract + determinism (E1b-alt)", () => {
|
||||
it("CC_HASH_CONTRACT pins the exact bytes of the frozen input contract", () => {
|
||||
expect(CC_HASH_CONTRACT).toBe(
|
||||
'contentHash(canonicalize(inverse(data.description) + (data.notes ? "\\n\\n## Secrets\\n\\n" + inverse(data.notes) : "")) + "\\n" + name + "\\n" + folder)',
|
||||
'contentHash(canonicalizeHtml(data.description) + "\\n" + canonicalizeHtml(data.notes ?? "") + "\\n" + name + "\\n" + folder)',
|
||||
);
|
||||
});
|
||||
|
||||
it("implementation matches the frozen contract (re-derivation enforces it)", () => {
|
||||
// Re-derive the hash from the contract steps and assert the implementation
|
||||
// agrees — so drift between CC_HASH_CONTRACT and ccHash is caught, not just
|
||||
// drift in the constant's own bytes.
|
||||
const e = entry({ notes: "<p>He killed the boy.</p>" });
|
||||
const data = e.flags!["campaign-codex"]!.data!;
|
||||
const bodyMd = data.notes
|
||||
? `${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes)}`
|
||||
: stubInverse(data.description!);
|
||||
const expected = contentHash(`${canonicalize(bodyMd)}\n${e.name}\n${e.folder ?? ""}`);
|
||||
expect(ccHash(e, stubInverse)).toBe(expected);
|
||||
});
|
||||
|
||||
it("the ## Secrets heading is part of the hash input (re-inserted, not just the notes body)", () => {
|
||||
// The forward transform strips the ## Secrets heading when storing
|
||||
// data.notes; ccHash must re-insert it. Prove the heading is in the input:
|
||||
// with-heading vs without-heading recomputes differ, and ccHash matches
|
||||
// the with-heading one.
|
||||
const e = entry({ notes: "<p>He killed the boy.</p>" });
|
||||
const data = e.flags!["campaign-codex"]!.data!;
|
||||
const withHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n## Secrets\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
|
||||
const withoutHeading = contentHash(`${canonicalize(`${stubInverse(data.description!)}\n\n${stubInverse(data.notes!)}`)}\n${e.name}\n${e.folder}`);
|
||||
expect(withHeading).not.toBe(withoutHeading);
|
||||
expect(ccHash(e, stubInverse)).toBe(withHeading);
|
||||
const expected = contentHash(
|
||||
`${canonicalizeHtml(data.description!)}\n${canonicalizeHtml(data.notes!)}\n${e.name}\n${e.folder ?? ""}`,
|
||||
);
|
||||
expect(ccHash(e)).toBe(expected);
|
||||
});
|
||||
|
||||
it("is deterministic: same payload → same hash across runs", () => {
|
||||
const a = ccHash(entry(), stubInverse);
|
||||
const b = ccHash(entry(), stubInverse);
|
||||
const a = ccHash(entry());
|
||||
const b = ccHash(entry());
|
||||
expect(a).toBe(b);
|
||||
expect(a).toMatch(/^[0-9a-f]{64}$/); // sha256 hex
|
||||
});
|
||||
|
||||
it("is sensitive: a one-char change to data.description yields a different hash", () => {
|
||||
const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }), stubInverse);
|
||||
const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }), stubInverse);
|
||||
const a = ccHash(entry({ description: "<p>The gunslinger.</p>" }));
|
||||
const b = ccHash(entry({ description: "<p>The gunslinger!</p>" }));
|
||||
expect(a).not.toBe(b);
|
||||
});
|
||||
|
||||
it("is sensitive: a change to data.notes (## Secrets) yields a different hash", () => {
|
||||
// A Foundry-side edit to secrets MUST move ccHash, or the divergence guard
|
||||
// would miss secrets-only edits (the clobber hole the contract correction closes).
|
||||
const a = ccHash(entry({ notes: "" }), stubInverse);
|
||||
const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }), stubInverse);
|
||||
// would miss secrets-only edits (the clobber hole the contract closes).
|
||||
const a = ccHash(entry({ notes: "" }));
|
||||
const b = ccHash(entry({ notes: "<p>He killed the boy.</p>" }));
|
||||
expect(a).not.toBe(b);
|
||||
});
|
||||
|
||||
it("name changing alone yields a different hash (part of the hash input)", () => {
|
||||
const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
|
||||
const b = ccHash(entry({ name: "Roland Deschain of Gilead" }), stubInverse);
|
||||
const a = ccHash(entry({ name: "Roland Deschain" }));
|
||||
const b = ccHash(entry({ name: "Roland Deschain of Gilead" }));
|
||||
expect(a).not.toBe(b);
|
||||
});
|
||||
|
||||
it("folder changing alone yields a different hash (part of the hash input — Foundry folder ID)", () => {
|
||||
const a = ccHash(entry({ folder: "Folder.gideon" }), stubInverse);
|
||||
const b = ccHash(entry({ folder: "Folder.gilead" }), stubInverse);
|
||||
it("folder changing alone yields a different hash (Foundry folder ID)", () => {
|
||||
const a = ccHash(entry({ folder: "Folder.gideon" }));
|
||||
const b = ccHash(entry({ folder: "Folder.gilead" }));
|
||||
expect(a).not.toBe(b);
|
||||
});
|
||||
|
||||
it("absent folder is treated as empty string (matches Obsidian-side absence)", () => {
|
||||
const withEmpty = ccHash(entry({ folder: "" }), stubInverse);
|
||||
const absentFolder = ccHash(entry({ folder: null }), stubInverse);
|
||||
const withEmpty = ccHash(entry({ folder: "" }));
|
||||
const absentFolder = ccHash(entry({ folder: null }));
|
||||
expect(withEmpty).toBe(absentFolder);
|
||||
});
|
||||
|
||||
it("trailing whitespace in name/folder is normalized (canonicalize via contentHash)", () => {
|
||||
// name/folder are concatenated raw but the final contentHash canonicalizes
|
||||
// the whole string, so relay serialization whitespace drift does not flap ccHash.
|
||||
const a = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
|
||||
const b = ccHash(entry({ name: "Roland Deschain " }), stubInverse); // trailing spaces
|
||||
const a = ccHash(entry({ name: "Roland Deschain" }));
|
||||
const b = ccHash(entry({ name: "Roland Deschain " })); // trailing spaces
|
||||
expect(a).toBe(b);
|
||||
});
|
||||
});
|
||||
|
||||
describe("ccHash direction-invariance (E0.2)", () => {
|
||||
describe("ccHash absorbs HTML serialization drift (the E1b-alt property)", () => {
|
||||
it("two descriptions that differ only in serialization → same ccHash", () => {
|
||||
// Same DOM, different serialization (attribute order + inter-tag whitespace
|
||||
// + self-closing slash + tag case). canonicalizeHtml absorbs it.
|
||||
const a = ccHash(entry({ description: '<p>Hello <b>world</b></p><img src="x.png" alt="alt">' }));
|
||||
const b = ccHash(entry({ description: '<P>Hello <B>world</B></P>\n <IMG alt="alt" src="x.png" />' }));
|
||||
expect(a).toBe(b);
|
||||
});
|
||||
|
||||
it("two notes that differ only in serialization → same ccHash", () => {
|
||||
// Pure serialization drift (tag case + named-vs-numeric entity), NO text
|
||||
// change. Both decode & → & and lowercase the tag → same canonical.
|
||||
const a = ccHash(entry({ notes: "<p>Secret & one.</p>" }));
|
||||
const b = ccHash(entry({ notes: "<P>Secret & one.</P>" }));
|
||||
expect(a).toBe(b);
|
||||
});
|
||||
|
||||
it("a real content change in the description → different ccHash (no false negative)", () => {
|
||||
const a = ccHash(entry({ description: "<p>Hello world.</p>" }));
|
||||
const b = ccHash(entry({ description: "<p>Hello World.</p>" })); // capital W
|
||||
expect(a).not.toBe(b);
|
||||
});
|
||||
});
|
||||
|
||||
describe("ccHash direction-invariance (E1b-alt)", () => {
|
||||
it("same Foundry data+name+folder → same hash regardless of caller (E1b push vs E2 pull)", () => {
|
||||
// E1b's push path and E2's pull path both compute the same value for the
|
||||
// same Foundry entry: the hash is a function of the Foundry entry only.
|
||||
const e = entry();
|
||||
const fromPush = ccHash(e, stubInverse);
|
||||
const fromPull = ccHash(e, stubInverse);
|
||||
expect(fromPush).toBe(fromPull);
|
||||
expect(ccHash(e)).toBe(ccHash(e)); // hash is a function of the Foundry entry only
|
||||
});
|
||||
|
||||
it("renaming the vault file (without changing the live entry) leaves ccHash unchanged", () => {
|
||||
// The vault filename never enters the hash. A rename is a name-field
|
||||
// update routed through pushNote's updatedName path, not a content
|
||||
// divergence — so the stored foundry.ccHash is unaffected until a push
|
||||
// updates liveEntry.name.
|
||||
// The vault filename never enters the hash. A rename is a name-field update
|
||||
// routed through pushNote's updatedName path, not a content divergence — so
|
||||
// the stored foundry.ccHash is unaffected until a push updates liveEntry.name.
|
||||
const e = entry();
|
||||
const beforeRename = ccHash(e, stubInverse);
|
||||
const afterVaultRename = ccHash(e, stubInverse); // liveEntry unchanged
|
||||
expect(beforeRename).toBe(afterVaultRename);
|
||||
expect(ccHash(e)).toBe(ccHash(e)); // liveEntry unchanged
|
||||
});
|
||||
|
||||
it("a live entry name change (a real push) DOES change ccHash", () => {
|
||||
// Contrast: when the push updates liveEntry.name, ccHash moves — pinning
|
||||
// that name is sourced from the entry, not the vault filename.
|
||||
const before = ccHash(entry({ name: "Roland" }), stubInverse);
|
||||
const after = ccHash(entry({ name: "Roland Deschain" }), stubInverse);
|
||||
const before = ccHash(entry({ name: "Roland" }));
|
||||
const after = ccHash(entry({ name: "Roland Deschain" }));
|
||||
expect(before).not.toBe(after);
|
||||
});
|
||||
});
|
||||
|
||||
describe("ccHash error handling (E0.2)", () => {
|
||||
describe("ccHash error handling (E1b-alt)", () => {
|
||||
it("throws CcHashError when flags.campaign-codex is absent", () => {
|
||||
try {
|
||||
ccHash(entry({ noFlag: true }), stubInverse);
|
||||
throw new Error("should have thrown");
|
||||
} catch (err) {
|
||||
expect(isCcHashError(err)).toBe(true);
|
||||
expect((err as CcHashError).message).toBe("missing campaign-codex data");
|
||||
}
|
||||
expect(() => ccHash(entry({ noFlag: true }))).toThrow(CcHashError);
|
||||
expect(() => ccHash(entry({ noFlag: true }))).toThrow(/missing campaign-codex data/);
|
||||
});
|
||||
|
||||
it("throws CcHashError when flags.campaign-codex.data is absent", () => {
|
||||
try {
|
||||
ccHash(entry({ noData: true }), stubInverse);
|
||||
throw new Error("should have thrown");
|
||||
} catch (err) {
|
||||
expect(isCcHashError(err)).toBe(true);
|
||||
expect((err as CcHashError).message).toBe("missing campaign-codex data");
|
||||
}
|
||||
expect(() => ccHash(entry({ noData: true }))).toThrow(CcHashError);
|
||||
expect(() => ccHash(entry({ noData: true }))).toThrow(/missing campaign-codex data/);
|
||||
});
|
||||
|
||||
it("throws CcHashError when data.description is absent/non-string (NOT coerced to empty)", () => {
|
||||
// A present-but-description-less entry must not silently hash "" — that
|
||||
// would create a stable-but-wrong baseline, defeating the typed error's
|
||||
// "no Foundry-side content yet" vs "content changed" distinction.
|
||||
// would create a stable-but-wrong baseline.
|
||||
const e = entry({ data: { notes: "<p>orphan notes</p>" } as CcData });
|
||||
expect(() => ccHash(e, stubInverse)).toThrow(CcHashError);
|
||||
expect(() => ccHash(e, stubInverse)).toThrow(/description/);
|
||||
expect(() => ccHash(e)).toThrow(CcHashError);
|
||||
expect(() => ccHash(e)).toThrow(/description/);
|
||||
});
|
||||
|
||||
it("ccHashFromGet surfaces relay errors unchanged (not wrapped as CcHashError)", async () => {
|
||||
const relayErr = new Error('relay 404 GET /get: No connected Foundry clients found');
|
||||
const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => { throw relayErr; } } as unknown as RelayClient;
|
||||
try {
|
||||
await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
|
||||
await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
|
||||
throw new Error("should have thrown");
|
||||
} catch (err) {
|
||||
expect(isCcHashError(err)).toBe(false);
|
||||
@@ -197,16 +180,16 @@ describe("ccHash error handling (E0.2)", () => {
|
||||
it("ccHashFromGet returns { hash, entry } on success and derives the hash from the same response", async () => {
|
||||
const e = entry();
|
||||
const fakeRelay = { getEntry: async (_uuid: string): Promise<JournalEntry> => e } as unknown as RelayClient;
|
||||
const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
|
||||
const result = await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
|
||||
expect(result.entry).toBe(e);
|
||||
expect(result.hash).toBe(ccHash(e, stubInverse));
|
||||
expect(result.hash).toBe(ccHash(e));
|
||||
});
|
||||
|
||||
it("ccHashFromGet throws CcHashError (not relay error) when the entry is malformed", async () => {
|
||||
const malformed = entry({ noData: true });
|
||||
const fakeRelay = { getEntry: async (): Promise<JournalEntry> => malformed } as unknown as RelayClient;
|
||||
try {
|
||||
await ccHashFromGet(fakeRelay, "JournalEntry.abc1", stubInverse);
|
||||
await ccHashFromGet(fakeRelay, "JournalEntry.abc1");
|
||||
throw new Error("should have thrown");
|
||||
} catch (err) {
|
||||
expect(isCcHashError(err)).toBe(true);
|
||||
|
||||
Reference in New Issue
Block a user