diff --git a/explorer.qmd b/explorer.qmd index 38f2a4d..dbd645b 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -797,6 +797,11 @@ FACET_TREE = (() => { // parent_uri, depth, count) and per-sample membership (pid ↔ every ancestor). facet_tree_url = `${R2_BASE}/isamples_202608_facet_tree_summaries.parquet` membership_url = `${R2_BASE}/isamples_202608_sample_facet_membership.parquet` +// #290/#293 single-active-filter cross-filter cube over the trees + source. +// Precomputed COUNT(DISTINCT pid) for each target node given ONE active filter, +// so global-view cross-filtered tree counts are instant instead of a live +// membership near-full-scan. ~1k rows; same schema as facet_cross_filter. +tree_cross_filter_url = `${R2_BASE}/isamples_202608_facet_tree_cross_filter.parquet` // Canonical palette — see issue #113. Path-relative so this works under // both isamples.org (custom domain at root) and project-pages fork @@ -1117,16 +1122,24 @@ function facetFilterSQL() { // concept NODES → filter via the membership table (which encodes each sample // under every ancestor), so a selected parent matches its whole subtree (no // client-side descendant expansion). A flat (non-tree) dim filters on facets_v3. + const treeClauses = []; // one "(facet_type='X' AND concept_uri IN(...))" per tree dim for (const key of TREE_DIM_KEYS) { const sel = treeSelection(key); if (sel.length === 0) continue; const list = sel.map(s => `'${escSql(s)}'`).join(','); if (treeActive(key)) { - parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${membership_url}') WHERE facet_type='${key}' AND concept_uri IN (${list}))`); + treeClauses.push(`(facet_type='${key}' AND concept_uri IN (${list}))`); } else { facetsConds.push(`${key} IN (${list})`); } } + // #293: collapse the tree-dim selections into ONE membership scan rather than + // one AND-ed subquery per dim (N full scans → 1 in DuckDB-WASM). 
// #290/#293: resolve the one effective filter straight from the controls.
// describeCrossFilters zeroes tree-node selections at global view, so this
// reads the source checkboxes and the tree selections itself.
//
// Returns {key, value} when exactly ONE dimension carries exactly ONE
// selected value (a source value or a single tree node). Returns null for
// every other state: nothing selected, several values, several dimensions.
// Dimensions rendered flat (treeActive(key) is false) are ignored on purpose,
// because the cube is keyed by concept_uri for tree dims.
function effectiveSingleFilter() {
  const checkboxCount = document.querySelectorAll('#sourceFilter input[type="checkbox"]').length;
  const activeSources = getActiveSources();

  // A source selection only counts as a filter when it is a proper, non-empty
  // subset of the available checkboxes.
  const sourceIsFiltering = activeSources.length > 0 && activeSources.length < checkboxCount;
  const candidates = sourceIsFiltering ? [{ key: 'source', values: activeSources }] : [];

  for (const dimKey of TREE_DIM_KEYS) {
    if (treeActive(dimKey)) {
      const picked = treeSelection(dimKey);
      if (picked && picked.length > 0) {
        candidates.push({ key: dimKey, values: picked });
      }
    }
  }

  if (candidates.length !== 1) return null;
  const only = candidates[0];
  if (only.values.length !== 1) return null;
  return { key: only.key, value: only.values[0] };
}
// Apply cross-filtered counts for a single active filter from the precomputed
// facet_tree_cross_filter cube.
//
// Parameters:
//   eff   - {key, value}: the one active filter (dimension key + selected value).
//   dims  - facet dimension descriptors; each d.key is handed to applyFacetCounts.
//   myReq - request id captured by the caller, compared with facetCountsReqId
//           after the await to detect a newer request.
//
// Returns true when counts were applied, or when the request was superseded
// mid-flight (the caller must stop, not fall through). Returns false on a
// miss, an unknown filter dimension, or a query error, so the caller can fall
// through to its existing paths.
//
// The cube's schema mirrors facet_cross_filter: four filter_* columns plus
// facet_type / facet_value / count.
async function applyTreeCubeCounts(eff, dims, myReq) {
  // Dimension key -> cube filter column. Insertion order fixes the order of
  // the WHERE terms below.
  const filterColumns = new Map([
    ['source', 'filter_source'],
    ['material', 'filter_material'],
    ['context', 'filter_context'],
    ['object_type', 'filter_object_type'],
  ]);

  const targetCol = filterColumns.get(eff.key);
  if (targetCol === undefined) {
    // Without this guard every column would be constrained to IS NULL, which
    // selects the UNFILTERED baseline rows and would present them as if they
    // were cross-filtered counts. Treat it as a miss instead.
    console.warn(`Tree cross-filter cube has no filter column for "${eff.key}"; falling back to on-the-fly.`);
    return false;
  }

  const value = escSql(eff.value);
  // Exactly one filter column equals the selected value; the rest must be NULL.
  const where = [...filterColumns.values()]
    .map((col) => (col === targetCol ? `${col} = '${value}'` : `${col} IS NULL`))
    .join(' AND ');

  try {
    const rows = await db.query(`
          SELECT facet_type, facet_value, count
          FROM read_parquet('${tree_cross_filter_url}')
          WHERE ${where}
        `);
    if (myReq !== facetCountsReqId) return true;   // superseded — stop, don't fall through
    if (!rows || rows.length === 0) return false;  // miss — fall through

    // One bucket per known dimension; rows for any other facet_type are
    // ignored. A Map (not a plain object) keeps a facet_type such as
    // "constructor" from resolving to an inherited Object.prototype member.
    const grouped = new Map([...filterColumns.keys()].map((key) => [key, new Map()]));
    for (const r of rows) {
      const bucket = grouped.get(r.facet_type);
      if (bucket) bucket.set(r.facet_value, Number(r.count));
    }

    // The active dim shows ALL its own nodes (null = no cross-filter on self).
    for (const d of dims) {
      applyFacetCounts(d.key, d.key === eff.key ? null : grouped.get(d.key));
    }
    return true;
  } catch (err) {
    console.warn('Tree cross-filter cube lookup failed; falling back to on-the-fly:', err);
    return false;
  }
}
null : viewerBboxSQL('l.latitude', 'l.longitude', VIEWPORT_PAD_FACTOR); + // #290/#293 TREE cross-filter cube fast-path. At (near-)global view, a + // SINGLE active filter — one node in any one dim — is answered instantly + // by the precomputed facet_tree_cross_filter cube, replacing the live + // membership COUNT(DISTINCT pid) self-scan that hits the DuckDB-WASM + // data-scale wall. CRITICAL: describeCrossFilters() intentionally ZEROES + // tree-dim selections at global view (so activeDims excludes them), which + // is exactly why a selected tree node otherwise fails to cross-filter the + // other dims at global view (#290). We therefore read the effective single + // filter (incl. tree nodes) DIRECTLY here, ahead of the baseline early- + // return. Gated to global view (bboxSQL===null; viewport needs live + // counts) and no active search (the cube is global, can't be pid-scoped). + // Any miss/error (incl. cube not yet published) returns false → falls + // through to the existing baseline / flat-cube / slow paths unchanged. + // GATE: only when ALL tree dims are rendered as trees (full tree mode — + // the deployed default). In flat mode (?facets=flat) the cube's subtree- + // membership semantics would be WRONG for a flat dim (it counts a sample + // under ancestor nodes it never directly asserts), and a flat-mode + // selection isn't representable in the cube — so we defer entirely to the + // flat-cube/slow paths (no flat-count regression). (Codex P1.) + // SCOPE NOTE (Codex r2): the degenerate "mixed" state (FACET_TREE on but + // one tree failed to render) also fails this gate and falls through to the + // pre-cube baseline path — i.e. the SAME behavior as before this cube + // existed (no regression). Cross-filtering that transient state is out of + // scope; the cube's contract is full-tree-mode + global view + one filter. 
def build_facet_tree_cross_filter(con, out):
    """Write the single-active-filter cross-filter count cube to `out`.

    Args:
        con: connection object exposing `.execute(sql)`. The SQL reads the
            relations `membership` (pid, facet_type, concept_uri) and
            `samp_geo` (pid, source), so both must already exist on `con`.
        out: destination path of the Parquet file. It is interpolated into the
            SQL text as-is (no quoting/escaping is applied here).

    Output columns: filter_source, filter_material, filter_context,
    filter_object_type, facet_type, facet_value, count. At most one filter_*
    column is non-NULL per row; all four are NULL on baseline rows.
    """
    # #290/#293: single-active-filter cross-filter COUNT cube spanning the 3 SKOS
    # trees (material/context/object_type — keyed by concept_uri, subtree semantics
    # via `membership`) AND the flat `source` dim. For every single active filter
    # (one node/value in ONE dim) it precomputes COUNT(DISTINCT pid) for every
    # OTHER dim's node/value, plus a baseline (no filter). This is the precomputed
    # answer to the live tree-count membership self-scan that hits the DuckDB-WASM
    # data-scale wall at global view (38.9M-row membership). Tiny output (~1k rows).
    #
    # Schema MIRRORS facet_cross_filter so the explorer reads it identically:
    #   filter_source/material/context/object_type, facet_type, facet_value, count
    # The filter dim is encoded in its filter_ column (concept_uri for trees,
    # source string for source); the target dim in facet_type/facet_value. A row is
    # the cross-filtered count of target value GIVEN the single filter. Counts are
    # GLOBAL (no viewport) — the explorer uses this only at/near global view, exactly
    # like the flat cube. Determinism via COUNT(DISTINCT) + full-key ORDER BY.
    #
    # NOTE: this DELIBERATELY excludes same-dim pairs (t.dim <> f.dim) — the explorer
    # never cross-filters a dim by its own selection (it shows all of a dim's nodes).
    # Because `source` is the only flat dim and same-dim pairs are excluded, every
    # FILTERED row (`single`) has a tree dim on at least one side, so flat→flat pairs
    # (already covered by facet_cross_filter) never appear. Baseline rows (`base`)
    # carry no filter at all and DO include the plain per-source totals.
    #
    # CTE roles:
    #   xf      — long form (pid, dim, value): tree membership rows plus one
    #             'source' row per sample whose source is non-blank after TRIM.
    #   single  — self-join of xf on pid across DIFFERENT dims: distinct pids
    #             having both the filter value (f) and the target value (t).
    #   base    — unfiltered distinct-pid count per (dim, value); filter cols NULL.
    con.execute(f"""COPY (
        WITH xf AS (
            SELECT pid, facet_type AS dim, concept_uri AS value FROM membership
            UNION ALL
            SELECT pid, 'source' AS dim, source AS value
            FROM samp_geo WHERE NULLIF(TRIM(source), '') IS NOT NULL
        ),
        single AS (
            SELECT f.dim AS fdim, f.value AS fval,
                   t.dim AS facet_type, t.value AS facet_value,
                   COUNT(DISTINCT t.pid) AS count
            FROM xf f JOIN xf t ON t.pid = f.pid AND t.dim <> f.dim
            GROUP BY 1, 2, 3, 4
        ),
        base AS (
            SELECT NULL::VARCHAR AS fdim, NULL::VARCHAR AS fval,
                   dim AS facet_type, value AS facet_value, COUNT(DISTINCT pid) AS count
            FROM xf GROUP BY dim, value
        ),
        allrows AS (SELECT * FROM single UNION ALL SELECT * FROM base)
        SELECT
            CASE WHEN fdim = 'source' THEN fval END AS filter_source,
            CASE WHEN fdim = 'material' THEN fval END AS filter_material,
            CASE WHEN fdim = 'context' THEN fval END AS filter_context,
            CASE WHEN fdim = 'object_type' THEN fval END AS filter_object_type,
            facet_type, facet_value, count
        FROM allrows
        ORDER BY filter_source, filter_material, filter_context, filter_object_type,
                 facet_type, facet_value
    ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")
+ emit("facet_tree_cross_filter", lambda o: build_facet_tree_cross_filter(con, o)) if not args.no_manifest: log("hashing inputs/outputs for manifest…", t0) diff --git a/scripts/validate_frontend_derived.py b/scripts/validate_frontend_derived.py index 5ab768e..b2d041a 100755 --- a/scripts/validate_frontend_derived.py +++ b/scripts/validate_frontend_derived.py @@ -52,6 +52,7 @@ def main(): ap.add_argument("--h3", nargs=3, metavar=("R4", "R6", "R8")) ap.add_argument("--tree-summaries", help="facet_tree_summaries parquet (#281/#282); optional") ap.add_argument("--membership", help="sample_facet_membership parquet (#281/#282); optional") + ap.add_argument("--tree-cross-filter", help="facet_tree_cross_filter parquet (#290/#293); optional") ap.add_argument("--wide", help="source wide parquet — enables the SEMANTIC gate " "(re-derive and diff the written files against a fresh build)") ap.add_argument("--min-rows", type=int, default=1_000_000, @@ -293,6 +294,7 @@ def _opt(name, attr): return None tree = _opt("facet_tree_summaries", "tree_summaries") mem = _opt("sample_facet_membership", "membership") + treexf = _opt("facet_tree_cross_filter", "tree_cross_filter") if tree: T = f"read_parquet('{tree}')" # parent ≥ child for every edge, every dim (distinct-pid UNION semantics — @@ -339,6 +341,61 @@ def _opt(name, attr): + (SELECT COUNT(*) FROM (SELECT * FROM t EXCEPT SELECT * FROM g))""") check("tree counts == GROUP BY membership (symmetric)", mm == 0, f"{mm} rows disagree") + # --- facet_tree_cross_filter cube (#290/#293) — checked only when present --- + # CROSS-FILE ALGEBRA: the cube must EXACTLY equal a fresh re-derivation of the + # single-active-filter cross-filter self-join over the WRITTEN membership (tree + # dims, subtree semantics) ∪ source (from facets_v2), plus the baseline. This is + # the same algebra the builder runs, recomputed independently here from the + # written sibling files — a drifted/stale/corrupt cube FAILS. AI-free. 
+ if treexf: + if not mem: + check("tree_cross_filter present but membership missing", False, + "need --membership (or {tag}_sample_facet_membership.parquet) to validate the cube") + else: + X = f"read_parquet('{treexf}')" + M = f"read_parquet('{mem}')" + # re-derive xf = tree membership ∪ flat source (from facets_v2 = located universe) + xf = (f"SELECT pid, facet_type AS dim, concept_uri AS value FROM {M} " + f"UNION ALL SELECT pid, 'source' AS dim, source AS value FROM {F} " + f"WHERE NULLIF(TRIM(source), '') IS NOT NULL") + ref = (f"WITH xf AS ({xf}), " + f"single AS (SELECT f.dim fdim, f.value fval, t.dim facet_type, t.value facet_value, " + f"COUNT(DISTINCT t.pid) count FROM xf f JOIN xf t ON t.pid=f.pid AND t.dim<>f.dim GROUP BY 1,2,3,4), " + f"base AS (SELECT NULL::VARCHAR fdim, NULL::VARCHAR fval, dim facet_type, value facet_value, " + f"COUNT(DISTINCT pid) count FROM xf GROUP BY dim, value) " + f"SELECT CASE WHEN fdim='source' THEN fval END filter_source, " + f"CASE WHEN fdim='material' THEN fval END filter_material, " + f"CASE WHEN fdim='context' THEN fval END filter_context, " + f"CASE WHEN fdim='object_type' THEN fval END filter_object_type, " + f"facet_type, facet_value, count FROM (SELECT * FROM single UNION ALL SELECT * FROM base)") + filecube = (f"SELECT filter_source, filter_material, filter_context, filter_object_type, " + f"facet_type, facet_value, count FROM {X}") + # GRAIN first: EXCEPT below is SET semantics, so a duplicated cube would + # pass the symmetric diff. One row per (all filter cols, facet_type, + # facet_value) is the contract the explorer relies on. (Codex P3.) 
+ xdup = scalar(f"""SELECT COUNT(*) FROM ( + SELECT filter_source, filter_material, filter_context, filter_object_type, + facet_type, facet_value + FROM {X} GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1)""") + check("tree_cross_filter grain unique", xdup == 0, f"{xdup} duplicated cube keys") + mm = scalar(f"SELECT (SELECT COUNT(*) FROM (({ref}) EXCEPT ({filecube}))) " + f"+ (SELECT COUNT(*) FROM (({filecube}) EXCEPT ({ref})))") + check("tree_cross_filter == re-derived self-join (symmetric)", mm == 0, + f"{mm} rows disagree (drifted/stale/corrupt cube)") + # baseline (all filter_* NULL) tree rows == facet_tree_summaries counts + if tree: + T3 = f"read_parquet('{tree}')" + bmm = scalar(f""" + WITH cb AS (SELECT facet_type, facet_value, count FROM {X} + WHERE filter_source IS NULL AND filter_material IS NULL + AND filter_context IS NULL AND filter_object_type IS NULL + AND facet_type <> 'source'), + ts AS (SELECT facet_type, concept_uri AS facet_value, count FROM {T3}) + SELECT (SELECT COUNT(*) FROM (SELECT * FROM cb EXCEPT SELECT * FROM ts)) + + (SELECT COUNT(*) FROM (SELECT * FROM ts EXCEPT SELECT * FROM cb))""") + check("tree_cross_filter baseline == tree_summaries", bmm == 0, + f"{bmm} baseline tree rows disagree with facet_tree_summaries") + print(f"\n{'CHECK':<44} {'RESULT':<6} DETAIL\n" + "-" * 90) ok = True for name, passed, detail in R: diff --git a/tests/test_frontend_derived.py b/tests/test_frontend_derived.py index 3a7ca68..97ae6ec 100644 --- a/tests/test_frontend_derived.py +++ b/tests/test_frontend_derived.py @@ -287,6 +287,169 @@ def test_manifest_tamper_caught(tmp_path): assert v.returncode != 0 and "manifest sha256" in v.stdout, f"gate missed manifest tamper:\n{v.stdout}" +# --------------------------------------------------------------------------- +# facet_tree_cross_filter cube (#290/#293) +# A self-contained tree fixture: a tiny SKOS vocab (broader edges) + located +# samples whose concept arrays resolve into those trees, so the REAL builder +# 
def build_tree_fixture(wide, vocab):
    """Write the two input Parquet files for the tree cross-filter tests.

    Args:
        wide: output path for the "wide" Parquet. It holds one
            'IdentifiedConcept' row per entry of TREE_CONCEPTS and one
            'MaterialSampleRecord' row per entry of TREE_SAMPLES.
        vocab: output path for the vocabulary Parquet. It holds one
            (uri, broader, uri_form) row per entry of TREE_EDGES.
    """
    # The spatial extension supplies ST_Point / ST_AsWKB used for sample geometry.
    con = duckdb.connect(); con.execute("INSTALL spatial; LOAD spatial;")
    # Concept rows: pid is the concept URI and row_id is the id that the sample
    # rows' p__has_* arrays refer to. All sample-only columns are typed NULLs so
    # the UNION ALL with the sample rows lines up column-for-column.
    ic_rows = " UNION ALL ".join(
        f"SELECT 'IdentifiedConcept' AS otype, '{uri}' AS pid, {rid}::BIGINT AS row_id, NULL::VARCHAR AS n, "
        f"'lbl' AS label, NULL::VARCHAR AS description, NULL::VARCHAR[] AS place_name, NULL::TIMESTAMP AS result_time, "
        f"NULL AS geometry, NULL::BIGINT[] AS p__has_material_category, NULL::BIGINT[] AS p__has_context_category, "
        f"NULL::BIGINT[] AS p__has_sample_object_type, NULL::BIGINT[] AS p__keywords"
        for rid, uri in TREE_CONCEPTS)
    # Sample rows: the sample's source string goes in column `n`, and each sample
    # gets its own point (10+i, 40+i) as WKB geometry.
    # NOTE(review): _arr is defined elsewhere in this file — presumably it renders
    # a Python list of row ids as a SQL BIGINT[] literal; confirm.
    msr = []
    for i, (pid, src, m, c, o) in enumerate(TREE_SAMPLES):
        msr.append(
            f"SELECT 'MaterialSampleRecord' AS otype, '{pid}' AS pid, NULL::BIGINT AS row_id, '{src}' AS n, "
            f"'label {pid}' AS label, 'desc {pid}' AS description, ['plc-{pid}']::VARCHAR[] AS place_name, "
            f"NULL::TIMESTAMP AS result_time, ST_AsWKB(ST_Point({10.0+i},{40.0+i})) AS geometry, "
            f"{_arr(m)} AS p__has_material_category, {_arr(c)} AS p__has_context_category, "
            f"{_arr(o)} AS p__has_sample_object_type, NULL::BIGINT[] AS p__keywords")
    con.execute(f"COPY ({ic_rows} UNION ALL {' UNION ALL '.join(msr)}) TO '{wide}' (FORMAT PARQUET)")
    # Vocabulary edges: a root (parent None) becomes a NULL `broader`; otherwise
    # repr(p) yields the quoted parent URI literal. Every edge is tagged 'data_v1'.
    edges = " UNION ALL ".join(
        f"SELECT '{u}' uri, {('NULL' if p is None else repr(p))}::VARCHAR broader, 'data_v1' uri_form"
        for u, p in TREE_EDGES)
    con.execute(f"COPY ({edges}) TO '{vocab}' (FORMAT PARQUET)")
    con.close()
def test_tree_cross_filter_validator_passes_and_gate_bites(tmp_path):
    """The validator accepts a freshly built cube and rejects a tampered one."""
    wide = str(tmp_path / "wide.parquet")
    vocab = str(tmp_path / "vocab.parquet")
    build_tree_fixture(wide, vocab)
    built = _build_tree(tmp_path, wide, vocab)
    assert built.returncode == 0

    # Clean artifacts: every gate, including the cube gate, has to pass.
    clean_cmd = [sys.executable, VALIDATE, "--dir", str(tmp_path), "--tag", "t",
                 "--min-rows", "1", "--wide", wide]
    clean = subprocess.run(clean_cmd, capture_output=True, text=True)
    assert clean.returncode == 0, f"validator failed on clean tree fixture:\n{clean.stdout}\n{clean.stderr}"
    assert "tree_cross_filter == re-derived self-join" in clean.stdout

    # Tamper: rewrite the cube with every count incremented, then swap it in
    # place of the original file.
    cube_path = str(tmp_path / "t_facet_tree_cross_filter.parquet")
    tampered_path = cube_path + ".tmp"
    tamper_sql = f"""COPY (SELECT filter_source, filter_material, filter_context, filter_object_type,
                    facet_type, facet_value, count+1 AS count FROM read_parquet('{cube_path}'))
                    TO '{tampered_path}' (FORMAT PARQUET)"""
    con = duckdb.connect()
    con.execute(tamper_sql)
    con.close()
    os.replace(tampered_path, cube_path)

    # Re-validate (no --wide this time): the run must now fail.
    dirty_cmd = [sys.executable, VALIDATE, "--dir", str(tmp_path), "--tag", "t", "--min-rows", "1"]
    dirty = subprocess.run(dirty_cmd, capture_output=True, text=True)
    gate_fired = dirty.returncode != 0 and "tree_cross_filter" in dirty.stdout
    assert gate_fired, f"cube gate failed to catch corruption:\n{dirty.stdout}"
def test_tree_cross_filter_grain_gate_bites(tmp_path):
    """A cube with every row duplicated must trip the uniqueness ("grain") gate.

    Duplicates are invisible to an EXCEPT-based diff (set semantics), so the
    validator needs the separate grain check to reject them (Codex P3).
    """
    wide = str(tmp_path / "wide.parquet")
    vocab = str(tmp_path / "vocab.parquet")
    build_tree_fixture(wide, vocab)
    built = _build_tree(tmp_path, wide, vocab)
    assert built.returncode == 0

    # Replace the cube with itself UNION ALL itself.
    cube_path = str(tmp_path / "t_facet_tree_cross_filter.parquet")
    doubled_path = cube_path + ".tmp"
    doubling_sql = f"""COPY (SELECT * FROM read_parquet('{cube_path}')
                    UNION ALL SELECT * FROM read_parquet('{cube_path}'))
                    TO '{doubled_path}' (FORMAT PARQUET)"""
    con = duckdb.connect()
    con.execute(doubling_sql)
    con.close()
    os.replace(doubled_path, cube_path)

    validate_cmd = [sys.executable, VALIDATE, "--dir", str(tmp_path), "--tag", "t", "--min-rows", "1"]
    outcome = subprocess.run(validate_cmd, capture_output=True, text=True)
    gate_fired = outcome.returncode != 0 and "grain unique" in outcome.stdout
    assert gate_fired, f"grain gate failed to catch a doubled cube:\n{outcome.stdout}"