Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 103 additions & 1 deletion explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,11 @@ FACET_TREE = (() => {
// parent_uri, depth, count) and per-sample membership (pid ↔ every ancestor).
facet_tree_url = `${R2_BASE}/isamples_202608_facet_tree_summaries.parquet`
membership_url = `${R2_BASE}/isamples_202608_sample_facet_membership.parquet`
// #290/#293 single-active-filter cross-filter cube over the trees + source.
// Precomputed COUNT(DISTINCT pid) for each target node given ONE active filter,
// so global-view cross-filtered tree counts are instant instead of a live
// membership near-full-scan. ~1k rows; same schema as facet_cross_filter.
tree_cross_filter_url = `${R2_BASE}/isamples_202608_facet_tree_cross_filter.parquet`

// Canonical palette — see issue #113. Path-relative so this works under
// both isamples.org (custom domain at root) and project-pages fork
Expand Down Expand Up @@ -1117,16 +1122,24 @@ function facetFilterSQL() {
// concept NODES → filter via the membership table (which encodes each sample
// under every ancestor), so a selected parent matches its whole subtree (no
// client-side descendant expansion). A flat (non-tree) dim filters on facets_v3.
const treeClauses = []; // one "(facet_type='X' AND concept_uri IN(...))" per tree dim
for (const key of TREE_DIM_KEYS) {
const sel = treeSelection(key);
if (sel.length === 0) continue;
const list = sel.map(s => `'${escSql(s)}'`).join(',');
if (treeActive(key)) {
parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${membership_url}') WHERE facet_type='${key}' AND concept_uri IN (${list}))`);
treeClauses.push(`(facet_type='${key}' AND concept_uri IN (${list}))`);
} else {
facetsConds.push(`${key} IN (${list})`);
}
}
// #293: collapse the tree-dim selections into ONE membership scan rather than
// one AND-ed subquery per dim (N full scans → 1 in DuckDB-WASM). AND across dims
// is `HAVING COUNT(DISTINCT facet_type) = <#dims>`; OR within a dim is the
// `concept_uri IN (...)`. Single-dim collapses to the same one scan as before.
if (treeClauses.length > 0) {
parts.push(`pid IN (SELECT pid FROM read_parquet('${membership_url}') WHERE ${treeClauses.join(' OR ')} GROUP BY pid HAVING COUNT(DISTINCT facet_type) = ${treeClauses.length})`);
}
if (facetsConds.length > 0) {
parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${facets_url}') WHERE ${facetsConds.join(' AND ')})`);
}
Expand Down Expand Up @@ -3070,6 +3083,64 @@ zoomWatcher = {
return conds.length > 0 ? conds.join(' AND ') : '1=1';
}

// #290/#293: the single active filter (one node/value in exactly ONE dim),
// read DIRECTLY from the controls so it sees tree-node selections even at
// global view (describeCrossFilters zeroes those). Returns {key, value} for a
// source value or a single tree NODE; null otherwise (no filter, multi-value,
// multi-dim, or a flat-mode dim — those keep their existing paths). The cube
// is keyed by concept_uri for tree dims, so flat-mode dim selections are
// deliberately excluded here (they'd miss the cube and fall through anyway).
/**
 * Resolve the one filter currently in effect, if there is exactly one.
 *
 * A candidate is collected for `source` when some, but not all, source
 * checkboxes are active, and for each tree dim that is rendered as a tree and
 * has a non-empty selection.
 *
 * @returns {{key: string, value: *}|null} the lone candidate when it carries
 *   exactly one value; null when there are zero or several candidates, or the
 *   lone candidate has more than one value.
 */
function effectiveSingleFilter() {
  const checkboxCount = document.querySelectorAll('#sourceFilter input[type="checkbox"]').length;
  const activeSources = getActiveSources();
  const candidates = [];

  // "All checked" and "none checked" both mean no source restriction.
  const restrictsSource = activeSources.length > 0 && activeSources.length < checkboxCount;
  if (restrictsSource) {
    candidates.push({ key: 'source', values: activeSources });
  }

  for (const key of TREE_DIM_KEYS) {
    // A dim not rendered as a tree is skipped without reading its selection.
    if (treeActive(key)) {
      const picked = treeSelection(key);
      if (picked && picked.length > 0) {
        candidates.push({ key, values: picked });
      }
    }
  }

  if (candidates.length !== 1) return null;
  const [{ key, values }] = candidates;
  return values.length === 1 ? { key, value: values[0] } : null;
}

// Apply cross-filtered counts for a single active filter from the precomputed
// facet_tree_cross_filter cube. Returns true if counts were applied (or the
// request was superseded mid-flight), false on a miss/error so the caller can
// fall through to the existing paths. The cube's schema mirrors
// facet_cross_filter: filter_<dim> columns + facet_type/facet_value/count.
/**
 * Look up cross-filtered counts for one active filter in the precomputed cube
 * and hand them to applyFacetCounts, one call per entry of `dims`.
 *
 * @param {{key: string, value: string}} eff - the single active filter; `key`
 *   selects which filter_<dim> column is matched against `value`.
 * @param {Array<{key: string}>} dims - dimensions to update; only `key` is read.
 * @param {*} myReq - request token captured by the caller; compared with the
 *   shared `facetCountsReqId` after the query resolves.
 * @returns {Promise<boolean>} true when counts were applied or the request was
 *   superseded while the query ran; false when `eff.key` has no cube column,
 *   the cube returned no rows, or the query threw — the caller then falls
 *   through to its other paths.
 */
async function applyTreeCubeCounts(eff, dims, myReq) {
  // A Map (not a plain object) so lookups never resolve inherited keys such as
  // "constructor". Insertion order fixes the order of the WHERE predicates.
  const filterColForKey = new Map([
    ['source', 'filter_source'],
    ['material', 'filter_material'],
    ['context', 'filter_context'],
    ['object_type', 'filter_object_type'],
  ]);
  const targetCol = filterColForKey.get(eff.key);
  // Without a matching column every predicate below would be `IS NULL`, which
  // selects the cube's baseline (unfiltered) rows. Applying those would show
  // unfiltered counts as if they were cross-filtered, so report a miss instead.
  if (targetCol === undefined) return false;

  const value = escSql(eff.value);
  const where = [...filterColForKey.values()]
    .map((c) => (c === targetCol ? `${c} = '${value}'` : `${c} IS NULL`))
    .join(' AND ');
  try {
    const rows = await db.query(`
      SELECT facet_type, facet_value, count
      FROM read_parquet('${tree_cross_filter_url}')
      WHERE ${where}
    `);
    if (myReq !== facetCountsReqId) return true; // superseded — stop, don't fall through
    if (!rows || rows.length === 0) return false; // miss — fall through

    // One bucket per known dim; rows naming any other facet_type are ignored.
    const grouped = new Map([...filterColForKey.keys()].map((k) => [k, new Map()]));
    for (const r of rows) {
      const bucket = grouped.get(r.facet_type);
      if (bucket) bucket.set(r.facet_value, Number(r.count));
    }
    // The active dim shows ALL its own nodes (null = no cross-filter on self).
    for (const d of dims) applyFacetCounts(d.key, d.key === eff.key ? null : grouped.get(d.key));
    return true;
  } catch (err) {
    console.warn('Tree cross-filter cube lookup failed; falling back to on-the-fly:', err);
    return false;
  }
}

async function updateCrossFilteredCounts(myReq) {
if (myReq !== facetCountsReqId) return;
const { dims, activeDims, totalActiveValues, sourceImpossible } = describeCrossFilters();
Expand Down Expand Up @@ -3107,6 +3178,37 @@ zoomWatcher = {
const isGlobal = isGlobalView();
const bboxSQL = isGlobal ? null : viewerBboxSQL('l.latitude', 'l.longitude', VIEWPORT_PAD_FACTOR);

// #290/#293 TREE cross-filter cube fast-path. At (near-)global view, a
// SINGLE active filter — one node in any one dim — is answered instantly
// by the precomputed facet_tree_cross_filter cube, replacing the live
// membership COUNT(DISTINCT pid) self-scan that hits the DuckDB-WASM
// data-scale wall. CRITICAL: describeCrossFilters() intentionally ZEROES
// tree-dim selections at global view (so activeDims excludes them), which
// is exactly why a selected tree node otherwise fails to cross-filter the
// other dims at global view (#290). We therefore read the effective single
// filter (incl. tree nodes) DIRECTLY here, ahead of the baseline early-
// return. Gated to global view (bboxSQL===null; viewport needs live
// counts) and no active search (the cube is global, can't be pid-scoped).
// Any miss/error (incl. cube not yet published) returns false → falls
// through to the existing baseline / flat-cube / slow paths unchanged.
// GATE: only when ALL tree dims are rendered as trees (full tree mode —
// the deployed default). In flat mode (?facets=flat) the cube's subtree-
// membership semantics would be WRONG for a flat dim (it counts a sample
// under ancestor nodes it never directly asserts), and a flat-mode
// selection isn't representable in the cube — so we defer entirely to the
// flat-cube/slow paths (no flat-count regression). (Codex P1.)
// SCOPE NOTE (Codex r2): the degenerate "mixed" state (FACET_TREE on but
// one tree failed to render) also fails this gate and falls through to the
// pre-cube baseline path — i.e. the SAME behavior as before this cube
// existed (no regression). Cross-filtering that transient state is out of
// scope; the cube's contract is full-tree-mode + global view + one filter.
if (bboxSQL === null && !searchIsActive() && !sourceImpossible
&& TREE_DIM_KEYS.every(treeActive)) {
const eff = effectiveSingleFilter();
if (eff && await applyTreeCubeCounts(eff, dims, myReq)) return;
if (myReq !== facetCountsReqId) return;
}

// Baseline early-return only applies when there is no filter AND no
// spatial constraint. In a non-global view with no facet filter, B1
// still wants per-value counts scoped to what's visible — fall
Expand Down
62 changes: 59 additions & 3 deletions scripts/build_frontend_derived.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@
# the artifacts this script knows how to build (for --only/--skip validation)
ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries",
"facet_summaries", "facet_cross_filter", "wide_h3",
"sample_facet_membership", "facet_tree_summaries"]
"sample_facet_membership", "facet_tree_summaries",
"facet_tree_cross_filter"]

# Shared SQL expression for sample_facets_v2.description (#277 part 2).
# Appends space-joined concept labels (IC labels across all 4 concept dims)
Expand Down Expand Up @@ -427,6 +428,59 @@ def build_facet_tree_summaries(con, out):
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def build_facet_tree_cross_filter(con, out):
# Write the single-active-filter cross-filter COUNT cube (#290/#293) to `out`.
#
# con: connection whose execute() runs the COPY below. The SQL reads the
#      `membership` and `samp_geo` relations, so both must already exist on it.
# out: destination path, interpolated verbatim into COPY ... TO '{out}'.
#
# The cube spans the 3 SKOS trees (material/context/object_type — keyed by
# concept_uri, subtree semantics via `membership`) AND the flat `source` dim.
# For every single active filter (one node/value in ONE dim) it precomputes
# COUNT(DISTINCT pid) for every OTHER dim's node/value, plus a baseline (no
# filter). This is the precomputed answer to the live tree-count membership
# self-scan that hits the DuckDB-WASM data-scale wall at global view (38.9M-row
# membership). Tiny output (~1k rows).
#
# Schema MIRRORS facet_cross_filter so the explorer reads it identically:
# filter_source/material/context/object_type, facet_type, facet_value, count
# The filter dim is encoded in its filter_<dim> column (concept_uri for trees,
# source string for source); the target dim in facet_type/facet_value. A row is
# the cross-filtered count of target value GIVEN the single filter. Baseline rows
# have all four filter_* columns NULL. Counts are GLOBAL (no viewport) — the
# explorer uses this only at/near global view, exactly like the flat cube.
# Determinism via COUNT(DISTINCT) + full-key ORDER BY.
#
# Rows with a blank/whitespace-only/NULL source contribute no `source` entry
# (NULLIF(TRIM(source), '') IS NOT NULL); the stored source value itself is not
# trimmed.
#
# NOTE: the `single` CTE DELIBERATELY excludes same-dim pairs (t.dim <> f.dim) —
# the explorer never cross-filters a dim by its own selection (it shows all of a
# dim's nodes). Because source is the only flat dim here, every FILTERED row has
# a tree dim on at least one side; flat→flat pairs are left to the existing
# facet_cross_filter. The `base` CTE is the exception: it emits unfiltered counts
# for every dim in xf, including `source` itself.
#
# NOTE(review): the CASE projection only recognises source/material/context/
# object_type as filter dims. A membership.facet_type outside those would yield
# `single` rows whose filter_* columns are all NULL, indistinguishable from
# baseline rows — presumably membership never contains one; confirm.
con.execute(f"""COPY (
WITH xf AS (
SELECT pid, facet_type AS dim, concept_uri AS value FROM membership
UNION ALL
SELECT pid, 'source' AS dim, source AS value
FROM samp_geo WHERE NULLIF(TRIM(source), '') IS NOT NULL
),
single AS (
SELECT f.dim AS fdim, f.value AS fval,
t.dim AS facet_type, t.value AS facet_value,
COUNT(DISTINCT t.pid) AS count
FROM xf f JOIN xf t ON t.pid = f.pid AND t.dim <> f.dim
GROUP BY 1, 2, 3, 4
),
base AS (
SELECT NULL::VARCHAR AS fdim, NULL::VARCHAR AS fval,
dim AS facet_type, value AS facet_value, COUNT(DISTINCT pid) AS count
FROM xf GROUP BY dim, value
),
allrows AS (SELECT * FROM single UNION ALL SELECT * FROM base)
SELECT
CASE WHEN fdim = 'source' THEN fval END AS filter_source,
CASE WHEN fdim = 'material' THEN fval END AS filter_material,
CASE WHEN fdim = 'context' THEN fval END AS filter_context,
CASE WHEN fdim = 'object_type' THEN fval END AS filter_object_type,
facet_type, facet_value, count
FROM allrows
ORDER BY filter_source, filter_material, filter_context, filter_object_type,
facet_type, facet_value
) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""")


def file_meta(con, path):
n = con.sql(f"SELECT COUNT(*) FROM read_parquet('{path}')").fetchone()[0]
schema = [(r[0], r[1]) for r in con.sql(f"DESCRIBE SELECT * FROM read_parquet('{path}')").fetchall()]
Expand Down Expand Up @@ -490,18 +544,20 @@ def emit(name, fn):
emit("wide_h3", lambda o: build_wide_h3(con, args.wide, o))

# Hierarchy artifacts (#281/#282) — need vocab_labels for the SKOS tree.
if want("sample_facet_membership") or want("facet_tree_summaries"):
if want("sample_facet_membership") or want("facet_tree_summaries") or want("facet_tree_cross_filter"):
if not args.vocab_labels:
# Fail loud if the user EXPLICITLY asked for a hierarchy artifact
# (Codex) — silently skipping an explicit --only target is wrong.
explicit = only & {"sample_facet_membership", "facet_tree_summaries"}
explicit = only & {"sample_facet_membership", "facet_tree_summaries", "facet_tree_cross_filter"}
if explicit:
sys.exit(f"FATAL: --only {sorted(explicit)} requires --vocab-labels <vocab_labels.parquet>")
log("SKIP hierarchy artifacts: pass --vocab-labels <vocab_labels.parquet>", t0)
else:
build_concept_membership(con, args.wide, args.vocab_labels, t0)
emit("sample_facet_membership", lambda o: build_sample_facet_membership(con, o))
emit("facet_tree_summaries", lambda o: build_facet_tree_summaries(con, o))
# #290/#293 cross-filter cube — needs membership (above) + samp_geo (source).
emit("facet_tree_cross_filter", lambda o: build_facet_tree_cross_filter(con, o))

if not args.no_manifest:
log("hashing inputs/outputs for manifest…", t0)
Expand Down
57 changes: 57 additions & 0 deletions scripts/validate_frontend_derived.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def main():
ap.add_argument("--h3", nargs=3, metavar=("R4", "R6", "R8"))
ap.add_argument("--tree-summaries", help="facet_tree_summaries parquet (#281/#282); optional")
ap.add_argument("--membership", help="sample_facet_membership parquet (#281/#282); optional")
ap.add_argument("--tree-cross-filter", help="facet_tree_cross_filter parquet (#290/#293); optional")
ap.add_argument("--wide", help="source wide parquet — enables the SEMANTIC gate "
"(re-derive and diff the written files against a fresh build)")
ap.add_argument("--min-rows", type=int, default=1_000_000,
Expand Down Expand Up @@ -293,6 +294,7 @@ def _opt(name, attr):
return None
tree = _opt("facet_tree_summaries", "tree_summaries")
mem = _opt("sample_facet_membership", "membership")
treexf = _opt("facet_tree_cross_filter", "tree_cross_filter")
if tree:
T = f"read_parquet('{tree}')"
# parent ≥ child for every edge, every dim (distinct-pid UNION semantics —
Expand Down Expand Up @@ -339,6 +341,61 @@ def _opt(name, attr):
+ (SELECT COUNT(*) FROM (SELECT * FROM t EXCEPT SELECT * FROM g))""")
check("tree counts == GROUP BY membership (symmetric)", mm == 0, f"{mm} rows disagree")

# --- facet_tree_cross_filter cube (#290/#293) — checked only when present ---
# CROSS-FILE ALGEBRA: the cube must EXACTLY equal a fresh re-derivation of the
# single-active-filter cross-filter self-join over the WRITTEN membership (tree
# dims, subtree semantics) ∪ source (from facets_v2), plus the baseline. This is
# the same algebra the builder runs, recomputed independently here from the
# written sibling files — a drifted/stale/corrupt cube FAILS. AI-free.
if treexf:
if not mem:
check("tree_cross_filter present but membership missing", False,
"need --membership (or {tag}_sample_facet_membership.parquet) to validate the cube")
else:
X = f"read_parquet('{treexf}')"
M = f"read_parquet('{mem}')"
# re-derive xf = tree membership ∪ flat source (from facets_v2 = located universe)
xf = (f"SELECT pid, facet_type AS dim, concept_uri AS value FROM {M} "
f"UNION ALL SELECT pid, 'source' AS dim, source AS value FROM {F} "
f"WHERE NULLIF(TRIM(source), '') IS NOT NULL")
ref = (f"WITH xf AS ({xf}), "
f"single AS (SELECT f.dim fdim, f.value fval, t.dim facet_type, t.value facet_value, "
f"COUNT(DISTINCT t.pid) count FROM xf f JOIN xf t ON t.pid=f.pid AND t.dim<>f.dim GROUP BY 1,2,3,4), "
f"base AS (SELECT NULL::VARCHAR fdim, NULL::VARCHAR fval, dim facet_type, value facet_value, "
f"COUNT(DISTINCT pid) count FROM xf GROUP BY dim, value) "
f"SELECT CASE WHEN fdim='source' THEN fval END filter_source, "
f"CASE WHEN fdim='material' THEN fval END filter_material, "
f"CASE WHEN fdim='context' THEN fval END filter_context, "
f"CASE WHEN fdim='object_type' THEN fval END filter_object_type, "
f"facet_type, facet_value, count FROM (SELECT * FROM single UNION ALL SELECT * FROM base)")
filecube = (f"SELECT filter_source, filter_material, filter_context, filter_object_type, "
f"facet_type, facet_value, count FROM {X}")
# GRAIN first: EXCEPT below is SET semantics, so a duplicated cube would
# pass the symmetric diff. One row per (all filter cols, facet_type,
# facet_value) is the contract the explorer relies on. (Codex P3.)
xdup = scalar(f"""SELECT COUNT(*) FROM (
SELECT filter_source, filter_material, filter_context, filter_object_type,
facet_type, facet_value
FROM {X} GROUP BY 1,2,3,4,5,6 HAVING COUNT(*) > 1)""")
check("tree_cross_filter grain unique", xdup == 0, f"{xdup} duplicated cube keys")
mm = scalar(f"SELECT (SELECT COUNT(*) FROM (({ref}) EXCEPT ({filecube}))) "
f"+ (SELECT COUNT(*) FROM (({filecube}) EXCEPT ({ref})))")
check("tree_cross_filter == re-derived self-join (symmetric)", mm == 0,
f"{mm} rows disagree (drifted/stale/corrupt cube)")
# baseline (all filter_* NULL) tree rows == facet_tree_summaries counts
if tree:
T3 = f"read_parquet('{tree}')"
bmm = scalar(f"""
WITH cb AS (SELECT facet_type, facet_value, count FROM {X}
WHERE filter_source IS NULL AND filter_material IS NULL
AND filter_context IS NULL AND filter_object_type IS NULL
AND facet_type <> 'source'),
ts AS (SELECT facet_type, concept_uri AS facet_value, count FROM {T3})
SELECT (SELECT COUNT(*) FROM (SELECT * FROM cb EXCEPT SELECT * FROM ts))
+ (SELECT COUNT(*) FROM (SELECT * FROM ts EXCEPT SELECT * FROM cb))""")
check("tree_cross_filter baseline == tree_summaries", bmm == 0,
f"{bmm} baseline tree rows disagree with facet_tree_summaries")

print(f"\n{'CHECK':<44} {'RESULT':<6} DETAIL\n" + "-" * 90)
ok = True
for name, passed, detail in R:
Expand Down
Loading
Loading