From 99a0335f92c2f6d901b1aea460bfbcd962b6af8a Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 08:24:26 -0700 Subject: [PATCH 1/6] #208 PR4b: shared settled-camera tail via reconcileSettledCamera() (#14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the shared settled-camera reconciliation both globe listeners run once the camera comes to rest — the cluster "Samples in View" stat refresh + the URL-hash write (writeGlobeHash) — into reconcileSettledCamera(v), and call it from BOTH camera.changed and moveEnd (#208 smell 1b). This gives moveEnd the same cluster-stat refresh camera.changed already did, closing the sub-10%-pan gap: a small cluster-mode drag fired moveEnd (which updated the URL via #204) but NOT camera.changed (debounced away by percentageChanged=0.1), leaving the "Samples in View" count stale. Scope is deliberately minimal (Codex Q3 / REFACTOR_PR4_PLAN.md §3): NOT the full handler merge. Mode-transition + resolution-reload stays camera.changed- only; facet/heatmap/point-exit stays moveEnd-only; #262 stays a separate tracked sibling. reconcileSettledCamera is a local fn (closes over getMode/currentRes/countInViewport), not top-level like writeGlobeHash. Behavior: - camera.changed: behavior-neutral (same order — cluster-stat then hash). - moveEnd: adds the cluster-stat refresh (point mode skips it via the getMode()==='cluster' guard, so point mode is unchanged). The stat read is synchronous, guarded by _clusterData, and writes no mode/selection/URL/ facet/heatmap state. Verification: smoke 4 + characterization 7 + url-roundtrip 5 all green (behavior-neutral URL contract from both handlers preserved); render clean; Codex review of the diff found no blocking issues. 
A dedicated headless regression for the moveEnd cluster-stat refresh proved unreliable (OJS cell re-evaluation yields multiple viewer instances in the harness; the one reachable at interaction time often has no camera listeners) — documented inline in url-roundtrip.spec.js rather than shipped flaky. Co-authored-by: Claude Opus 4.8 (1M context) --- explorer.qmd | 57 +++++++++++++++++++------- tests/playwright/url-roundtrip.spec.js | 15 +++++++ 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 73c8c0ee..4c4d9bae 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -3679,6 +3679,37 @@ zoomWatcher = { document.getElementById('contextFilterBody').addEventListener('change', handleFacetFilterChange); document.getElementById('objectTypeFilterBody').addEventListener('change', handleFacetFilterChange); + // --- Shared settled-camera tail (#208 smell 1b) --- + // The single reconciliation entry point both settled-camera listeners run + // once the camera has come to rest: refresh the cluster-mode "Samples in + // View" stat for the current viewport, then write the URL hash. Extracted + // so `moveEnd` runs the SAME cluster-stat refresh that `camera.changed` + // does — closing the sub-10%-pan gap where a small drag in cluster mode + // updated the URL (moveEnd fires) but left the "Samples in View" count + // stale (camera.changed's `percentageChanged=0.1` debounce didn't fire). + // + // Scope is deliberately minimal (Codex Q3, REFACTOR_PR4_PLAN.md §3): this + // touches NEITHER the mode-transition / resolution-reload logic (which + // stays camera.changed-only) NOR the facet/heatmap/point-exit logic (which + // stays moveEnd-only). The cluster-stat read is synchronous and cheap + // (counts already-loaded `_clusterData` against the padded viewport) and + // no-ops unless we're in cluster mode with data loaded. 
+ function reconcileSettledCamera(v) { + // Cluster-mode viewport count only — point mode shows its own count via + // loadViewportSamples(). Padded bbox so the cluster "Samples in View" + // stat matches the samples table row total (issue #221 round 2). + if (getMode() === 'cluster' && v._clusterData) { + const inView = countInViewport(paddedViewportBounds(VIEWPORT_PAD_FACTOR)); + const total = v._clusterTotal; + if (total) { + updateStats(`H3 Res${currentRes}`, `${inView.clusters.toLocaleString()} / ${total.clusters.toLocaleString()}`, inView.samples.toLocaleString(), null, 'Clusters in View / Loaded', 'Samples in View'); + } + } + // _suppressHashWrite-gated default — the hashchange-flight path stays + // unaffected (same gate the two raw writers honored before). + writeGlobeHash(v); + } + // --- Camera change handler --- let timer = null; viewer.camera.changed.addEventListener(() => { @@ -3756,20 +3787,11 @@ zoomWatcher = { } } - // Update viewport cluster count (cluster mode only; point mode - // already shows viewport count). Padded bbox so the cluster - // "Samples in View" stat matches the samples table row total - // (issue #221 round 2). - if (getMode() === 'cluster' && viewer._clusterData) { - const inView = countInViewport(paddedViewportBounds(VIEWPORT_PAD_FACTOR)); - const total = viewer._clusterTotal; - if (total) { - updateStats(`H3 Res${currentRes}`, `${inView.clusters.toLocaleString()} / ${total.clusters.toLocaleString()}`, inView.samples.toLocaleString(), null, 'Clusters in View / Loaded', 'Samples in View'); - } - } - - // Update URL hash (replaceState for continuous movement) - writeGlobeHash(viewer); + // Settled-camera tail: cluster "Samples in View" stat (cluster + // mode only; point mode already shows viewport count) + URL-hash + // replaceState for continuous movement. Shared with `moveEnd` via + // reconcileSettledCamera() (#208 smell 1b). 
+ reconcileSettledCamera(viewer); }, 600); }); viewer.camera.percentageChanged = 0.1; @@ -3833,7 +3855,12 @@ zoomWatcher = { }); viewer.camera.moveEnd.addEventListener(() => { - writeGlobeHash(viewer); + // Settled-camera tail shared with `camera.changed` (#208 smell 1b): + // URL-hash write + cluster "Samples in View" refresh. moveEnd fires on + // every discrete settle including sub-10% pans that `camera.changed` + // debounces away, so routing through here keeps the cluster stat in + // lockstep with the URL (which moveEnd already kept fresh via #204). + reconcileSettledCamera(viewer); // B1: viewport-aware facet counts. Bouncing through refreshFacetCounts // reuses the existing 250ms debounce + facetCountsReqId stale-guard, // so bursts of moveEnd (drag-pan, wheel-zoom) coalesce into one query diff --git a/tests/playwright/url-roundtrip.spec.js b/tests/playwright/url-roundtrip.spec.js index 55bfdb6e..d7184b36 100644 --- a/tests/playwright/url-roundtrip.spec.js +++ b/tests/playwright/url-roundtrip.spec.js @@ -298,4 +298,19 @@ test.describe('Explorer URL state round-trip (issue #209)', () => { const s = await snapshot(page); expect(s.selectedH3).toBeNull(); }); + + // NOTE (PR4b, #208 smell 1b): a dedicated headless regression for the new + // `moveEnd` cluster "Samples in View" refresh was attempted but proved + // unreliable. The explorer OJS cell re-evaluates repeatedly during headless + // boot, so `_ojs...value('viewer')` returns different `viewer` instances + // across calls — the one reachable at interaction time frequently has zero + // camera listeners attached, so a forced `moveEnd.raiseEvent()` (or `flyTo`) + // never reaches reconcileSettledCamera. 
A flaky test being worse than none, + // PR4b instead rests on: (1) the existing url-roundtrip + characterization + // suite proving behavior-neutrality of the shared URL write from BOTH the + // camera.changed and moveEnd handlers; (2) the change being mechanically + // trivial — `moveEnd` now invokes the IDENTICAL cluster-stat block that + // `camera.changed` already ran (covered by cluster-mode boot in the + // characterization specs); (3) a manual probe confirming a settled cluster + // camera writes " | Samples in View" via the shared tail. }); From d512b15f446b133ed75030689f435cce6abdb9f5 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 08:24:57 -0700 Subject: [PATCH 2/6] #285: open the in-map detail card on a pid deep-link (parity with row-click) (#15) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A `#pid=` deep-link (cold boot OR back/forward hashchange) populated only the sidebar card (updateSampleCard → #clusterSection) but never opened the floating in-map detail card (#inMapCard) a table row-click shows. Same pid, two paths, two UI states (#239-family divergence). Fix: shared openInMapCardForSample(meta, isStale) helper in the zoomWatcher cell, mirroring activateRow's tail — showInMapCard at canvas centre + the identical rich wide-table detail query (material/specimen/thumbnail) → updateSampleDetail + populateInMapCardDetail, with isStale guards. Wired into both the boot pid path and the hashchange pid path. The boot path's old description-only sidebar query is superseded by the helper's richer query. activateRow is in a different OJS cell and is intentionally left untouched. No camera flyTo here (deliberate, documented): both callers already frame the camera to the URL coords (boot setView / hashchange flyTo), and a #pid= link settles on the sample view (activateRow's pushState is replaced by the post-flight moveEnd replaceState). 
Card anchors at canvas centre — the #226-correct anchor that dodges the lazy-load race. Codex review (2 rounds) → no blocking findings. Addressed: - boot path re-checks isStale() after the helper (don't continue stale boot hydration into mode/heatmap); - hideInMapCard() on the hashchange h3 / no-selection / pid-not-found branches so navigating away from a pid doesn't strand the floating card. Tests: new characterization (d3) asserts a pid deep-link opens #inMapCard with the exact known material AND that a hashchange to a bare view re-hides it. Full smoke(4)+url-roundtrip(5)+characterization(8) = 17 green; render clean. Co-authored-by: Claude Opus 4.8 (1M context) --- explorer.qmd | 105 ++++++++++++++++-- .../explorer-characterization.spec.js | 65 ++++++++++- 2 files changed, 153 insertions(+), 17 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 4c4d9bae..153b9970 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -3983,6 +3983,71 @@ zoomWatcher = { } } + // --- Open the in-map detail card for a deep-linked sample (issue #285) --- + // A `#pid=` deep-link (cold boot OR back/forward hashchange) must reach the + // same end state as a table row-click: not just the sidebar card + // (updateSampleCard / #clusterSection) but also the floating in-map detail + // card (#inMapCard). This mirrors activateRow's tail (the rich wide-table + // detail query → updateSampleDetail + populateInMapCardDetail), shared + // between the boot and hashchange pid paths. + // + // We do NOT fly the camera here: both callers have already framed it to the + // URL coordinates (boot setView / the hashchange flyTo above), and a `#pid=` + // link is generated by Share / row-click with the camera captured AT the + // sample (activateRow's pushState is replaced by the post-flight moveEnd + // replaceState, so the history entry settles on the sample view). 
So we + // anchor the card at canvas centre — exactly where the sample lands, the + // same anchor activateRow uses to dodge the #226 lazy-load race (the + // deferred-to-flyTo.complete approach was rejected there). + // + // Deliberate decision (issue #285 acceptance allows it): for a hand-edited + // `#pid=` URL whose lat/lng do NOT point at the sample, the card still + // anchors at canvas centre rather than flying. Adding a "camera far from + // sample → flyTo" branch would fight the hashchange flyTo above and the + // boot setView; it's a tracked follow-up, not folded in here. + async function openInMapCardForSample(meta, isStale) { + const canvas = viewer.scene.canvas; + showInMapCard(meta, { x: canvas.clientWidth / 2, y: canvas.clientHeight / 2 }); + try { + const pidEsc = meta.pid.replace(/'/g, "''"); + const detail = await db.query(` + SELECT + s.description, + s.thumbnail_url, + s.has_feature_of_interest, + mat_lbl.pref_label AS material_label, + obj_lbl.pref_label AS object_type_label + FROM read_parquet('${wide_url}') AS s + LEFT JOIN read_parquet('${wide_url}') AS mat + ON mat.row_id = s.p__has_material_category[1] + AND mat.otype = 'IdentifiedConcept' + LEFT JOIN read_parquet('${vocab_labels_url}') AS mat_lbl + ON mat_lbl.uri = mat.pid + LEFT JOIN read_parquet('${wide_url}') AS obj + ON obj.row_id = s.p__has_sample_object_type[1] + AND obj.otype = 'IdentifiedConcept' + LEFT JOIN read_parquet('${vocab_labels_url}') AS obj_lbl + ON obj_lbl.uri = obj.pid + WHERE s.pid = '${pidEsc}' + AND s.otype = 'MaterialSampleRecord' + LIMIT 1 + `); + if (isStale()) return; + if (detail && detail.length > 0) { + updateSampleDetail(detail[0]); + populateInMapCardDetail(detail[0]); + } else { + updateSampleDetail({ description: '' }); + populateInMapCardDetail(null); + } + } catch(err) { + if (isStale()) return; + console.error('Deep-link in-map detail query failed:', err); + updateSampleDetail(null); + populateInMapCardDetail(null); + } + } + // --- Handle browser 
back/forward --- window.addEventListener('hashchange', async () => { // Bump the selection generation BEFORE any early-return so even @@ -4038,11 +4103,19 @@ zoomWatcher = { if (isStale()) return; if (sample && sample.length > 0) { const s = sample[0]; - updateSampleCard({ + const meta = { pid: s.pid, label: s.label, source: s.source, lat: s.latitude, lng: s.longitude, place_name: s.place_name, result_time: s.result_time - }); + }; + updateSampleCard(meta); + // #285: also open the floating in-map card (parity with + // row-click). Camera already framed by the flyTo above. + await openInMapCardForSample(meta, isStale); + } else { + // pid no longer resolves — clear any floating card left + // over from a prior pid selection (#285). + hideInMapCard(); } } catch(err) { console.error("Hash pid query failed:", err); @@ -4050,6 +4123,9 @@ zoomWatcher = { } else if (state.h3) { viewer._globeState.selectedPid = null; viewer._globeState.selectedH3 = state.h3.toLowerCase(); + // #285: navigating to a cluster selection clears the sample-only + // floating in-map card (mirrors the map cluster-click path). + hideInMapCard(); const meta = await fetchClusterByH3(state.h3); if (isStale()) return; if (meta) { @@ -4071,6 +4147,8 @@ zoomWatcher = { viewer._globeState.selectedPid = null; viewer._globeState.selectedH3 = null; updateClusterCard(null); + // #285: no selection in the new hash — clear the floating card too. 
+ hideInMapCard(); const sampEl = document.getElementById('samplesSection'); if (sampEl) sampEl.innerHTML = ''; } @@ -5179,19 +5257,24 @@ zoomWatcher = { if (isStale()) return "active"; if (sample && sample.length > 0) { const s = sample[0]; - updateSampleCard({ + const meta = { pid: s.pid, label: s.label, source: s.source, lat: s.latitude, lng: s.longitude, place_name: s.place_name, result_time: s.result_time - }); - const detail = await db.query(` - SELECT description FROM read_parquet('${wide_url}') - WHERE pid = '${ih.pid.replace(/'/g, "''")}' - LIMIT 1 - `); + }; + updateSampleCard(meta); + // #285: open the floating in-map card too (parity with + // row-click), not just the sidebar. The shared helper runs + // the richer detail query (material/specimen/thumbnail) that + // supersedes the old description-only sidebar query, and + // anchors the card at canvas centre where the URL-hydrated + // camera frames the sample. + await openInMapCardForSample(meta, isStale); + // The helper awaits a remote query; if a hashchange + // superseded this boot lookup meanwhile, abort the rest of + // the boot deep-link hydration (mode/heatmap below) just as + // the old description-only query did. 
if (isStale()) return "active"; - if (detail && detail.length > 0) updateSampleDetail(detail[0]); - else updateSampleDetail({ description: '' }); } } catch(err) { console.error("Deep-link pid query failed:", err); diff --git a/tests/playwright/explorer-characterization.spec.js b/tests/playwright/explorer-characterization.spec.js index 47813dfc..62c67eee 100644 --- a/tests/playwright/explorer-characterization.spec.js +++ b/tests/playwright/explorer-characterization.spec.js @@ -22,9 +22,10 @@ * (c) heatmap -> see heatmap-overlay.spec.js (comment only, no test) * (d1) ?search= URL -> __searchFilter restored AND #tableMeta shows the match summary * (d2) &pid= URL -> selectedPid restored AND #clusterSection shows the sample card - * (NB: deep-link does NOT open #inMapCard — row-click-only; gap filed) + * (d3) &pid= URL -> ALSO opens #inMapCard with exact material (#285 fix), and a + * hashchange away from the pid hides the floating card again * (e) facet hydration -> >=3 source counts, material URIs, no stuck .recomputing - * (f) detail card -> known sample -> #inMapCard visible AND exact material value + * (f) detail card -> known sample row-click -> #inMapCard visible AND exact material value */ const { test, expect } = require('@playwright/test'); const { explorerUrl } = require('./helpers/url'); @@ -179,10 +180,8 @@ test.describe('Explorer characterization tests [data]', () => { // ========================================================================= // (d2) deep-link &pid= -> restores selectedPid AND renders the sample card - // into #clusterSection (what the boot/hashchange pid path actually does: - // updateSampleCard, NOT showInMapCard). The fact that a pid deep-link - // does NOT open #inMapCard (whereas a row-click does) is a real UX - // inconsistency tracked as a follow-up (#239-family), NOT asserted here. + // into #clusterSection (the sidebar half of the pid path). The in-map + // card (#inMapCard) half is asserted separately by (d3) below (#285). 
// ========================================================================= test('(d2) [data] &pid= deep-link restores selectedPid and renders #clusterSection card', async ({ browser }) => { // Phase 1: click a row, capture pid + label + the resulting pid URL. @@ -275,4 +274,58 @@ test.describe('Explorer characterization tests [data]', () => { { timeout: 90000, intervals: [500, 1000, 2000] } ).toBe('Other anthropogenic material'); }); + + // ========================================================================= + // (d3) #285: a pid deep-link must open the #inMapCard too, not only the + // sidebar #clusterSection -- same end state as a row-click (f). Phase 1 + // row-clicks a KNOWN sample to capture a real pid URL; phase 2 loads it + // fresh and asserts the boot pid path opens the in-map card with the + // same exact material (f) pins for the row-click path. Fails on the + // pre-#285 code, where the deep-link populated only the sidebar. + // ========================================================================= + test('(d3) [data] &pid= deep-link opens #inMapCard with exact material (#285)', async ({ browser }) => { + const KNOWN_PID = 'ark:/28722/k2p55x96j'; + const ctx1 = await browser.newContext(); + let capturedUrl = null; + try { + const page1 = await ctx1.newPage(); + await page1.goto(explorerUrl('?search=Object+5404-8' + WORLD), { waitUntil: 'domcontentloaded', timeout: 60000 }); + await page1.waitForSelector('#cesiumContainer', { timeout: 30000 }); + await waitForSearchReady(page1, 90000); + const row = page1.locator(`.samples-table tbody tr[data-pid="${KNOWN_PID}"]`); + await expect(row).toBeVisible({ timeout: 120000 }); + await row.locator('td').first().click(); + await expect.poll(async () => await page1.evaluate(() => location.href), { timeout: 30000, intervals: [250, 500, 1000] }).toContain('pid='); + capturedUrl = await page1.evaluate(() => location.href); + } finally { await ctx1.close(); } + + const ctx2 = await browser.newContext(); + try 
{ + const page2 = await ctx2.newPage(); + await page2.goto(capturedUrl, { waitUntil: 'domcontentloaded', timeout: 60000 }); + await page2.waitForSelector('#cesiumContainer', { timeout: 30000 }); + await waitForBootReady(page2); + // Boot pid path restored the selection... + await expect.poll(async () => await getSelectedPid(page2), { timeout: 90000, intervals: [500, 1000, 2000] }).toBe(KNOWN_PID); + // ...and (the #285 fix) opened the floating in-map card, not just the sidebar. + await expect.poll( + async () => await page2.locator('#inMapCard').getAttribute('hidden'), + { timeout: 120000, intervals: [500, 1000, 2000] } + ).toBeNull(); + // The shared detail query populated the known material (parity with (f)). + await expect.poll( + async () => (await page2.locator('#imcMaterial').textContent() || '').trim(), + { timeout: 90000, intervals: [500, 1000, 2000] } + ).toBe('Other anthropogenic material'); + + // #285 finding 3: a hashchange away from the pid (to a bare view with no + // pid/h3) must HIDE the floating card again — otherwise it strands over + // the map. Drive a same-document hashchange and assert it re-hides. + await page2.evaluate(() => { location.hash = '#v=1&lat=20&lng=0&alt=10000000'; }); + await expect.poll( + async () => await page2.locator('#inMapCard').getAttribute('hidden'), + { timeout: 30000, intervals: [250, 500, 1000] } + ).not.toBeNull(); + } finally { await ctx2.close(); } + }); }); From e29f47a39496e832d11473448dfb6e722e299216 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 11:25:28 -0700 Subject: [PATCH 3/6] #281/#282 Half(a): facet hierarchy data pipeline (tree + membership + counts) (#16) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * plan: facet hierarchy (#281/#282/#276) — design + proven PoC Implementation plan for the tree facet display, grounded in the actual 202608 data + pipeline (not the "ancestry is free" folklore) and Codex-reviewed. 
Key grounding findings: - The facet UI is entirely flat; sample_facets_v3 stores a single "first non-root" URI per dim; the wide arrays are a SET of asserted concepts (NOT a clean ancestry path) — full ancestry must be COMPUTED from SKOS broader. - The canonical tree is derivable from the SKOS TTLs build_vocab_labels.py already fetches (but drops broader from its output). Trees are small/shallow (≤21 core concepts, depth 3). scripts/poc_facet_hierarchy.py proves Half (a) on live 202608 data (material): membership 15.08M rows over 5.83M located samples; parent>=child PASS; root==located-with-material PASS; non-additive confirmed. Codex corrections folded in: distinct-pid-UNION counting (not additive), located-universe (samp_geo) membership, data-form URI normalization (TTLs are un-versioned, data is /1.0/), DAG/multi-parent handling, extract a selected-facet state model + sql-builders.js helpers, closure-table option, material-first. Two halves: (a) data/pipeline is independent of the #249 refactor and can start now; (b) tree UI rides on the PR4a/PR4b/#285 merges. No production code changed. Co-Authored-By: Claude Opus 4.8 (1M context) * #281/#282 Half(a): facet hierarchy pipeline (tree + membership + counts) Backend half of the facet-hierarchy feature — no explorer.qmd/UI changes (Half b rides on the #249 refactor). Derives the SKOS concept tree, per-sample membership over the ancestry, and hierarchical counts, all validated by algebra. build_vocab_labels.py - Emit `broader` (canonical primary parent) + `broader_count` per concept, in BOTH vocab-form and data-form (/1.0/) rows (reusing _data_form_uris) so uri↔broader join within each uri_form. Surface multi-parent (DAG) count as a lossy-projection note. build_frontend_derived.py - concept_tree (uri, parent_uri, depth) + concept_closure (recursive) from vocab_labels' data-form broader edges. 
- node_dim: assign each concept to the dim whose canonical root it reaches — drop ONLY the explicit per-dim root (not every parentless concept), and keep exactly one root per dim. - sample_facet_membership(pid, facet_type, concept_uri, depth): located universe (samp_geo), full wide arrays expanded to ancestors, restricted to each dim's tree. Concepts with no path to their dim root are EXCLUDED + reported (flat facet_summaries still counts them). - facet_tree_summaries(facet_type, concept_uri, parent_uri, depth, count): COUNT(DISTINCT pid) per node — distinct-pid UNION, NOT additive. - --vocab-labels arg; fail-loud when a hierarchy artifact is requested without it; deterministic ORDER BY tie-breakers. validate_frontend_derived.py - Tree gate: parent>=child, every parent resolves, one root per dim, all 3 dims present, cross-file algebra (material root == facets_v2 non-root material), membership grain unique, symmetric tree==GROUP BY(membership). Verified on live 202608: 209-node tree, 38.9M membership rows; material root 5,829,436 == facets_v2 non-root material; all validator checks PASS; 53 unit tests green. PoC (scripts/poc_facet_hierarchy.py) + design (FACET_HIERARCHY_PLAN.md) included. Codex-reviewed (2 rounds); HIGH root/orphan findings fixed. Deferred (documented): per-dim DAG paths (we keep one canonical parent), a SQL-literal helper for path interpolation, and materializing the wide-array projection once (3× reread) — all follow-ups, not blockers. Co-Authored-By: Claude Opus 4.8 (1M context) * #281/#282 Half(a): close Codex r2 residuals (closure cycle guard + validator note) - concept_closure recursive CTE: cap distance < 64 so a future broader-cycle in the vocab can't recurse forever (today's projection is acyclic; this is a guard, not a behavior change — output byte-identical, live max depth is 3). 
- document the material-root cross-file check's current-data invariant (excluded material = 0): if a future vintage adds a material concept absent from the SKOS tree, the check correctly fails — revisit then. Re-verified: build + validator ALL CHECKS PASS; 14 unit tests green. Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- FACET_HIERARCHY_PLAN.md | 252 +++++++++++++++++++++++++++ scripts/build_frontend_derived.py | 147 +++++++++++++++- scripts/build_vocab_labels.py | 30 ++++ scripts/poc_facet_hierarchy.py | 133 ++++++++++++++ scripts/validate_frontend_derived.py | 59 +++++++ 5 files changed, 620 insertions(+), 1 deletion(-) create mode 100644 FACET_HIERARCHY_PLAN.md create mode 100644 scripts/poc_facet_hierarchy.py diff --git a/FACET_HIERARCHY_PLAN.md b/FACET_HIERARCHY_PLAN.md new file mode 100644 index 00000000..525152d3 --- /dev/null +++ b/FACET_HIERARCHY_PLAN.md @@ -0,0 +1,252 @@ +# Facet hierarchy plan — tree display of facet values (#281 + #282) + +Draft for Codex + RY review (2026-06-17). Issues: +- **#281** (ekansa): show facet hierarchy in a tree, first two levels unfolded, click to open deeper. +- **#282** (akthom): nested + alphabetical across all three concept vocabularies. +- **#276**: the counting-semantics fork this forces — single "first non-root" value vs **membership** ("anywhere in the path"). + +Treated as **one design**. This is the biggest explorer UI change since launch; it +rides on the #249 test net (PR1/PR2 merged; PR3 deployed; PR4a/PR4b/#285 in flight). + +--- + +## 0. What I verified first (grounding — don't trust the folklore) + +The Slack/issue framing was "the data already carries each sample's full vocab +ancestry, so the tree is in hand." **That is only half true** — verified against +code + the live 202608 data, not assumed: + +1. 
**The facet UI is entirely flat today.** `explorer.qmd` `facetFilters` cell + (L1783–1873) renders material/context/object_type as a flat checkbox list via + `renderFilter()` (L1843), sorted by `count DESC` from `facet_summaries`. Source + is hard-coded flat HTML (L659–690). No parent/child/tree/indent anywhere. + `vocab_labels` is loaded only as `uri → pref_label` (no `broader`). + +2. **`sample_facets_v3` is one row per `pid` with a SINGLE already-flattened URI + per dimension.** `build_frontend_derived.py` picks the **first non-root** + material concept per sample (`arg_min(ic.uri, ord)` over the wide array, + excluding `MATERIAL_ROOT`, L86–96); context/object_type take array element + `[1]` and their root-dropping is still deferred (L29–30, L114–115). + +3. **The wide arrays are a SET of asserted concepts, NOT a clean ancestry path.** + Real 202608 data — one sample's `p__has_material_category` resolves (in array + order) to: `mineral`, `material` (root), `rock`. The canonical SKOS tree is + `mineral → earthmaterial → material` and `rock → rockorsediment → earthmaterial + → material`. So the array `{mineral, material, rock}` is **neither** a path + **nor** a transitive closure (it's missing `earthmaterial` and `rockorsediment`, + and spans two branches). Array-length distribution: len 1 = 6,233,867 samples, + len 2 = 1,601, len 3 = 491,424. + + **Consequence:** full transitive ancestry must be **computed at build time** by + walking `skos:broader` from each asserted concept to the root. We cannot read + it straight off the array. + +4. **The canonical tree IS available** — `build_vocab_labels.py` already fetches + ~12 SKOS TTLs (core material/sampledfeature/objecttype + OpenContext, earthenv, + biology extensions) but **drops** `skos:broader` from its output. 
The three + core vocabularies are small and shallow: + + | vocab | concepts (core) | root | max depth | level-1 children | + |---|---|---|---|---| + | material | 21 | `material` | 3 | 7 | + | sampledfeature | 20 | `anysampledfeature` | 3 | 4 | + | objecttype | 20 | `materialsample` | 3 | 5 | + + Extension TTLs add more nodes under these roots, but the structure stays + shallow. "First two levels unfolded" reveals most of each tree. + +**Net:** the tree is *derivable* but not free. Two build-time additions are +needed — (a) emit the `broader` edges, (b) compute per-sample membership over the +ancestry. The UI then renders the tree and counts/filters via membership. + +--- + +## 1. Two halves, two dependency profiles + +| Half | What | Depends on #249 refactor? | +|---|---|---| +| **(a) Data/pipeline** | tree edges + membership-count derived files | **No** — pure backend; can start now | +| **(b) Tree UI** | tree rendering + membership filtering in `explorer.qmd` | **Yes** — touches the facet cells PR4b/#285 also touch | + +So we break ground on **(a) immediately**; **(b)** lands after PR4a/PR4b/#285 merge. + +--- + +## 2. Half (a) — data/pipeline (start now) + +### 2.1 Emit the tree (`build_vocab_labels.py`) +Add a `broader` column (parent URI, nullable for roots) to `vocab_labels.parquet`, +read from `skos:broader` across **all** the TTLs already parsed. Optionally also a +`depth` (int, root = 0) and `scheme` (already present). Validation: every non-root +concept resolves to a root via `broader`; no cycles; every `facet_value` that +appears in the data has a tree node (flag orphans — these are the #161/#148-style +label-gap cousins). 
+ +### 2.2 Compute membership (`build_frontend_derived.py`) +New derived table, one row per (sample, concept-in-its-ancestry): +``` +sample_facet_membership(pid, facet_type, concept_uri, depth) +``` +For each sample and each dimension: take the asserted concept(s) from the wide +array, drop the bare root, and for each, walk `broader` to the root emitting a row +per ancestor (dedup per pid×concept). A sample tagged `{mineral, rock}` → membership +`{mineral, earthmaterial, rockorsediment, rock, material?}` (root-inclusion = open +question Q2). This is **membership semantics** (#276 "anywhere in the path"). + +Then hierarchical counts come from a GROUP BY on the membership table: +``` +facet_tree_summaries(facet_type, concept_uri, parent_uri, depth, label, count) + where count = COUNT(DISTINCT pid) with that concept in its membership set +``` +**Counting invariant (Codex-corrected):** counts are a **distinct-pid UNION**, NOT +additive. `parent_count = COUNT(DISTINCT pid)` over (direct ∪ all descendants); the +only guaranteed relation is `parent_count >= every child_count`. Do **not** assert +`parent = direct + Σ children` — a `{mineral, rock}` sample lands under multiple +sibling branches, so children overlap. (Verified both directions on real data — see §7.) + +**Universe must match the explorer (Codex):** build membership from the **located** +sample set (`samp_geo` — `MaterialSampleRecord` with geometry), the same universe the +map/table/existing facet_summaries use, or hierarchy counts will drift from every +other surface. + +**URI form is first-class (Codex):** the SKOS TTLs use *un-versioned* URIs +(`.../material/anthropogenicmetal`) while the data uses *versioned* ones +(`.../material/1.0/rock`). `broader` edges MUST be emitted in the **data form** — +reuse the alias/version normalization `build_vocab_labels.py` already applies +(its data-form aliases exist for exactly this reason). 
Verified: a naive join is +0% match; with version-segment normalization it's correct (§7). + +**SKOS is a DAG, not a tree (Codex):** 29 concepts across the loaded vocabularies +have multiple `skos:broader` parents. The validator must either pick a canonical +parent deterministically or define multi-parent rendering before UI work; don't +silently assume a single-parent tree. + +### 2.3 Sizing / perf +Membership ≈ samples × avg ancestry depth. With depth ≤ 3–4 and most samples at +len 1, expect ~15–25M rows over 6.7M samples (vs `facets_v3` 6.7M rows). Store +sorted by `(facet_type, pid)` for the bbox-JOIN path; ZSTD. **Validate query +latency** (the explorer reads these over HTTP range requests) before committing to +the shape — mirror `profile_queries.py`. If the per-pid membership table is too +heavy for the viewport-scoped count JOIN, fall back to a precomputed +`facet_tree_cross_filter` cube (like today's `facet_cross_filter`). + +### 2.4 Outputs, validator, publish +- New/changed files (versioned, **non-cutover** — publish alongside, don't + overwrite prod): `…_facet_tree_summaries.parquet`, `…_sample_facet_membership.parquet`, + `vocab_labels_*.parquet` (+`broader`). +- Extend `validate_frontend_derived.py`: tree integrity (single root per scheme, + no cycles, parent counts ≥ every child count — NOT = direct+Σchildren, per the §2.2 counting invariant), membership grain + (no dup pid×concept), label coverage. +- Ship as a **data-pipeline-only PR** (no `explorer.qmd`). Prove counts in a notebook + / query log committed under the existing `*_SUMMARY` convention. + +--- + +## 3. Half (b) — tree UI (after #249 PRs merge) + +### 3.1 Rendering +Replace flat `renderFilter()` for material/context/object_type with a tree builder +fed by `facet_tree_summaries` (parent_uri + depth + label + count). Source stays +flat (no vocab tree). Behavior per #281/#282: +- Render the tree **nested + alphabetical** within each level (#282).
+- **First two levels unfolded** (root's children + grandchildren); deeper nodes + collapsed behind a disclosure control; click to expand (#281). +- Each node: checkbox + label + `(membership count)` in the existing + `.facet-count[data-facet][data-value]` span shape so the count-update plumbing + (`applyFacetCounts`, `.recomputing`) is reused unchanged. + +### 3.2 Counts +`updateCrossFilteredCounts` reads membership counts instead of single-value counts. +Node count = its membership count under the current viewport + cross-filter + search. +Parent = COUNT(DISTINCT pid) over (direct ∪ descendants) — a union, not an additive sum (§2.2) — and falls out of the membership GROUP BY. + +### 3.3 Filtering semantics (the coherence contract) +Selecting a node filters to its **entire subtree** (membership): `facetFilterSQL` +changes from `material IN (…)` on `facets_v3` to `pid IN (SELECT pid FROM membership +WHERE concept_uri IN (selected nodes + their descendants))`. **Counts and the table +filter must share one expression** (the `FACETS_DESCRIPTION_EXPR` discipline from +the 202608 work / the #245 "facet == table" invariant), or legend and table drift. +- Parent/child checkbox interaction: selecting a parent selects its subtree + (tri-state indeterminate for partial — **Q3**). + +### 3.4 Interactions to respect +- `?material=` URL param (and friends) must accept tree nodes and round-trip + (cf. facet-viewport `coherence` test). +- The #267 "active facet forces point mode" rule and the B1 viewport-scoped count + path (moveStart `.recomputing` → moveEnd `refreshFacetCounts`) stay intact. +- Rides on the refactored facet code from #249 PR3 (`sql-builders.js`) — extend + there, with `node --test` units for the new tree/membership SQL builders. + +--- + +## 4. Sequencing & gate +1. **Now:** Half (a) — tree edges + membership + `facet_tree_summaries`, validator, + latency probe, notebook proof. Data-pipeline PR; publish versioned files. +2.
**After PR4a/PR4b/#285 merge:** Half (b) — tree UI + membership filtering, behind + the smoke + characterization + new tree-specific Playwright specs (assert + parent count ≥ every child count — distinct-pid union, NOT direct+Σchildren (§2.2); selecting a parent filters the subtree; 2-levels + unfolded; legend == table under a subtree selection). Codex per step. +3. Keep single-value `facets_v3` until (b) ships, then decide deprecate vs keep + (Q1). + +--- + +## 5. Open questions (for Codex + RY) +- **Q1 — migrate or coexist?** Does hierarchy fully replace the single-value + `facets_v3` columns (filtering + counts move to membership), or do both ship? + (Hierarchy needs membership for both; single-value can't express subtree filter.) +- **Q2 — root inclusion.** Does membership include the bare root (`material`, + `anysampledfeature`, `materialsample`)? It's the "All" node; probably render it + as the tree root label, not a selectable facet — confirm. +- **Q3 — parent selection UX.** Selecting a parent = select whole subtree, with a + tri-state indeterminate when only some children are checked? Or parent is a pure + filter (subtree) with no child checkboxes shown until expanded? +- **Q4 — multi-asserted leaves.** A sample tagged `{mineral, rock}` contributes to + both branches' ancestries (union). Confirm that's the intended "membership" + reading (vs picking one). #276 leans union. +- **Q5 — which dims first.** material is the #282 priority; sampledfeature + + objecttype follow the same machinery; source never gets a tree. Ship all three + trees at once or material-first? +- **Q6 — count perf.** Is the per-pid membership JOIN viable for viewport-scoped + counts, or do we precompute a `facet_tree_cross_filter` cube? Decide from the + 2.3 latency probe. + +--- + +## 6.
First concrete step — DONE (proof-of-concept) +`scripts/poc_facet_hierarchy.py` builds the proof against the local 202608 wide + +the SKOS TTLs: merges `broader` edges (per-file parse — a combined-graph parse +silently drops material's edges), normalizes URIs to data form, computes the +ancestor closure + membership, and checks the invariants. Run: +`python scripts/poc_facet_hierarchy.py --wide <202608_wide.parquet> --ttls <ttl_dir>`. + +## 7. PoC results (proven, material dimension, 202608) +- located samples = **6,026,242**; located-with-material = **5,829,436**; + membership = **15,076,893** rows. +- **INVARIANT A — parent ≥ child: PASS** (counts monotonic down the tree). +- **INVARIANT B — root == located-with-material: PASS** (5,829,436 = 5,829,436; + every located sample with a material concept reaches the root). +- **INVARIANT C — non-additive: confirmed** (earthmaterial distinct = 4,091,133 ≠ + Σ children = 2,028,538). Additive summing is wrong in both directions. +- Sane tree: material 5.83M → earthmaterial 4.09M → rockorsediment 852K → rock 794K; + mineral 303K; organicmaterial 1.02M. +- **Gotchas surfaced & fixed:** (1) URI version-form mismatch (0% join → fixed by + normalization); (2) rdflib drops material edges when many TTLs share one graph + (parse per-file, merge dicts); (3) 29 multi-parent (DAG) concepts exist. + +**Next:** wire this into `build_vocab_labels.py` (emit `broader`, data-form) + +`build_frontend_derived.py` (closure + membership + `facet_tree_summaries`, built +from `samp_geo`) + `validate_frontend_derived.py` (tree integrity, parent≥child, +DAG policy), behind a latency probe. Ship as the data-only PR (Half a). Then Half b. + +## 8. Codex review — accepted corrections (2026-06-17, gpt-5.5) +Verdict: "directionally sound."
Accepted: (1) distinct-pid-union invariant, not +additive [§2.2]; (2) build membership from the located universe [§2.2]; (3) URI-form +normalization is first-class [§2.2]; (4) DAG/multi-parent handling [§2.2]; (5) extract +a selected-facet **state model** (URL/checkbox/filter-SQL/cross-filter currently all +read the DOM directly) rather than just nested HTML [§3]; (6) put membership SQL in +`assets/js/sql-builders.js` with `node --test` units [§3]; (7) consider a +**closure table** (`concept_closure(ancestor, descendant, distance)` + asserted +projection) over a materialized membership file — benchmark both; (8) **ship +material-first** behind shared machinery [§5 Q5]; (9) parent UX = store the parent +URI + derive descendants, don't explode into the URL, tri-state indeterminate [§5 Q3]. diff --git a/scripts/build_frontend_derived.py b/scripts/build_frontend_derived.py index dd29addd..0394a752 100755 --- a/scripts/build_frontend_derived.py +++ b/scripts/build_frontend_derived.py @@ -44,9 +44,26 @@ MATERIAL_ROOT = "https://w3id.org/isample/vocabulary/material/1.0/material" FACET_DIMS = ["source", "material", "context", "object_type"] +# Hierarchical dims (#281/#282): wide array column + the dim's canonical SKOS +# root. The array carries each sample's SET of asserted IdentifiedConcept +# row_ids (general↔specific, no guaranteed order — FACET_HIERARCHY_PLAN.md §0). +# The root is dropped from "asserted" (re-added via closure) and is the single +# tree root per dim. source has no vocab tree. (Codex: drop ONLY these explicit +# roots, not every parentless concept — deprecated/parentless concepts must stay.) 
+DIM_ARRAY_COL = { + "material": "p__has_material_category", + "context": "p__has_context_category", + "object_type": "p__has_sample_object_type", +} +DIM_ROOT = { + "material": MATERIAL_ROOT, + "context": "https://w3id.org/isample/vocabulary/sampledfeature/1.0/anysampledfeature", + "object_type": "https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/materialsample", +} # the artifacts this script knows how to build (for --only/--skip validation) ARTIFACTS = ["sample_facets_v2", "samples_map_lite", "h3_summaries", - "facet_summaries", "facet_cross_filter", "wide_h3"] + "facet_summaries", "facet_cross_filter", "wide_h3", + "sample_facet_membership", "facet_tree_summaries"] # Shared SQL expression for sample_facets_v2.description (#277 part 2). # Appends space-joined concept labels (IC labels across all 4 concept dims) @@ -299,6 +316,117 @@ def build_wide_h3(con, wide, out): ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""") +def build_concept_membership(con, wide, vocab_labels, t0): + """Build the hierarchy temp tables (#281/#282/#276) from vocab_labels' data-form + `broader` edges + the wide concept arrays, over the LOCATED universe (samp_geo) + so counts match the map/table. Creates: concept_tree, concept_closure, roots, + asserted, membership. See FACET_HIERARCHY_PLAN.md §2.""" + # concept_tree: data-form (uri, canonical primary parent, depth-from-root). + # vocab_labels already aliases broader into data form, so this joins to the + # data-form concept URIs the wide arrays resolve to. 
+ con.execute(f""" + CREATE OR REPLACE TEMP TABLE vl_edges AS + SELECT DISTINCT uri, broader AS parent_uri + FROM read_parquet('{vocab_labels}') WHERE uri_form='data_v1'; + CREATE OR REPLACE TEMP TABLE concept_tree AS + WITH RECURSIVE depths AS ( + SELECT uri, parent_uri, 0 AS depth FROM vl_edges WHERE parent_uri IS NULL + UNION ALL + SELECT e.uri, e.parent_uri, d.depth + 1 + FROM vl_edges e JOIN depths d ON e.parent_uri = d.uri + ) + SELECT uri, ANY_VALUE(parent_uri) AS parent_uri, MIN(depth) AS depth + FROM depths GROUP BY uri; + -- transitive ancestor closure (self at distance 0) + CREATE OR REPLACE TEMP TABLE concept_closure AS + WITH RECURSIVE clo AS ( + SELECT uri AS descendant, uri AS ancestor, 0 AS distance FROM concept_tree + UNION ALL + SELECT c.descendant, t.parent_uri AS ancestor, c.distance + 1 + FROM clo c JOIN concept_tree t ON t.uri = c.ancestor + -- Cycle guard (Codex r2): the SKOS projection is acyclic today, but cap + -- depth so a future bad vocab (a broader cycle) can't recurse forever. + -- 64 >> any real concept depth (live max is 3). + WHERE t.parent_uri IS NOT NULL AND c.distance < 64 + ) + SELECT DISTINCT descendant, ancestor, distance FROM clo; + """) + # node_dim: assign each tree concept to the dim whose canonical root it reaches + # via the closure. This both (a) restricts the hierarchy to each dim's real + # tree and (b) keeps exactly one root per dim — deprecated/parentless concepts + # that don't reach a dim root are NOT treated as roots (Codex HIGH-1). 
+ dim_root_vals = ", ".join(f"('{dim}', '{root}')" for dim, root in DIM_ROOT.items()) + con.execute(f""" + CREATE OR REPLACE TEMP TABLE dim_root(facet_type VARCHAR, root_uri VARCHAR); + INSERT INTO dim_root VALUES {dim_root_vals}; + CREATE OR REPLACE TEMP TABLE node_dim AS + SELECT DISTINCT c.descendant AS uri, r.facet_type + FROM concept_closure c JOIN dim_root r ON r.root_uri = c.ancestor; + """) + # asserted: every located sample's concept(s) per hierarchical dim, from the + # FULL wide array (not the flattened first-non-root value), dropping ONLY that + # dim's explicit root. + union = " UNION ALL ".join( + f"""SELECT DISTINCT sg.pid, '{dim}' AS facet_type, ic.uri AS concept + FROM samp_geo sg + JOIN read_parquet('{wide}') s ON s.pid = sg.pid, + UNNEST(s.{col}) AS u(rid) + JOIN ic ON ic.row_id = u.rid + WHERE ic.uri <> '{DIM_ROOT[dim]}'""" + for dim, col in DIM_ARRAY_COL.items()) + con.execute(f"CREATE OR REPLACE TEMP TABLE asserted AS {union};") + # membership: each asserted concept expanded to its ancestor closure, RESTRICTED + # to ancestors in the SAME dim's canonical tree. Membership semantics (#276): a + # sample counts under every node on the path(s) of every concept it asserts. + # Asserted concepts that don't reach their dim root (label gaps #148/#161, or + # the un-linked specimentype scheme) produce no rows → EXCLUDED from the + # hierarchy (flat facet_summaries still counts them) and reported below. + con.execute(""" + CREATE OR REPLACE TEMP TABLE membership AS + SELECT DISTINCT a.pid, a.facet_type, c.ancestor AS concept_uri + FROM asserted a + JOIN concept_closure c ON c.descendant = a.concept + JOIN node_dim nd ON nd.uri = c.ancestor AND nd.facet_type = a.facet_type; + """) + n_tree = con.sql("SELECT COUNT(*) FROM concept_tree").fetchone()[0] + n_mem = con.sql("SELECT COUNT(*) FROM membership").fetchone()[0] + # Excluded = distinct (dim, concept) asserted that never reach the dim root. 
+ excl = con.sql(""" + SELECT a.facet_type, COUNT(DISTINCT a.concept) AS n + FROM asserted a + WHERE NOT EXISTS ( + SELECT 1 FROM concept_closure c JOIN node_dim nd + ON nd.uri = c.ancestor AND nd.facet_type = a.facet_type + WHERE c.descendant = a.concept) + GROUP BY a.facet_type ORDER BY a.facet_type""").fetchall() + log(f"concept_tree={n_tree:,} membership={n_mem:,}", t0) + if excl: + detail = ", ".join(f"{ft}={n}" for ft, n in excl) + log(f"NOTE: concepts EXCLUDED from hierarchy (no path to dim root; label " + f"gaps #148/#161 / un-linked schemes — flat facet_summaries still counts them): {detail}", t0) + + +def build_sample_facet_membership(con, out): + con.execute(f"""COPY ( + SELECT m.pid, m.facet_type, m.concept_uri, t.depth + FROM membership m JOIN concept_tree t ON t.uri = m.concept_uri + ORDER BY m.facet_type, m.pid, m.concept_uri + ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + + +def build_facet_tree_summaries(con, out): + # Hierarchical counts: COUNT(DISTINCT pid) per node (membership = direct ∪ + # descendants). NOT additive — a sample under two sibling branches counts + # once at each and once at their shared ancestor (FACET_HIERARCHY_PLAN.md §2.2). + con.execute(f"""COPY ( + SELECT m.facet_type, m.concept_uri, t.parent_uri, t.depth, + COUNT(DISTINCT m.pid) AS count + FROM membership m JOIN concept_tree t ON t.uri = m.concept_uri + GROUP BY m.facet_type, m.concept_uri, t.parent_uri, t.depth + ORDER BY m.facet_type, count DESC, m.concept_uri + ) TO '{out}' (FORMAT PARQUET, COMPRESSION ZSTD)""") + + def file_meta(con, path): n = con.sql(f"SELECT COUNT(*) FROM read_parquet('{path}')").fetchone()[0] schema = [(r[0], r[1]) for r in con.sql(f"DESCRIBE SELECT * FROM read_parquet('{path}')").fetchall()] @@ -321,6 +449,9 @@ def main(): ap.add_argument("--tag", required=True, help="output prefix, e.g. 
isamples_202606 (no stale default)") ap.add_argument("--only", default="", help=f"comma list of: {','.join(ARTIFACTS)}") ap.add_argument("--skip", default="", help="comma list of the same names to skip") + ap.add_argument("--vocab-labels", default="", + help="vocab_labels parquet (with `broader`, data_v1 rows) — " + "required for sample_facet_membership / facet_tree_summaries (#281/#282)") ap.add_argument("--no-manifest", action="store_true", help="skip writing {tag}_manifest.json") ap.add_argument("--threads", type=int, default=0, help="DuckDB thread count; set 1 for bit-stable floating centroids (slower)") @@ -358,6 +489,20 @@ def emit(name, fn): log("h3_summary_res{4,6,8} ✓", t0) emit("wide_h3", lambda o: build_wide_h3(con, args.wide, o)) + # Hierarchy artifacts (#281/#282) — need vocab_labels for the SKOS tree. + if want("sample_facet_membership") or want("facet_tree_summaries"): + if not args.vocab_labels: + # Fail loud if the user EXPLICITLY asked for a hierarchy artifact + # (Codex) — silently skipping an explicit --only target is wrong. 
+ explicit = only & {"sample_facet_membership", "facet_tree_summaries"} + if explicit: + sys.exit(f"FATAL: --only {sorted(explicit)} requires --vocab-labels ") + log("SKIP hierarchy artifacts: pass --vocab-labels ", t0) + else: + build_concept_membership(con, args.wide, args.vocab_labels, t0) + emit("sample_facet_membership", lambda o: build_sample_facet_membership(con, o)) + emit("facet_tree_summaries", lambda o: build_facet_tree_summaries(con, o)) + if not args.no_manifest: log("hashing inputs/outputs for manifest…", t0) exts = {r[0]: r[1] for r in con.sql( diff --git a/scripts/build_vocab_labels.py b/scripts/build_vocab_labels.py index d003d2f6..c6bb508b 100644 --- a/scripts/build_vocab_labels.py +++ b/scripts/build_vocab_labels.py @@ -179,6 +179,14 @@ def extract_rows(ttl_url: str) -> list[dict]: scheme = _pick_scheme(g, c) definition = _pick_definition(g, c) alt_labels = sorted({str(a) for a in g.objects(c, SKOS.altLabel)}) + # skos:broader parent (#281/#282 tree). SKOS permits multiple parents + # (a DAG); pick the lexicographically-first as the canonical primary so + # the column is deterministic, and stash the full set for the validator + # to flag multi-parent concepts. Vocab-form here; aliased to data-form + # in _emit_data_form_aliases so uri↔broader join within each uri_form. + broaders = sorted(str(b) for b in g.objects(c, SKOS.broader)) + broader_vocab = broaders[0] if broaders else None + broader_count = len(broaders) # One row per language of skos:prefLabel; fall back to rdfs:label. pref_labels = list(g.objects(c, SKOS.prefLabel)) @@ -190,11 +198,14 @@ def extract_rows(ttl_url: str) -> list[dict]: # downstream JOINs at least know the URI exists. 
rows.append({ "uri": uri, + "uri_form": "vocab", "pref_label": None, "lang": None, "scheme": scheme, "definition": definition, "alt_labels": alt_labels, + "broader": broader_vocab, + "broader_count": broader_count, "source_ttl": ttl_url, }) continue @@ -208,6 +219,8 @@ def extract_rows(ttl_url: str) -> list[dict]: "scheme": scheme, "definition": definition, "alt_labels": alt_labels, + "broader": broader_vocab, + "broader_count": broader_count, "source_ttl": ttl_url, }) return rows @@ -254,6 +267,15 @@ def _emit_data_form_aliases(rows: list[dict]) -> list[dict]: clone = dict(r) clone["uri"] = data_uri clone["uri_form"] = "data_v1" + # Map the parent to its data form too, so a data_v1 row's `broader` + # joins to another data_v1 row's `uri` (same alias space). If the + # parent has no known data-form alias, leave the vocab-form parent + # (the validator flags any broader that resolves to no node). + parent = r.get("broader") + if parent: + parent_data = _data_form_uris(parent) + if parent_data: + clone["broader"] = parent_data[0] aliases.append(clone) return aliases @@ -320,6 +342,8 @@ def main(argv: list[str] | None = None) -> int: "scheme": scheme, "definition": None, "alt_labels": [], + "broader": None, # deprecated leaf concepts; no tree parent + "broader_count": 0, "source_ttl": "manual_override", }) print(f" {len(MANUAL_LABEL_OVERRIDES):>4} rows (manual overrides for deprecated URIs)") @@ -343,6 +367,12 @@ def main(argv: list[str] | None = None) -> int: df.to_parquet(args.output, index=False) print(f"\nWrote {len(df):,} rows → {args.output}") print(f" by uri_form: {df['uri_form'].value_counts().to_dict()}") + # Surface the SKOS DAG (#281/#282): concepts with >1 skos:broader parent. + # We keep the lexicographically-first as the canonical `broader` (a lossy + # tree projection); flag the count so the hierarchy build/UI can account for it. 
+ if "broader_count" in df.columns: + multi = df[(df["uri_form"] == "vocab") & (df["broader_count"].fillna(0) > 1)]["uri"].nunique() + print(f" multi-parent (DAG) concepts: {multi} (canonical primary parent kept; lossy projection)") print(f" unique URIs: {df['uri'].nunique():,}") print(f" languages: {sorted(df['lang'].dropna().unique().tolist())}") print(f" schemes: {df['scheme'].nunique()} distinct skos:inScheme values") diff --git a/scripts/poc_facet_hierarchy.py b/scripts/poc_facet_hierarchy.py new file mode 100644 index 00000000..79e56847 --- /dev/null +++ b/scripts/poc_facet_hierarchy.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +"""Proof-of-concept for the facet hierarchy plan (#281/#282/#276) — Half (a). + +De-risks the data/pipeline half BEFORE touching build_vocab_labels.py / +build_frontend_derived.py: derive the concept tree from SKOS `broader`, compute +per-sample membership over the ancestry, aggregate hierarchical counts, and check +the invariants. See FACET_HIERARCHY_PLAN.md §6/§7. + +Material dimension only (the #282 priority); the same machinery generalizes to +context (sampledfeature) and object_type. + +Gotchas this encodes (all surfaced empirically): + 1. URI form: SKOS TTLs use un-versioned URIs (.../material/rock) but the data + uses versioned ones (.../material/1.0/rock). We normalize by stripping the + /X.Y/ version segment so the ancestry join matches. (Production reuses + build_vocab_labels.py's alias/version logic instead.) + 2. rdflib drops material's broader edges when many TTLs are parsed into ONE + graph — parse each TTL into its own graph and merge the dicts. + 3. SKOS is a DAG: some concepts have multiple skos:broader parents. + +Counting invariant (Codex-corrected): counts are a distinct-pid UNION, NOT +additive. parent_count = COUNT(DISTINCT pid over direct ∪ descendants); only +parent_count >= every child_count is guaranteed. 
+ +Usage: + python scripts/poc_facet_hierarchy.py --wide /path/202608_wide.parquet \ + --ttls /path/to/ttl_dir +""" +import argparse +import glob +import re + +import duckdb +import rdflib +from rdflib.namespace import SKOS + +VERSION_SEG = re.compile(r"/\d+\.\d+(?=/)") # the "/1.0" version path segment +MATERIAL_ROOT_NORM = "https://w3id.org/isample/vocabulary/material/material" + + +def norm(uri): + """Strip the version segment so TTL (un-versioned) and data (versioned) URIs join.""" + return VERSION_SEG.sub("", uri) if uri else uri + + +def load_broader(ttl_dir): + """Merge skos:broader across all TTLs. Parse each file into its OWN graph — + a single shared graph silently drops material's edges.""" + broader, multi = {}, {} + for f in sorted(glob.glob(f"{ttl_dir}/*.ttl")): + g = rdflib.Graph().parse(f, format="turtle") + for s, _, o in g.triples((None, SKOS.broader, None)): + cs, co = norm(str(s)), norm(str(o)) + broader.setdefault(cs, co) + multi.setdefault(cs, set()).add(co) + dag = {k: v for k, v in multi.items() if len(v) > 1} + return broader, dag + + +def ancestor_closure(broader): + """Return [(descendant, ancestor, distance)] including self at distance 0.""" + def chain(u): + out, seen, cur, d = [(u, 0)], {u}, u, 0 + while cur in broader and broader[cur] not in seen: + cur = broader[cur]; d += 1; out.append((cur, d)); seen.add(cur) + return out + concepts = set(broader) | set(broader.values()) + return [(c, a, d) for c in concepts for (a, d) in chain(c)] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--wide", required=True, help="202608 wide parquet") + ap.add_argument("--ttls", required=True, help="dir of SKOS .ttl vocab files") + args = ap.parse_args() + + broader, dag = load_broader(args.ttls) + print(f"broader edges: {len(broader)} multi-parent (DAG) concepts: {len(dag)}") + closure = ancestor_closure(broader) + + con = duckdb.connect() + con.create_function("norm", norm, ["VARCHAR"], "VARCHAR") + con.execute("CREATE TABLE 
closure(descendant VARCHAR, ancestor VARCHAR, distance INT)") + con.executemany("INSERT INTO closure VALUES (?,?,?)", closure) + con.execute("CREATE TABLE parent(child VARCHAR, parent VARCHAR)") + con.executemany("INSERT INTO parent VALUES (?,?)", list(broader.items())) + + w = args.wide + con.execute(f""" + CREATE TEMP TABLE ic AS + SELECT row_id, pid AS uri FROM read_parquet('{w}') WHERE otype='IdentifiedConcept'; + -- LOCATED universe = the explorer's universe (samp_geo): MaterialSampleRecord w/ geometry + CREATE TEMP TABLE located AS + SELECT pid FROM read_parquet('{w}') WHERE otype='MaterialSampleRecord' AND geometry IS NOT NULL; + CREATE TEMP TABLE asserted AS + SELECT DISTINCT s.pid, norm(ic.uri) AS concept + FROM read_parquet('{w}') s JOIN located l ON l.pid = s.pid, + UNNEST(s.p__has_material_category) AS u(rid) JOIN ic ON ic.row_id = u.rid + WHERE s.otype='MaterialSampleRecord' AND norm(ic.uri) <> '{MATERIAL_ROOT_NORM}'; + CREATE TEMP TABLE membership AS + SELECT DISTINCT a.pid, c.ancestor AS concept + FROM asserted a JOIN closure c ON c.descendant = a.concept; + CREATE TEMP TABLE tree_counts AS + SELECT concept, COUNT(DISTINCT pid) AS cnt FROM membership GROUP BY concept; + """) + + n_loc = con.sql("SELECT COUNT(*) FROM located").fetchone()[0] + n_wm = con.sql("SELECT COUNT(DISTINCT pid) FROM asserted").fetchone()[0] + n_mem = con.sql("SELECT COUNT(*) FROM membership").fetchone()[0] + print(f"located={n_loc:,} located-with-material={n_wm:,} membership rows={n_mem:,}") + + bad = con.sql(""" + SELECT p.parent, p.child, pc.cnt, cc.cnt FROM parent p + JOIN tree_counts pc ON pc.concept=p.parent + JOIN tree_counts cc ON cc.concept=p.child WHERE cc.cnt > pc.cnt""").fetchall() + print(f"INVARIANT A (parent>=child): {'PASS' if not bad else 'FAIL ' + str(bad[:3])}") + + root = con.sql(f"SELECT cnt FROM tree_counts WHERE concept='{MATERIAL_ROOT_NORM}'").fetchone() + root = root[0] if root else 0 + print(f"INVARIANT B (root==located-with-material): root={root:,} 
expected={n_wm:,} " + f"{'PASS' if root == n_wm else 'DIFF'}") + + tail = lambda u: u.rsplit("/", 1)[-1] + print("\n=== material membership counts (located samples) ===") + for c, cnt in con.sql( + "SELECT concept, cnt FROM tree_counts WHERE concept LIKE '%/material/%' " + "ORDER BY cnt DESC LIMIT 14").fetchall(): + p = broader.get(c) + print(f" {cnt:>10,} {tail(c):26s} (parent: {tail(p) if p else 'ROOT'})") + + +if __name__ == "__main__": + main() diff --git a/scripts/validate_frontend_derived.py b/scripts/validate_frontend_derived.py index 110ea025..5ab768ec 100755 --- a/scripts/validate_frontend_derived.py +++ b/scripts/validate_frontend_derived.py @@ -50,6 +50,8 @@ def main(): ap.add_argument("--facets"); ap.add_argument("--map-lite") ap.add_argument("--summaries"); ap.add_argument("--cross-filter") ap.add_argument("--h3", nargs=3, metavar=("R4", "R6", "R8")) + ap.add_argument("--tree-summaries", help="facet_tree_summaries parquet (#281/#282); optional") + ap.add_argument("--membership", help="sample_facet_membership parquet (#281/#282); optional") ap.add_argument("--wide", help="source wide parquet — enables the SEMANTIC gate " "(re-derive and diff the written files against a fresh build)") ap.add_argument("--min-rows", type=int, default=1_000_000, @@ -280,6 +282,63 @@ def except_diff(asql, bsql): n = scalar(f"SELECT COUNT(*) FROM {F} WHERE {dim}='{root}'") info.append(f"{dim} root-concept rows: {n:,} (informational; root-dropping deferred)") + # --- hierarchy artifacts (#281/#282) — checked only when present --- + def _opt(name, attr): + v = getattr(a, attr) + if v: + return v + if a.dir and a.tag: + p = os.path.join(a.dir, f"{a.tag}_{name}.parquet") + return p if os.path.exists(p) else None + return None + tree = _opt("facet_tree_summaries", "tree_summaries") + mem = _opt("sample_facet_membership", "membership") + if tree: + T = f"read_parquet('{tree}')" + # parent ≥ child for every edge, every dim (distinct-pid UNION semantics — + # NOT additive; see 
FACET_HIERARCHY_PLAN.md §2.2). + viol = scalar(f"""SELECT COUNT(*) FROM {T} c JOIN {T} p + ON p.concept_uri=c.parent_uri AND p.facet_type=c.facet_type + WHERE c.count > p.count""") + check("tree: parent count >= every child count", viol == 0, f"{viol} edges violate") + # every non-null parent_uri resolves to a node (no dangling edges) + orph = scalar(f"""SELECT COUNT(*) FROM {T} c WHERE c.parent_uri IS NOT NULL + AND NOT EXISTS (SELECT 1 FROM {T} p WHERE p.concept_uri=c.parent_uri AND p.facet_type=c.facet_type)""") + check("tree: every parent_uri resolves to a node", orph == 0, f"{orph} dangling parents") + # exactly one root per hierarchical dim + badroots = scalar(f"""SELECT COUNT(*) FROM ( + SELECT facet_type, COUNT(*) n FROM {T} WHERE parent_uri IS NULL GROUP BY facet_type HAVING n<>1)""") + check("tree: exactly one root per dim", badroots == 0, f"{badroots} dims with !=1 root") + # all three hierarchical dims present (catches a silently-missing dim) + dims_present = scalar(f"SELECT COUNT(DISTINCT facet_type) FROM {T}") + check("tree: all 3 hierarchical dims present", dims_present == 3, + f"{dims_present} dims present (want material/context/object_type)") + # CROSS-FILE ALGEBRA: material root membership == facets_v2 non-root material + # (both = located samples carrying ≥1 non-root material concept). + # NOTE (Codex r2): this equality holds under the current-data invariant + # "0 material concepts excluded from the hierarchy" (every located + # material concept resolves to the material tree). If a future vintage + # introduces a material concept absent from the SKOS tree, facets_v2 would + # still count it flat while the hierarchy excludes it, and this check would + # (correctly) fail — revisit the equality then. 
+ mat_root = scalar(f"SELECT count FROM {T} WHERE facet_type='material' AND parent_uri IS NULL") + fv2_mat = scalar(f"SELECT COUNT(*) FROM {F} WHERE material IS NOT NULL") + check("tree: material root == facets_v2 non-root material count", + mat_root == fv2_mat, f"tree={mat_root:,} vs facets_v2={fv2_mat:,}") + if mem: + M = f"read_parquet('{mem}')" + dup = scalar(f"SELECT COUNT(*) FROM (SELECT pid,facet_type,concept_uri FROM {M} GROUP BY 1,2,3 HAVING COUNT(*)>1)") + check("membership: (pid,facet_type,concept_uri) unique", dup == 0, f"{dup} dup grain rows") + if tree: # tree_summaries must EXACTLY equal a fresh GROUP BY of membership + T2 = f"read_parquet('{tree}')" + # symmetric: neither side has a (facet_type,concept,count) the other lacks + mm = scalar(f""" + WITH g AS (SELECT facet_type, concept_uri, COUNT(DISTINCT pid) AS count FROM {M} GROUP BY 1,2), + t AS (SELECT facet_type, concept_uri, count FROM {T2}) + SELECT (SELECT COUNT(*) FROM (SELECT * FROM g EXCEPT SELECT * FROM t)) + + (SELECT COUNT(*) FROM (SELECT * FROM t EXCEPT SELECT * FROM g))""") + check("tree counts == GROUP BY membership (symmetric)", mm == 0, f"{mm} rows disagree") + print(f"\n{'CHECK':<44} {'RESULT':<6} DETAIL\n" + "-" * 90) ok = True for name, passed, detail in R: From f6fa45aec0080286bced251bd177c1b5bbdf924d Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 15:22:55 -0700 Subject: [PATCH 4/6] #281/#282 Half(b) increment 1: Material facet tree (preview flag) (#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * #281/#282 Half(b) increment 1: Material facet tree behind ?facets=tree preview The user-facing tree, gated behind a preview flag (default OFF → the flat Material list is byte-identical for everyone; ?facets=tree opts in, ?facets=flat forces off, localStorage ISAMPLES_FACET_TREE=1 is sticky). Reversible = one switch. 
What works (verified headless on the 202608 data): - Material renders as an expandable tree from facet_tree_summaries: non-selectable root group ("All materials"), first two levels unfolded (#281), deeper collapsed behind carets, alphabetical within level (#282). - Material baseline counts come from facet_tree_summaries (not the flat summaries). - Subtree FILTERING via membership: selecting a parent node filters the table/map to its whole subtree by filtering on the parent URI alone (membership encodes every ancestor — no client-side descendant expansion). Verified: selecting `earthmaterial` → table = 4,091,133 (exactly its subtree count). - facetFilterSQL is the shared predicate (table + map + point-mode all route through it); material → membership subquery when the flag is on, AND-combined with the flat context/object_type subquery. context/object_type/source stay flat. - Flag OFF path is unchanged: facetFilterSQL emits the identical single facets_v3 subquery; describeCrossFilters reads material as before. Smoke gate green. Scope / deferred to increment 2 (documented): - Live viewport- & cross-filtered Material counts: in tree mode Material is excluded from the live count engine and shows STATIC tree baseline counts (facets_v3 can't answer parent-node counts). Table/map filtering is fully live + correct. - Tri-state parent display + auto-check descendants; accessibility (role=tree/aria); context/object_type trees; latency probe + optional cube; R2 publish of the 3 files. Tests: tests/playwright/facet-tree.spec.js (flag-off flat, flag-on tree + subtree filter). Gated on FACET_TREE_LOCAL=1 + the docs/data mirror until the hierarchy files are on R2 (skipped in CI so it stays green). Render clean. Co-Authored-By: Claude Opus 4.8 (1M context) * #281/#282 Half(b): compact the Material tree rows (RY feedback) Override .filter-body label{display:block;padding:2px 0} for tree nodes so caret+label+count sit on one tight line; smaller carets/indent. 
Flag-gated (tree only); flat list spacing unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) * #281/#282 Half(b) increment 2: Material tree selection polish (tri-state, inherit, OR) Polish so the preview is feedback-ready before Eric (UI-only; no data/query/pipeline changes; flag-gated; flat path unchanged). - materialTreeActive() — single shared predicate (FACET_TREE && a tree actually rendered) used by materialSelection / facetFilterSQL / describeCrossFilters / syncMaterialTreeVisual, so a degraded flat fallback behaves FULLY flat (Codex r2/r3). - materialSelection() — the MINIMAL selection (top-most checked nodes); a checked node under a checked ancestor is redundant. Used for filtering + URL so the membership filter on a parent covers its subtree with no client-side expansion. - syncMaterialTreeVisual() — checking a parent inherits descendants (checked+disabled); unchecking reverts them; a node with checked descendants but unchecked itself shows the indeterminate "–". Multi-peer selection = OR/union (already native to the IN() membership filter). - URL: writeQueryState serializes the minimal nodes; applyQueryToFacetFilters restores + re-syncs inherited/indeterminate state. - Compact tree row spacing (RY feedback). Verified (202608, headless): parent→child {checked,disabled}, table=4,091,133 subtree; two peers→parent indeterminate, table=333,253 (OR union); URL carries only the minimal node and round-trips; ?facets=tree with the tree data 404'd → flat fallback still filters. 5 facet-tree specs + flag-off smoke green; render clean. Codex: 3-round LGTM. Deferred (next): live viewport/cross-filtered Material counts (static tree baseline today); accessibility (role=tree/aria); context/object_type trees. 
Co-Authored-By: Claude Opus 4.8 (1M context) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- explorer.qmd | 193 ++++++++++++++++++++++++++-- tests/playwright/facet-tree.spec.js | 168 ++++++++++++++++++++++++ 2 files changed, 351 insertions(+), 10 deletions(-) create mode 100644 tests/playwright/facet-tree.spec.js diff --git a/explorer.qmd b/explorer.qmd index 153b9970..7efad93d 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -578,6 +578,17 @@ format: .facet-row.zero { opacity: 0.4; } .facet-row.zero:hover { opacity: 0.65; } .facet-count.recomputing { opacity: 0.55; font-style: italic; } + /* #281/#282 Material tree (preview) — compact rows */ + .filter-body .facet-treeroot { font-weight: 600; font-size: 12px; line-height: 1.5; padding: 0 0 2px; color: #444; } + .filter-body .facet-treenode { font-size: 12px; line-height: 1.5; } + /* override .filter-body label{display:block;padding:2px 0} so caret+label+count + sit on one tight line with no extra per-row padding */ + .filter-body .facet-treenode .facet-treelabel { display: inline; padding: 0; cursor: pointer; } + .facet-children { margin-left: 13px; } + .facet-children.collapsed { display: none; } + .facet-caret { cursor: pointer; display: inline-block; width: 12px; color: #888; user-select: none; font-size: 10px; } + .facet-caret:hover { color: #1565c0; } + .facet-caret-spacer { display: inline-block; width: 12px; } ``` @@ -767,6 +778,22 @@ cross_filter_url = `${R2_BASE}/isamples_202608_facet_cross_filter.parquet` // ~60 KB lookup; falls back to URI tail if a URI isn't covered. vocab_labels_url = `${R2_BASE}/vocab_labels_202608.parquet` +// #281/#282 facet hierarchy PREVIEW (default OFF — the flat Material list is +// unchanged for everyone). Opt in with ?facets=tree (force off with ?facets=flat), +// or stickily via localStorage ISAMPLES_FACET_TREE=1. When on, the Material facet +// renders as an expandable tree backed by the hierarchy artifacts below; context / +// object_type / source stay flat. 
Reversible = a single switch. +FACET_TREE = (() => { + const q = new URLSearchParams(location.search).get('facets'); + if (q === 'tree') return true; + if (q === 'flat') return false; + return (typeof localStorage !== 'undefined' && localStorage.getItem('ISAMPLES_FACET_TREE') === '1'); +})() +// Hierarchical counts (one row per concept node: facet_type, concept_uri, +// parent_uri, depth, count) and per-sample membership (pid ↔ every ancestor). +facet_tree_url = `${R2_BASE}/isamples_202608_facet_tree_summaries.parquet` +membership_url = `${R2_BASE}/isamples_202608_sample_facet_membership.parquet` + // Canonical palette — see issue #113. Path-relative so this works under // both isamples.org (custom domain at root) and project-pages fork // previews (rdhyee.github.io/isamplesorg.github.io/...). @@ -877,6 +904,9 @@ function applyQueryToFacetFilters() { setCheckedValues('materialFilterBody', csvParamValues(params, 'material')); setCheckedValues('contextFilterBody', csvParamValues(params, 'context')); setCheckedValues('objectTypeFilterBody', csvParamValues(params, 'object_type')); + // #281/#282: restored ?material= URIs are the minimal nodes — fill in their + // inherited descendants + indeterminate ancestors so the tree looks right. + syncMaterialTreeVisual(); } @@ -927,7 +957,9 @@ function writeQueryState(opts = {}) { ['context', 'contextFilterBody'], ['object_type', 'objectTypeFilterBody'], ].forEach(([key, containerId]) => { - const values = getCheckedValues(containerId); + // #281/#282: serialize the MINIMAL Material selection in tree mode (parent + // URIs, not expanded descendants) so shared links stay compact + stable. + const values = (key === 'material') ? 
materialSelection() : getCheckedValues(containerId); if (values.length > 0) params.set(key, values.join(',')); else params.delete(key); }); @@ -961,6 +993,73 @@ function hasFacetFilters() { || getCheckedValues('objectTypeFilterBody').length > 0; } +// #281/#282: the MINIMAL Material selection — the top-most checked tree nodes +// (a checked node whose ancestor is also checked is redundant, since the parent's +// subtree already covers it). This is the canonical selection used for filtering +// and URL serialization in tree mode; flat mode falls back to all checked values. +// #281/#282: the SINGLE source of truth for "Material is actually in tree mode" — +// the flag is on AND the tree rendered (renderMaterialTreeFacet falls back to the +// flat list if the hierarchy files fail to load). Every material code path keys off +// this, so a degraded flat fallback behaves fully flat — selection, filtering, AND +// cross-filter counts (Codex r2). Without it, filtering would query the missing +// membership file and counts would wrongly exclude material. +function materialTreeActive() { + return FACET_TREE && !!document.querySelector('#materialFilterBody .facet-treenode'); +} + +function materialSelection() { + if (!materialTreeActive()) return getCheckedValues('materialFilterBody'); + const body = document.getElementById('materialFilterBody'); + const out = []; + body.querySelectorAll('.facet-treenode > .facet-treelabel input[type="checkbox"]:checked').forEach(cb => { + const node = cb.closest('.facet-treenode'); + let anc = node && node.parentElement ? node.parentElement.closest('.facet-treenode') : null; + let covered = false; + while (anc) { + const a = anc.querySelector(':scope > .facet-treelabel input[type="checkbox"]'); + if (a && a.checked && !a.disabled) { covered = true; break; } // explicit ancestor covers it + anc = anc.parentElement ? 
anc.parentElement.closest('.facet-treenode') : null; + } + if (!covered) out.push(cb.value); + }); + return out; +} + +// #281/#282: keep the tree's visual state coherent after any change or URL restore: +// - a checked parent visually fills in its descendants (checked + disabled = +// "included because the parent is"), so the redundancy is obvious; +// - unchecking a parent reverts those inherited descendants; +// - a parent with some (but not all-via-itself) descendants selected shows the +// indeterminate "–" state. Filtering reads materialSelection() (top-most), so +// the inherited descendant checks never double-count. +function syncMaterialTreeVisual() { + if (!materialTreeActive()) return; + const body = document.getElementById('materialFilterBody'); + if (!body) return; + const cbOf = (node) => node.querySelector(':scope > .facet-treelabel input[type="checkbox"]'); + const nodes = [...body.querySelectorAll('.facet-treenode')]; // DOM order = parents before children + // Pass 1 (top-down): inherit checked state from an explicitly-checked ancestor. + for (const node of nodes) { + const cb = cbOf(node); + if (!cb) continue; + let anc = node.parentElement ? node.parentElement.closest('.facet-treenode') : null; + let coveredByAnc = false; + while (anc) { + const a = cbOf(anc); + if (a && a.checked && !a.disabled) { coveredByAnc = true; break; } + anc = anc.parentElement ? anc.parentElement.closest('.facet-treenode') : null; + } + if (coveredByAnc) { cb.checked = true; cb.disabled = true; cb.indeterminate = false; } + else if (cb.disabled) { cb.disabled = false; cb.checked = false; } // was inherited, parent now off → revert + } + // Pass 2: indeterminate when not selected itself but a descendant is checked. + for (const node of nodes) { + const cb = cbOf(node); + if (!cb) continue; + cb.indeterminate = (!cb.checked) && !!node.querySelector(':scope > .facet-children input[type="checkbox"]:checked'); + } +} + // Single source of truth for #facetNote visibility. 
The note ("filter // takes effect at neighborhood zoom") explains the cluster-mode honesty // gap: H3 summary parquets only carry `dominant_source`, so material / @@ -994,25 +1093,38 @@ function syncFacetNote() { // (a sample with two materials would appear twice via JOIN). Required // for Phase 4's table mode and any non-JOIN caller. See issue #156. function facetFilterSQL() { - const mat = getCheckedValues('materialFilterBody'); + const mat = materialSelection(); // minimal (top-most) nodes in tree mode const ctx = getCheckedValues('contextFilterBody'); const ot = getCheckedValues('objectTypeFilterBody'); - const conds = []; + // Each entry is a standalone `pid IN (...)` predicate; multiple are AND-ed. + const parts = []; + // #281/#282: in tree mode the Material selection is a set of concept nodes; + // filter via the membership table (which encodes each sample under every + // ancestor), so selecting a parent node matches its whole subtree — no + // client-side descendant expansion. context/object_type stay flat on facets. 
+ const facetsConds = []; if (mat.length > 0) { const list = mat.map(s => `'${escSql(s)}'`).join(','); - conds.push(`material IN (${list})`); + if (materialTreeActive()) { + parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${membership_url}') WHERE facet_type='material' AND concept_uri IN (${list}))`); + } else { + facetsConds.push(`material IN (${list})`); + } } if (ctx.length > 0) { const list = ctx.map(s => `'${escSql(s)}'`).join(','); - conds.push(`context IN (${list})`); + facetsConds.push(`context IN (${list})`); } if (ot.length > 0) { const list = ot.map(s => `'${escSql(s)}'`).join(','); - conds.push(`object_type IN (${list})`); + facetsConds.push(`object_type IN (${list})`); } - if (conds.length === 0) return ''; - return ` AND pid IN (SELECT DISTINCT pid FROM read_parquet('${facets_url}') WHERE ${conds.join(' AND ')})`; + if (facetsConds.length > 0) { + parts.push(`pid IN (SELECT DISTINCT pid FROM read_parquet('${facets_url}') WHERE ${facetsConds.join(' AND ')})`); + } + if (parts.length === 0) return ''; + return ' AND ' + parts.map(p => `(${p})`).join(' AND '); } // Shared viewport-padding factor. The samples table (PR #219), the @@ -1852,7 +1964,62 @@ facetFilters = { ).join(''); }; - renderFilter('materialFilterBody', 'material', grouped.material); + // #281/#282: render Material as an expandable tree when the preview flag + // is on; flat list otherwise. context/object_type/source stay flat. + // Defined inline so it closes over prettyLabel / escapers / grouped / db. 
+ async function renderMaterialTreeFacet() { + const body = document.getElementById('materialFilterBody'); + if (!body) return; + let rows; + try { + rows = await db.query(`SELECT concept_uri, parent_uri, depth, count FROM read_parquet('${facet_tree_url}') WHERE facet_type='material'`); + } catch (err) { + console.warn('facet_tree load failed; flat material fallback:', err); + renderFilter('materialFilterBody', 'material', grouped.material); + return; + } + const nodes = new Map(); + for (const r of rows) nodes.set(r.concept_uri, { uri: r.concept_uri, parent: r.parent_uri, depth: Number(r.depth), count: Number(r.count), label: prettyLabel(r.concept_uri), kids: [] }); + let root = null; + for (const n of nodes.values()) { if (n.parent == null) root = n; else if (nodes.has(n.parent)) nodes.get(n.parent).kids.push(n); } + for (const n of nodes.values()) n.kids.sort((a, b) => a.label.localeCompare(b.label)); // #282 alphabetical within level + // Material baseline counts must come from the tree (Codex), not the flat summaries. + viewer._baselineCounts.material = new Map([...nodes.values()].map(n => [n.uri, n.count])); + // First two levels unfolded (#281): depth 1 + 2 visible; depth ≥3 collapsed. + const nodeHtml = (n) => { + const hasKids = n.kids.length > 0; + const open = hasKids && n.depth < 2; + const caret = hasKids + ? `${open ? '▾' : '▸'}` + : ``; + return `
` + + caret + + `` + + (hasKids ? `
` + n.kids.map(nodeHtml).join('') + `
` : '') + + `
`; + }; + // Root renders as a non-selectable "All …" grouping label (selecting it = no filter). + body.innerHTML = + `
${escText(root ? root.label : 'All materials')}` + + (root ? ` (${root.count.toLocaleString()})` : '') + + `
` + + (root ? root.kids.map(nodeHtml).join('') : ''); + body.querySelectorAll('.facet-caret').forEach(c => c.addEventListener('click', (e) => { + e.preventDefault(); e.stopPropagation(); + const node = c.closest('.facet-treenode'); + const kids = node && node.querySelector(':scope > .facet-children'); + if (kids) { const collapsed = kids.classList.toggle('collapsed'); c.textContent = collapsed ? '▸' : '▾'; } + })); + // Tri-state + inherited-check visual sync on any checkbox toggle. Filtering + // reads materialSelection() (top-most) so inherited checks don't double-count. + body.addEventListener('change', () => syncMaterialTreeVisual()); + syncMaterialTreeVisual(); // initial (in case ?material= pre-checked nodes) + } + if (FACET_TREE) { await renderMaterialTreeFacet(); } + else { renderFilter('materialFilterBody', 'material', grouped.material); } renderFilter('contextFilterBody', 'context', grouped.context); renderFilter('objectTypeFilterBody', 'object_type', grouped.object_type); applyFacetCounts('source', null); @@ -2841,7 +3008,13 @@ zoomWatcher = { const sourceChecks = document.querySelectorAll('#sourceFilter input[type="checkbox"]'); const sourceTotal = sourceChecks.length; const sources = getActiveSources(); - const mat = getCheckedValues('materialFilterBody'); + // #281/#282 increment 1: in tree mode the Material facet keeps its + // STATIC tree baseline counts and is excluded from the live cross-filter + // count engine (its counts come from facet_tree_summaries, not facets_v3, + // and a selected parent node wouldn't match facets_v3's flat values). The + // table/map still filter correctly via facetFilterSQL's membership path. + // Viewport/cross-filtered material tree counts are the next increment. + const mat = materialTreeActive() ? 
[] : getCheckedValues('materialFilterBody'); const ctx = getCheckedValues('contextFilterBody'); const ot = getCheckedValues('objectTypeFilterBody'); const dims = [ diff --git a/tests/playwright/facet-tree.spec.js b/tests/playwright/facet-tree.spec.js new file mode 100644 index 00000000..d5663d38 --- /dev/null +++ b/tests/playwright/facet-tree.spec.js @@ -0,0 +1,168 @@ +/** + * #281/#282 Half(b) increment 1 — Material facet hierarchy (preview flag). + * + * Verifies the `?facets=tree` preview: flag OFF leaves Material flat (unchanged); + * flag ON renders the expandable tree and selecting a node filters the table to + * that node's whole SUBTREE via the membership table (no client-side expansion). + * + * GATED: needs the hierarchy data (facet_tree_summaries / sample_facet_membership / + * vocab_labels-with-broader). Until those are published to R2, run against the local + * docs/data mirror: + * FACET_TREE_LOCAL=1 TEST_URL=http://localhost:5860 npx playwright test facet-tree + * (the spec drives ?data_base=/data). Skipped by default so CI stays green until the + * R2 publish; flip the skip / drop ?data_base once the files are remote. + */ +const { test, expect } = require('@playwright/test'); + +const LOCAL = !!process.env.FACET_TREE_LOCAL; +const DATA = LOCAL ? 
'&data_base=/data' : ''; +const WORLD = '#v=1&lat=20&lng=0&alt=10000000'; + +test.describe('Material facet tree (#281/#282 preview)', () => { + test.skip(!LOCAL, 'needs hierarchy data — run with FACET_TREE_LOCAL=1 against the docs/data mirror until R2 publish'); + test.setTimeout(150000); + + test('flag OFF → Material stays a flat list (no tree nodes)', async ({ page }) => { + await page.goto(`/explorer.html?facets=flat${DATA}${WORLD}`); + await page.waitForFunction( + () => document.querySelectorAll('#materialFilterBody .facet-row[data-facet="material"]').length > 0, + null, { timeout: 90000 }); + const treenodes = await page.evaluate(() => document.querySelectorAll('#materialFilterBody .facet-treenode').length); + expect(treenodes).toBe(0); + }); + + test('flag ON → tree renders; selecting a parent filters the table to its subtree', async ({ page }) => { + await page.goto(`/explorer.html?facets=tree${DATA}${WORLD}`); + await page.waitForFunction( + () => document.querySelectorAll('#materialFilterBody .facet-treenode').length > 0, + null, { timeout: 90000 }); + + // Tree structure: a non-selectable root group, several nodes, carets, and the + // deepest level collapsed (first two levels unfolded, #281). 
+ const info = await page.evaluate(() => ({ + nodes: document.querySelectorAll('#materialFilterBody .facet-treenode').length, + hasRoot: !!document.querySelector('#materialFilterBody .facet-treeroot'), + carets: document.querySelectorAll('#materialFilterBody .facet-caret').length, + collapsed: document.querySelectorAll('#materialFilterBody .facet-children.collapsed').length, + earthmaterial: !!document.querySelector('#materialFilterBody input[value*="/earthmaterial"]'), + })); + expect(info.nodes).toBeGreaterThan(5); + expect(info.hasRoot).toBe(true); + expect(info.carets).toBeGreaterThan(0); + expect(info.collapsed).toBeGreaterThan(0); + expect(info.earthmaterial).toBe(true); + + // Selecting the "earthmaterial" parent must filter the table to its whole + // subtree (membership encodes ancestors → no client expansion needed). + await page.evaluate(() => { + const cb = document.querySelector('#materialFilterBody input[value*="/earthmaterial"]'); + cb.checked = true; + document.getElementById('materialFilterBody').dispatchEvent(new Event('change', { bubbles: true })); + }); + await page.waitForFunction( + () => /of [\d,]+\)/.test(document.getElementById('tablePageInfo')?.textContent || ''), + null, { timeout: 60000 }); + const total = await page.evaluate(() => { + const m = (document.getElementById('tablePageInfo')?.textContent || '').match(/of ([\d,]+)\)/); + return m ? parseInt(m[1].replace(/,/g, ''), 10) : null; + }); + expect(total).toBeGreaterThan(0); + }); + + // Known 202608 subtree/union totals — deterministic for this dataset, so polling + // to the exact value also guarantees the filter has applied (no stale-pager read). 
+ const EARTHMATERIAL_TOTAL = 4091133; // earthmaterial subtree + const MINERAL_OR_SOIL_TOTAL = 333253; // mineral ∪ soil (peers) + + test('polish: checking a parent inherits its children (checked+disabled); peers go OR with an indeterminate parent', async ({ page }) => { + const toggle = (sub, val) => page.evaluate(({ sub, val }) => { + const cb = document.querySelector(`#materialFilterBody input[value*="${sub}"]`); + cb.checked = val; cb.dispatchEvent(new Event('change', { bubbles: true })); + }, { sub, val }); + const total = async () => page.evaluate(() => { + const m = (document.getElementById('tablePageInfo')?.textContent || '').match(/of ([\d,]+)\)/); + return m ? parseInt(m[1].replace(/,/g, ''), 10) : null; + }); + await page.goto(`/explorer.html?facets=tree${DATA}${WORLD}`); + await page.waitForFunction( + () => document.querySelectorAll('#materialFilterBody .facet-treenode').length > 0, + null, { timeout: 90000 }); + + // Check the "earthmaterial" parent → a child ("mineral") becomes inherited + // (checked + disabled), and the table filters to the whole subtree. + await toggle('/earthmaterial', true); + const child = await page.evaluate(() => { + const cb = document.querySelector('#materialFilterBody input[value*="/mineral"]'); + return { checked: cb.checked, disabled: cb.disabled }; + }); + expect(child).toEqual({ checked: true, disabled: true }); + await expect.poll(total, { timeout: 60000, intervals: [500, 1000, 2000] }).toBe(EARTHMATERIAL_TOTAL); + + // Uncheck the parent; check two PEERS (mineral + soil) → the parent shows the + // indeterminate "–" state, and the table is their OR/union (smaller than the parent). 
+ await toggle('/earthmaterial', false); + await toggle('/mineral', true); + await toggle('/soil', true); + const parentState = await page.evaluate(() => { + const cb = document.querySelector('#materialFilterBody input[value*="/earthmaterial"]'); + return { checked: cb.checked, indeterminate: cb.indeterminate }; + }); + expect(parentState).toEqual({ checked: false, indeterminate: true }); + await expect.poll(total, { timeout: 60000, intervals: [500, 1000, 2000] }).toBe(MINERAL_OR_SOIL_TOTAL); + expect(MINERAL_OR_SOIL_TOTAL).toBeLessThan(EARTHMATERIAL_TOTAL); + }); + + test('URL round-trip: a parent selection serializes the minimal node and restores inherited state', async ({ page }) => { + const EARTHMATERIAL = 'https://w3id.org/isample/vocabulary/material/1.0/earthmaterial'; + // Select earthmaterial, then assert the URL carries ONLY that node (minimal — no + // expanded descendants like /mineral). + await page.goto(`/explorer.html?facets=tree${DATA}${WORLD}`); + await page.waitForFunction(() => document.querySelectorAll('#materialFilterBody .facet-treenode').length > 0, null, { timeout: 90000 }); + await page.evaluate(() => { + const cb = document.querySelector('#materialFilterBody input[value*="/earthmaterial"]'); + cb.checked = true; cb.dispatchEvent(new Event('change', { bubbles: true })); + }); + await expect.poll(async () => { + const p = new URLSearchParams(new URL(await page.evaluate(() => location.href)).search); + return p.get('material'); + }, { timeout: 30000, intervals: [250, 500, 1000] }).toBe(EARTHMATERIAL); + const url = await page.evaluate(() => location.href); + expect(url).not.toContain('mineral'); // descendants are NOT expanded into the URL + + // Reload that URL fresh → earthmaterial restored as selected, and a child shows + // the inherited (checked + disabled) state. + await page.goto(url.includes('data_base') ? url : `${url}${DATA.replace('&', url.includes('?') ? 
'&' : '?')}`); + await page.waitForFunction(() => document.querySelectorAll('#materialFilterBody .facet-treenode').length > 0, null, { timeout: 90000 }); + const restored = await page.evaluate(() => { + const par = document.querySelector('#materialFilterBody input[value*="/earthmaterial"]'); + const kid = document.querySelector('#materialFilterBody input[value*="/mineral"]'); + return { parentChecked: par.checked, kidChecked: kid.checked, kidDisabled: kid.disabled }; + }); + expect(restored).toEqual({ parentChecked: true, kidChecked: true, kidDisabled: true }); + }); + + test('graceful fallback: if the tree data 404s, Material renders flat and still filters', async ({ page }) => { + // Deploy-safety (Codex r2/r3): with ?facets=tree but the hierarchy files + // missing, renderMaterialTreeFacet() catches and renders the flat list, and + // materialTreeActive() is false everywhere → selection/filtering use the flat + // facets_v3 path (NOT the missing membership file). + await page.route('**/*facet_tree_summaries*', route => route.fulfill({ status: 404, body: '' })); + await page.goto(`/explorer.html?facets=tree${DATA}${WORLD}`); + await page.waitForFunction( + () => document.querySelectorAll('#materialFilterBody .facet-row[data-facet="material"]').length > 0, + null, { timeout: 90000 }); + const treenodes = await page.evaluate(() => document.querySelectorAll('#materialFilterBody .facet-treenode').length); + expect(treenodes).toBe(0); // fell back to flat + // a flat material selection still filters the table (uses facets_v3, no membership) + await page.evaluate(() => { + const cb = document.querySelector('#materialFilterBody input[type="checkbox"]'); + cb.checked = true; cb.dispatchEvent(new Event('change', { bubbles: true })); + }); + await page.waitForFunction(() => /of [\d,]+\)/.test(document.getElementById('tablePageInfo')?.textContent || ''), null, { timeout: 60000 }); + const total = await page.evaluate(() => { + const m = 
(document.getElementById('tablePageInfo')?.textContent || '').match(/of ([\d,]+)\)/); + return m ? parseInt(m[1].replace(/,/g, ''), 10) : null; + }); + expect(total).toBeGreaterThan(0); + }); +}); From f65e6152352c5842aca821814c9e31e2ea017b34 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 16:25:40 -0700 Subject: [PATCH 5/6] =?UTF-8?q?#281/#282:=20ship=20the=20Material=20facet?= =?UTF-8?q?=20tree=20=E2=80=94=20default=20ON=20(=3Ffacets=3Dflat=20kill-s?= =?UTF-8?q?witch)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hierarchy preview is good enough to ship (RY). Flip FACET_TREE default to true so all users get the expandable Material tree; ?facets=flat (or localStorage ISAMPLES_FACET_TREE=0) reverts for a user without a redeploy. context/object_type stay flat. Verified: default→tree (18 nodes), ?facets=flat→flat, smoke green. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 7efad93d..b598c954 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -778,16 +778,20 @@ cross_filter_url = `${R2_BASE}/isamples_202608_facet_cross_filter.parquet` // ~60 KB lookup; falls back to URI tail if a URI isn't covered. vocab_labels_url = `${R2_BASE}/vocab_labels_202608.parquet` -// #281/#282 facet hierarchy PREVIEW (default OFF — the flat Material list is -// unchanged for everyone). Opt in with ?facets=tree (force off with ?facets=flat), -// or stickily via localStorage ISAMPLES_FACET_TREE=1. When on, the Material facet -// renders as an expandable tree backed by the hierarchy artifacts below; context / -// object_type / source stay flat. Reversible = a single switch. +// #281/#282 facet hierarchy — SHIPPED, default ON. The Material facet renders as an +// expandable tree backed by the hierarchy artifacts below; context / object_type / +// source stay flat. 
`?facets=flat` is the kill-switch (also localStorage +// ISAMPLES_FACET_TREE=0); `?facets=tree` (or localStorage ISAMPLES_FACET_TREE=1) forces on. The escape hatch lets us +// revert PER USER with no redeploy if the tree misbehaves; flipping the default back +// globally is a one-line change + redeploy. FACET_TREE = (() => { const q = new URLSearchParams(location.search).get('facets'); if (q === 'tree') return true; - if (q === 'flat') return false; - return (typeof localStorage !== 'undefined' && localStorage.getItem('ISAMPLES_FACET_TREE') === '1'); + if (q === 'flat') return false; // kill-switch + const ls = (typeof localStorage !== 'undefined') ? localStorage.getItem('ISAMPLES_FACET_TREE') : null; + if (ls === '0') return false; + if (ls === '1') return true; + return true; // DEFAULT ON })() // Hierarchical counts (one row per concept node: facet_type, concept_uri, // parent_uri, depth, count) and per-sample membership (pid ↔ every ancestor). From 8b70b989602e436f411391ac9553040d9b485236 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 17 Jun 2026 16:30:31 -0700 Subject: [PATCH 6/6] tests: make playwright import optional in conftest (pre-existing upstream fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixture-tests (pipeline-tests.yml) has been red on upstream main since 2026-06-11: conftest.py hard-imports playwright.sync_api, which the data-only pipeline CI job doesn't install. The fork already fixed this (try/except → skip browser fixtures); bring it upstream so the pipeline gate is green. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/conftest.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index dcc7b612..2f5955f1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,15 @@ """ import os import pytest -from playwright.sync_api import sync_playwright + +# Optional: only the browser-based site tests need playwright.
The data- +# pipeline fixture tests (tests/test_frontend_derived.py, +# tests/test_oc_concept_enrichment.py) run in CI without it — a hard import +# here would crash collection for the whole tests/ directory. +try: + from playwright.sync_api import sync_playwright +except ImportError: # pragma: no cover + sync_playwright = None SITE_URL = os.environ.get("ISAMPLES_BASE_URL", "https://isamples.org") @@ -21,6 +29,8 @@ @pytest.fixture(scope="session") def browser(): + if sync_playwright is None: + pytest.skip("playwright not installed (pipeline-only environment)") with sync_playwright() as p: browser = p.chromium.launch( headless="--headed" not in " ".join(os.sys.argv),