diff --git a/doc/getting_started/tutorials/13.ctable-basics.ipynb b/doc/getting_started/tutorials/13.ctable-basics.ipynb index 829011807..ed2b67808 100644 --- a/doc/getting_started/tutorials/13.ctable-basics.ipynb +++ b/doc/getting_started/tutorials/13.ctable-basics.ipynb @@ -774,12 +774,7 @@ "cell_type": "markdown", "id": "4f466e5d", "metadata": {}, - "source": [ - "### 3.3 Sorting\n", - "\n", - "`sort_by()` returns a sorted copy by default (or sorts in-place with `inplace=True`).\n", - "Multi-column sorting is supported — primary key first." - ] + "source": "### 3.3 Sorting\n\n`sort_by()` returns a sorted copy by default (or sorts in-place with `inplace=True`).\nPass `view=True` for a zero-copy sorted **view** that shares the table's data and gathers\nrows on demand — ideal for reading a sorted slice of a large table without copying it.\nMulti-column sorting is supported — primary key first." }, { "cell_type": "code", @@ -1197,37 +1192,9 @@ "start_time": "2026-05-21T09:38:01.039615Z" } }, - "source": [ - "# Top 10 hottest days in Madrid across the whole year\n", - "# Sort the full table, then filter — views cannot be sorted directly\n", - "hottest_all = climate.sort_by(\"temperature\", ascending=False)\n", - "madrid_sorted = hottest_all.where(hottest_all.city == \"Madrid\")\n", - "print(\"10 hottest days in Madrid:\")\n", - "print(madrid_sorted.select([\"city\", \"day\", \"temperature\", \"humidity\"]).head(10))" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10 hottest days in Madrid:\n", - " city day temperature humidity\n", - "0 Madrid 191 31.399208 42.543335\n", - "1 Madrid 190 31.232576 44.303246\n", - "2 Madrid 227 31.227442 46.992290\n", - "3 Madrid 194 30.915184 35.044228\n", - "4 Madrid 186 30.879374 48.080303\n", - "5 Madrid 202 30.745684 43.722813\n", - "6 Madrid 177 30.469023 38.390163\n", - "7 Madrid 163 30.215179 46.051888\n", - "8 Madrid 181 30.181025 43.726521\n", - "9 Madrid 184 29.936199 50.654797\n", - 
"\n", - "[10 rows x 4 columns]\n" - ] - } - ], - "execution_count": 21 + "source": "# Top 10 hottest days in Madrid across the whole year.\n# Views *can* be sorted: sort_by() on a where()-view returns a zero-copy sorted\n# view — it shares the table's columns and gathers rows on demand, no full-table\n# copy. (On a base table, pass view=True for the same lazy behaviour.)\nmadrid = climate.where(climate.city == \"Madrid\")\nmadrid_sorted = madrid.sort_by(\"temperature\", ascending=False)\nprint(\"10 hottest days in Madrid:\")\nprint(madrid_sorted.select([\"city\", \"day\", \"temperature\", \"humidity\"]).head(10))", + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2876,30 +2843,7 @@ "cell_type": "markdown", "id": "405cd155", "metadata": {}, - "source": [ - "---\n", - "## Summary\n", - "\n", - "Here's everything we covered:\n", - "\n", - "| Feature | API |\n", - "|---------|-----|\n", - "| Create | `CTable(Schema)`, `CTable(Schema, new_data=...)` |\n", - "| Insert | `append(row)`, `extend(list_or_array)` |\n", - "| View | `head()`, `tail()`, `print(t)`, `t.info()` |\n", - "| Filter | `where(expr)` → view |\n", - "| Project | `select([cols])` → view |\n", - "| Sort | `sort_by(cols)`, `sort_by(cols, inplace=True)` |\n", - "| Aggregates | `col.sum()`, `.mean()`, `.std()`, `.min()`, `.max()` |\n", - "| Stats | `describe()`, `cov()` |\n", - "| Mutate | `delete()`, `compact()`, `add_column()`, `drop_column()`, `assign()` |\n", - "| Persist | `save(path)`, `to_b2z()`, `to_b2d()`, `CTable.open(path)`, `CTable.load(path)` |\n", - "| Interop | `to_arrow()`, `from_arrow()`, `to_csv()`, `from_csv()` |\n", - "| Nullable | `null_value=` on spec, `is_null()`, `notnull()`, `null_count()` |\n", - "\n", - "CTable is designed for **compressed analytical workloads** — large tables that need to stay small in RAM\n", - "while still being fast to query and easy to persist." 
- ] + "source": "---\n## Summary\n\nHere's everything we covered:\n\n| Feature | API |\n|---------|-----|\n| Create | `CTable(Schema)`, `CTable(Schema, new_data=...)` |\n| Insert | `append(row)`, `extend(list_or_array)` |\n| View | `head()`, `tail()`, `print(t)`, `t.info()` |\n| Filter | `where(expr)` → view |\n| Project | `select([cols])` → view |\n| Sort | `sort_by(cols)`, `sort_by(cols, view=True)`, `sort_by(cols, inplace=True)` |\n| Aggregates | `col.sum()`, `.mean()`, `.std()`, `.min()`, `.max()` |\n| Stats | `describe()`, `cov()` |\n| Mutate | `delete()`, `compact()`, `add_column()`, `drop_column()`, `assign()` |\n| Persist | `save(path)`, `to_b2z()`, `to_b2d()`, `CTable.open(path)`, `CTable.load(path)` |\n| Interop | `to_arrow()`, `from_arrow()`, `to_csv()`, `from_csv()` |\n| Nullable | `null_value=` on spec, `is_null()`, `notnull()`, `null_count()` |\n\nCTable is designed for **compressed analytical workloads** — large tables that need to stay small in RAM\nwhile still being fast to query and easy to persist." 
} ], "metadata": { diff --git a/src/blosc2/b2view/app.py b/src/blosc2/b2view/app.py index c205c5beb..6fbbb898f 100644 --- a/src/blosc2/b2view/app.py +++ b/src/blosc2/b2view/app.py @@ -18,6 +18,7 @@ from textual.screen import ModalScreen from textual.theme import Theme from textual.widgets import ( + Checkbox, DataTable, Footer, Header, @@ -228,8 +229,7 @@ class HelpScreen(ModalScreen[None]): BINDINGS: ClassVar = [ ("escape", "close", "Close"), - ("question_mark", "close", "Close"), - ("q", "close", "Close"), + ("q", "app.quit", "Quit b2view"), ] _SECTIONS: ClassVar = [ @@ -257,7 +257,9 @@ class HelpScreen(ModalScreen[None]): ("t / b", "first / last row"), ("g", "go to row..."), ("f", "filter rows (CTable)"), - ("escape", "unlock a row window / clear the active filter"), + ("S", "sort by an indexed column (CTable; R reverses)"), + ("R", "reverse the current sort order (when sorted)"), + ("escape", "unlock a row window / clear the active filter or sort"), ], ), ( @@ -274,12 +276,13 @@ class HelpScreen(ModalScreen[None]): ( "Plot modal (after 'p')", [ - ("+ / -", "zoom in / out about the centre"), + ("+ / -", "zoom in / out about the left edge"), ("left / right", "pan the zoomed window"), ("0", "reset to the whole series"), ("g", "type an exact start:stop row range"), ("v", "lock the data grid to the current range (esc unlocks)"), ("h", "high-res matplotlib image of the current range"), + ("escape", "close the plot (q quits b2view)"), ], ), ( @@ -710,6 +713,70 @@ def action_cancel(self) -> None: self.dismiss(None) +class SortByScreen(ModalScreen["tuple[str, bool] | None"]): + """Dropdown to sort a CTable by one of its FULL-indexed columns. + + ↑/↓ to pick a column, ``r`` (or click) toggles reverse/descending, Enter + applies. Dismisses with ``(column, reverse)`` or None on cancel. 
+ """ + + CSS = """ + SortByScreen { + align: center middle; + } + #sortby-dialog { + width: 60; + height: auto; + max-height: 80%; + border: thick $accent; + background: $surface; + padding: 1 2; + } + #sortby-title { + text-style: bold; + margin-bottom: 1; + } + #sortby-list { + height: auto; + max-height: 16; + } + """ + + BINDINGS: ClassVar = [("escape", "cancel", "Cancel"), ("R", "toggle_reverse", "Reverse")] + + def __init__(self, *, columns: list[str], current: tuple[str, bool] | None = None): + super().__init__() + self.columns = columns + self._current = current + + def compose(self) -> ComposeResult: + cur_col, cur_rev = self._current or (None, False) + with Vertical(id="sortby-dialog"): + yield Static("Sort by indexed column (Enter applies, R reverses)", id="sortby-title") + yield OptionList( + *(Option(name, id=str(i)) for i, name in enumerate(self.columns)), id="sortby-list" + ) + yield Checkbox("Reverse (descending)", value=cur_rev, id="sortby-reverse") + + def on_mount(self) -> None: + option_list = self.query_one("#sortby-list", OptionList) + cur_col = (self._current or (None, False))[0] + option_list.highlighted = self.columns.index(cur_col) if cur_col in self.columns else 0 + option_list.focus() + + def action_toggle_reverse(self) -> None: + checkbox = self.query_one("#sortby-reverse", Checkbox) + checkbox.value = not checkbox.value + + def on_option_list_option_selected(self, event: OptionList.OptionSelected) -> None: + if event.option.id is not None: + reverse = self.query_one("#sortby-reverse", Checkbox).value + self.dismiss((self.columns[int(event.option.id)], reverse)) + + def action_cancel(self) -> None: + self.dismiss(None) + + def _plot_view(series: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray, str]: """Turn a ``plot_series`` result into drawable arrays + a method label. 
@@ -722,9 +789,11 @@ def _plot_view(series: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray, str]: finite = np.isfinite(ymin) & np.isfinite(ymax) x, ymin, ymax = x[finite], ymin[finite], ymax[finite] method = series.get("method") - descr = {"summary": "min/max envelope", "reduce": "min/max envelope"}.get( - method, "sampled — may miss extremes" - ) + descr = { + "summary": "min/max envelope", + "reduce": "min/max envelope", + "sorted": "min/max envelope", + }.get(method, "sampled — may miss extremes") return x, ymin, ymax, descr @@ -796,7 +865,7 @@ def action_cancel(self) -> None: class PlotScreen(ModalScreen["tuple[int, int] | None"]): """Modal plotting one numeric column; zoomable into a row sub-range. - Keys: ``+``/``-`` zoom about the view centre, ``←``/``→`` pan, ``0`` reset to + Keys: ``+``/``-`` zoom about the view's left edge, ``←``/``→`` pan, ``0`` reset to the whole series, ``g`` type an exact ``start:stop`` range. Each change re-fetches the envelope for the new range (exact for sub-ranges) via the *fetch* closure, so zooming reveals detail the whole-series buckets hide. 
@@ -829,15 +898,14 @@ class PlotScreen(ModalScreen["tuple[int, int] | None"]): } """ - _KEYS_HINT = "+/- zoom · ←/→ pan · 0 reset · g range · v view rows · h hi-res · s scatter · q close" + _KEYS_HINT = "+/- zoom · ←/→ pan · 0 reset · g range · v view rows · h hi-res · s scatter · esc close" _MIN_WIDTH = 16 # smallest zoom window (rows), so the envelope still reads _HIRES_MAX_POINTS = 50_000 # above this, the hi-res raw view is strided-sampled _SCATTER_MAX_POINTS = 50_000 # above this, the col-vs-col scatter is strided-sampled BINDINGS: ClassVar = [ ("escape", "close", "Close"), - ("q", "close", "Close"), - ("p", "close", "Close"), + ("q", "app.quit", "Quit b2view"), ("plus", "zoom_in", "Zoom in"), ("equals_sign", "zoom_in", "Zoom in"), ("minus", "zoom_out", "Zoom out"), @@ -920,10 +988,10 @@ def _set_range(self, start: int, stop: int) -> None: def _zoom(self, factor: float) -> None: width = self.row_stop - self.row_start - center = (self.row_start + self.row_stop) // 2 new_w = width // 2 if factor < 1 else width * 2 new_w = max(min(self._MIN_WIDTH, self.n), min(self.n, new_w)) - start = max(0, min(center - new_w // 2, self.n - new_w)) + # Anchor on the left edge so the zoomed plot starts where it did before. 
+ start = max(0, min(self.row_start, self.n - new_w)) self._set_range(start, start + new_w) def _pan(self, direction: int) -> None: @@ -1069,11 +1137,11 @@ class ScatterPlotScreen(ModalScreen[None]): } """ - _KEYS_HINT = "h hi-res · q/esc back to plot" + _KEYS_HINT = "h hi-res · esc back to plot" BINDINGS: ClassVar = [ ("escape", "close", "Close"), - ("q", "close", "Close"), + ("q", "app.quit", "Quit b2view"), ("h", "hires", "High-res"), ] @@ -1201,8 +1269,7 @@ class HiResPlotScreen(ModalScreen[None]): BINDINGS: ClassVar = [ ("escape", "close", "Close"), - ("q", "close", "Close"), - ("h", "close", "Close"), + ("q", "app.quit", "Quit b2view"), ("r", "toggle_raw", "Raw/envelope"), ] @@ -1242,8 +1309,8 @@ def __init__( @property def _keys_hint(self) -> str: if self._can_toggle: - return "r raw/envelope · q/esc/h back to braille" - return "q/esc/h · back to braille" + return "r raw/envelope · esc back to braille" + return "esc · back to braille" def _current_title(self) -> str: if self._mode == "scatter": @@ -1349,8 +1416,8 @@ class CellDetailScreen(ModalScreen[None]): Reached with Return on an expensive (list/struct/object/ndarray) column whose grid cell shows a ``<...; skipped>`` placeholder; the value is decoded - on demand. The table stays underneath with its position intact (esc/q/enter - return). + on demand. The table stays underneath with its position intact (esc + returns). """ CSS = """ @@ -1383,8 +1450,7 @@ class CellDetailScreen(ModalScreen[None]): BINDINGS: ClassVar = [ ("escape", "close", "Close"), - ("q", "close", "Close"), - ("enter", "close", "Close"), + ("q", "app.quit", "Quit b2view"), ] def __init__(self, *, row: int, name: str, label: str, value: Any): @@ -1404,7 +1470,7 @@ def compose(self) -> ComposeResult: # A VerticalScroll is focusable, so the screen's key bindings fire. 
with VerticalScroll(id="cell-body"): yield Static(markup_escape(text)) - yield Static("esc/q · close", id="cell-keys") + yield Static("esc · close", id="cell-keys") def on_mount(self) -> None: self.query_one("#cell-body", VerticalScroll).focus() @@ -1655,6 +1721,8 @@ class B2ViewApp(App): Binding("e", "grid_col_end", "Row end", show=False), Binding("c", "go_to_column", "Go to column", show=False), Binding("f", "filter_rows", "Filter rows", show=False), + Binding("S", "sort_rows", "Sort by", show=False), + Binding("R", "reverse_sort", "Reverse sort", show=False), Binding("slash", "filter_columns", "Filter columns", show=False), Binding("p", "plot_column", "Plot column", show=False), Binding("d", "dim_cycle", "Dim mode", show=False), @@ -1668,6 +1736,7 @@ def __init__( *, start_path: str = "/", start_panel: str = "tree", + start_maximized: bool = False, preview_rows: int = 20, preview_cols: int = 10, download_url: str | None = None, @@ -1683,6 +1752,7 @@ def __init__( self._header_label = urlpath self.start_path = start_path self.start_panel = start_panel + self.start_maximized = start_maximized self.preview_rows = preview_rows self.preview_cols = preview_cols self.browser: StoreBrowser | None = None @@ -1691,6 +1761,9 @@ def __init__( self.table_page: dict | None = None self.table_buffer: dict | None = None self.grid_col_start = 0 + # Sticky visible-column count: (layout key, count). Keeps the column + # set stable across vertical scroll / sort reverse (see _load_table_page). 
+ self._col_fit: tuple[tuple, int] | None = None self._data_layout: DataSliceLayout | None = None self._active_dim = 0 self._dim_mode = False @@ -1720,7 +1793,7 @@ def compose(self) -> ComposeResult: with B2ViewPanel(id="data-pane") as data_pane: data_pane.border_title = "data" data_pane.border_subtitle = ( - "?(help) | d(im mode) | filter: f(rows) /(cols) | " + "?(help) | d(im mode) | filter: f(rows) /(cols) | S(ort) | " "rows: t/b/g(oto) | cols: s/e/c(goto) | p(lot)" ) yield Static("", id="data-header") @@ -1789,8 +1862,13 @@ def _start_browsing(self) -> None: self.call_after_refresh(self.update_panels, "/") def _apply_start_focus(self) -> None: - """Focus the panel requested on startup (the --panel option).""" + """Focus the panel requested on startup (the --panel option), and + maximize it too when --max was given.""" self._focus_panel_by_name(self.start_panel) + if self.start_maximized: + # Defer a frame: .focus() above is scheduled, so the new focus (which + # action_maximize_panel reads) isn't applied yet this tick. + self.call_after_refresh(self.action_maximize_panel) def _focus_panel_by_name(self, name: str) -> None: """Focus a panel by its user-facing name.""" @@ -2056,6 +2134,19 @@ def _trim_columns_to_fit(self, data: dict) -> dict: break total += width keep += 1 + return self._take_n_columns(data, keep) + + def _take_n_columns(self, data: dict, n: int) -> dict: + """Keep the first *n* columns of a paged *data* window (clamped to range). + + Width-based fitting (:meth:`_trim_columns_to_fit`) is recomputed from the + currently visible rows, so it can vary as you scroll or reverse a sort. + Pinning the count keeps the visible column set stable across those + re-renders (the sticky fit in :meth:`_load_table_page`). 
+ """ + if data.get("source_kind") not in _COL_PAGED_KINDS: + return data + keep = max(1, min(n, len(data["columns"]))) if keep >= len(data["columns"]): return data kept = data["columns"][:keep] @@ -2109,6 +2200,25 @@ def _table_page_size(self) -> int: height = self.query_one("#data-pane", Vertical).size.height - 2 return max(1, height - 1) if height > 1 else max(1, self.preview_rows) + def _col_fit_key(self) -> tuple: + """Identity of the current column layout for the sticky column-count fit. + + Changes (forcing a width re-fit) on a new node, a horizontal scroll, an + ndarray dim/fixed-value change, a column filter, or a terminal resize — + but not on vertical scroll, sort reverse or row filter, which keep the + same columns. + """ + layout = self._data_layout + layout_sig = None + if layout is not None: + layout_sig = ( + tuple(layout.navigable_dims), + tuple(sorted(layout.fixed_values.items())), + tuple(layout.shape), + ) + col_filter = self.browser.get_column_filter(self.selected_path) if self.browser else None + return (self.selected_path, self.grid_col_start, layout_sig, col_filter, self._data_table_width()) + def _load_table_page(self, path: str, start: int) -> dict: if self.browser is None: raise RuntimeError("Store browser is not open") @@ -2166,7 +2276,21 @@ def _load_table_page(self, path: str, start: int) -> dict: max_cols=self._candidate_max_cols(), col_start=self.grid_col_start, ) - data = self._trim_columns_to_fit(data) + # The visible column count is sticky for a given column layout: recompute + # the width-based fit only when the layout key changes (node, horizontal + # position, ndarray dims, column filter). Vertical scrolling, reversing a + # sort and row filtering keep the same columns instead of dropping one + # when the freshly visible rows happen to measure wider. 
+ fit_key = self._col_fit_key() + if self._col_fit is not None and self._col_fit[0] == fit_key: + data = self._take_n_columns(data, self._col_fit[1]) + else: + data = self._trim_columns_to_fit(data) + # Only remember the count once the layout has settled (a real + # width-based fit); before that the trim is a no-op and would pin a + # bloated count that overflows the table on later renders. + if data.get("source_kind") in _COL_PAGED_KINDS and self._data_table_width() > 1: + self._col_fit = (fit_key, len(data["columns"])) data["viewport_width"] = self._data_table_width() self.table_buffer = data data = self._slice_table_buffer(start, page_size) @@ -2413,12 +2537,19 @@ def _window_and_filter_chips(self, data: dict) -> list[str]: if data.get("source_kind") == "ctable" and self.browser is not None: flt = self.browser.get_filter(self.selected_path) col_flt = self.browser.get_column_filter(self.selected_path) + sort = self.browser.get_sort(self.selected_path) if flt: total = self.browser.base_nrows(self.selected_path) chips.append(f"filter: [bold]{markup_escape(flt)}[/bold] ({total} total)") + if sort: + col, reverse = sort + arrow = "▼" if reverse else "▲" + chips.append(_accent_chip(f"SORTED {arrow} {markup_escape(col)}")) if col_flt: chips.append(f"cols: [bold]{markup_escape(col_flt)}[/bold]") - if flt or col_flt or self.row_window is not None: + if sort: + chips.append("everse") + if flt or col_flt or sort or self.row_window is not None: chips.append("unlock/clear") return chips @@ -2736,6 +2867,68 @@ def action_filter_rows(self) -> None: screen = FilterScreen(current=self.browser.get_filter(self.selected_path)) self.push_screen(screen, self._apply_filter) + def action_sort_rows(self) -> None: + if not self._in_data_grid(): + return + if self.table_page.get("source_kind") != "ctable": + self.notify("Sorting is only supported for CTable nodes", severity="warning") + return + columns = self.browser.full_index_columns(self.selected_path) + if not columns: + 
self.notify("No FULL-indexed columns to sort by", severity="warning") + return + screen = SortByScreen(columns=columns, current=self.browser.get_sort(self.selected_path)) + self.push_screen(screen, self._apply_sort) + + def _apply_sort(self, choice: tuple[str, bool] | None, *, reposition: bool = True) -> None: + if choice is None or self.browser is None or self.table_page is None: + return # cancelled + column, reverse = choice + try: + self.browser.set_sort(self.selected_path, column, reverse) + except Exception as exc: + self.notify(f"Cannot sort: {exc}", severity="error") + return + self.row_window = None # set_sort drops any window/filter; keep the chip in sync + self.table_buffer = None + # Park the cursor on the sorted column's first row. On the initial sort + # ('S') bring the column into view, clamping the window start to the tail + # ('End') position so the natural left-to-right column order is preserved + # and the last column shows a full window, not a lone column. On reverse + # ('R') leave the horizontal scroll where it is — only the order flips. 
+ names = self.browser.column_names(self.selected_path) or [] + col_idx = names.index(column) if column in names else 0 + if reposition: + self.grid_col_start = min(col_idx, self._fit_col_start_backward(self.table_page["ncols"])) + data = self._load_table_page(self.selected_path, 0) + cursor_col = max(0, col_idx - data.get("col_start", 0)) + self._update_data_table(data, cursor_row=0, cursor_col=cursor_col) + self._update_data_header(data) + self.query_one("#data-table", DataTable).focus() + + def action_reverse_sort(self) -> None: + """Flip ascending/descending on the currently sorted column.""" + if ( + not self._in_data_grid() + or self.table_page.get("source_kind") != "ctable" + or self.browser is None + ): + return + sort = self.browser.get_sort(self.selected_path) + if sort is None: + return + column, reverse = sort + self._apply_sort((column, not reverse), reposition=False) + + def _clear_sort(self) -> None: + """Escape out of a sort view, restoring original row order.""" + self.browser.clear_sort(self.selected_path) + self.table_buffer = None + data = self._load_table_page(self.selected_path, 0) + self._update_data_table(data, cursor_row=0, cursor_col=0) + self._update_data_header(data) + self.query_one("#data-table", DataTable).focus() + def _apply_filter(self, expr: str | None) -> None: if expr is None or self.browser is None or self.table_page is None: return @@ -3043,6 +3236,8 @@ def action_dim_exit(self) -> None: return if self.browser.get_filter(self.selected_path): self._apply_filter("") + elif self.browser.get_sort(self.selected_path): + self._clear_sort() elif self.browser.get_column_filter(self.selected_path): self.browser.set_column_selection(self.selected_path, None) self._reload_columns() diff --git a/src/blosc2/b2view/cli.py b/src/blosc2/b2view/cli.py index 9e5cfa98f..c91d77beb 100644 --- a/src/blosc2/b2view/cli.py +++ b/src/blosc2/b2view/cli.py @@ -66,6 +66,12 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", 
help="Capture the mouse for clicking and scrolling (disables the terminal's native text selection)", ) + parser.add_argument( + "--max", + dest="maximized", + action="store_true", + help="Maximize the focused panel on startup (same as pressing 'm')", + ) return parser @@ -103,6 +109,7 @@ def main(argv: list[str] | None = None) -> int: urlpath, start_path=args.path, start_panel=args.panel, + start_maximized=args.maximized, preview_rows=args.preview_rows, preview_cols=args.preview_cols, download_url=download_url, diff --git a/src/blosc2/b2view/model.py b/src/blosc2/b2view/model.py index d21b4c1d8..817b358af 100644 --- a/src/blosc2/b2view/model.py +++ b/src/blosc2/b2view/model.py @@ -260,6 +260,9 @@ def __init__(self, urlpath: str): self._filter_views: dict[str, Any] = {} # Per-path locked row windows for CTable nodes (path -> slice() view) self._window_views: dict[str, Any] = {} + # Per-path sort order for CTable nodes (path -> (column, reverse) / view) + self._sorts: dict[str, tuple[str, bool]] = {} + self._sort_views: dict[str, Any] = {} # Per-path column filters (path -> substring pattern / matched names) self._column_filters: dict[str, str] = {} self._column_selections: dict[str, list[str]] = {} @@ -387,12 +390,9 @@ def preview( return preview_array_1d(obj, start=start, stop=stop) return preview_array(obj, slices=slices, max_rows=max_rows, max_cols=max_cols) if kind == "ctable": - # A locked row window (set by 'v') takes precedence; it is sliced - # from whatever was visible, so it already folds in any row filter. - if path in self._window_views: - obj = self._window_views[path] - else: - obj = self._filter_views.get(path, obj) + # Read precedence: a locked row window (set by 'v') wins, then a row + # filter, then a sort view; all already fold in their predecessors. 
+ obj = self._ordered_object(path, obj) if columns is None: columns = self._column_selections.get(path) stop = min(start + max_rows, len(obj)) if stop is None else stop @@ -443,22 +443,27 @@ def plot_series( kind = object_kind(obj) if kind == "ctable": - # A locked row window (set by 'v') takes precedence over any row - # filter, mirroring preview()/read_cell(): a plot shows exactly the - # rows the grid is showing. The SUMMARY fast-path spans the whole - # column, so it is only valid when neither narrows the series. - if path in self._window_views: - view = self._window_views[path] - narrowed = True - else: - view = self._filter_views.get(path, obj) - narrowed = path in self._filter_views + # Window/filter/sort precedence mirrors preview()/read_cell(): a plot + # shows exactly the rows (and order) the grid is showing. The SUMMARY + # fast-path spans the whole column in original order, so it is only + # valid when nothing narrows *or reorders* the series. + view = self._ordered_object(path, obj) + narrowed = view is not obj n = len(view) start, stop = self._clamp_range(row_start, row_stop, n) if start == 0 and stop == n and not narrowed: env = self._column_summary_envelope(obj, column, n, max_points) if env is not None: return {**env, "n": n, "row_start": start, "row_stop": stop, "method": "summary"} + # Range ordered purely by the very column we're plotting: the sort + # view is monotonic over any contiguous range, so we can read just + # the bucket boundaries instead of gathering every value (this also + # accelerates zooming, where start/stop is a sub-range). 
+ sort = self._sorts.get(path) + sorted_only = path not in self._window_views and path not in self._filter_views + if sorted_only and sort is not None and sort[0] == column: + env = self._sorted_column_envelope(view[column], start, stop, n, max_points) + return {**env, "n": n, "row_start": start, "row_stop": stop, "method": "sorted"} col = view[column] chunks = getattr(col, "chunks", None) return self._range_envelope( @@ -541,12 +546,9 @@ def read_series( kind = object_kind(obj) if kind == "ctable": - # Honor a locked row window first, then any row filter, matching - # preview()/read_cell() so the hi-res view tracks the visible grid. - if path in self._window_views: - view = self._window_views[path] - else: - view = self._filter_views.get(path, obj) + # Window > filter > sort, matching preview()/read_cell() so the + # hi-res view tracks the visible grid (rows and order). + view = self._ordered_object(path, obj) n = len(view) start, stop = self._clamp_range(row_start, row_stop, n) stride = self._series_stride(stop - start, max_points) @@ -621,12 +623,9 @@ def read_xy( if kind != "ctable": raise ValueError("Scatter requires a CTable source") - # Honor a locked row window first, then any row filter, matching - # read_series() so the scatter tracks exactly the visible rows. - if path in self._window_views: - view = self._window_views[path] - else: - view = self._filter_views.get(path, obj) + # Window > filter > sort, matching read_series() so the scatter tracks + # exactly the visible rows (and order). + view = self._ordered_object(path, obj) n = len(view) start, stop = self._clamp_range(row_start, row_stop, n) width = stop - start @@ -660,12 +659,9 @@ def read_cell(self, path: str, column: str, row: int) -> Any: path = self.normalize_path(path) obj = self._get_object(path) if object_kind(obj) == "ctable": - # Same precedence as preview(): a locked row window wins over a - # filter view, so the visible row index resolves the same cell. 
- if path in self._window_views: - obj = self._window_views[path] - else: - obj = self._filter_views.get(path, obj) + # Same precedence as preview() (window > filter > sort) so the + # visible row index resolves the same cell. + obj = self._ordered_object(path, obj) values = obj[column][row : row + 1] if len(values) == 0: raise IndexError(f"row {row} is out of range") @@ -713,14 +709,54 @@ def _range_envelope( env["x"] = np.asarray(env["x"]) + start return {**env, **base, "method": "reduce"} + def _sorted_column_envelope( + self, col: Any, start: int, stop: int, n: int, max_points: int + ) -> dict[str, np.ndarray]: + """Exact min/max envelope of rows ``[start, stop)`` of a column read + through its own sort view, with ``x`` in absolute row coordinates. + + When the plotted column *is* the column the view is sorted by, the values + are monotonic over any contiguous range (NaNs land in a contiguous block + at the very end), so each bucket's min/max are just its two endpoints — + no need to gather every value. We read only the ~2*nbuckets boundary + values plus, for the lone finite/NaN transition bucket, that one bucket + in full. Bit-identical to the full-read reduce path, but ~50x cheaper. + + ponytail: relies on the sort view being monotonic; only call it when the + plotted column equals the sort column (see :meth:`plot_series`). + """ + rng = stop - start + group, nbuckets = _bucket_geometry(rng, max_points) + if nbuckets == 0: + empty = np.empty(0) + return {"x": empty, "ymin": empty, "ymax": empty} + offs = np.arange(nbuckets) * group # bucket starts, relative to *start* + ends = start + np.minimum(offs + group - 1, rng - 1) + vstart = np.asarray(col[start:stop:group], dtype=float)[:nbuckets] + vend = np.asarray(col[ends], dtype=float) + ymin = np.fmin(vstart, vend) + ymax = np.fmax(vstart, vend) + # A bucket straddling the finite/NaN boundary has one NaN endpoint; its + # interior extreme is hidden, so read that one bucket exactly. 
+ for i in np.flatnonzero(np.isnan(vstart) != np.isnan(vend)): + s = start + int(offs[i]) + seg = np.asarray(col[s : min(s + group, stop)], dtype=float) + ymin[i] = np.nanmin(seg) + ymax[i] = np.nanmax(seg) + x = start + np.minimum(offs, max(0, rng - 1)) + return {"x": x, "ymin": ymin, "ymax": ymax} + def _column_summary_envelope( self, table: Any, column: str | int | None, n: int, max_points: int ) -> dict[str, np.ndarray] | None: - """Build a min/max envelope from a column's SUMMARY index, or None. + """Build a min/max envelope from a column's index summaries, or None. Reads precomputed per-block ``(min, max)`` from the index — no data - decompression. Returns None when there is no usable summary (non-string - column, no index, non-numeric, or unsupported level). + decompression. Every index kind (SUMMARY, FULL, PARTIAL, BUCKET, OPSI) + persists the same block-level ``(min, max, flags)`` sidecars in its + ``levels`` descriptor, so any indexed numeric column plots instantly + without a dedicated summary index. Returns None when there is no usable + summary (non-string column, no index, non-numeric, or no block level). """ if not isinstance(column, str): return None @@ -728,20 +764,26 @@ def _column_summary_envelope( idx = table.index(column) except Exception: return None - if getattr(idx, "kind", None) != "summary": - return None try: desc = idx.descriptor levels = desc.get("levels") or {} + # Prefer the finest whole-column level available (block), else any. 
level = "block" if "block" in levels else next(iter(levels), None) if level is None or np.dtype(desc["dtype"]).kind not in "iuf": return None - from blosc2.indexing import FLAG_ALL_NAN, _open_level_summary_handle - - handle = _open_level_summary_handle(idx._target_array(), desc, level) + path = levels[level].get("path") + if path is None: + return None # in-memory sidecar: nothing to fast-read + from blosc2.indexing import _INDEX_MMAP_MODE, FLAG_ALL_NAN, _open_sidecar_file + + # Drop the handle after reading: the cached _open_level_summary_handle + # would hold a file descriptor open for the whole session (one per + # plotted column), exhausting the FD limit on large test runs. + handle = _open_sidecar_file(path, _INDEX_MMAP_MODE) bmin = np.asarray(handle["min"][:]) bmax = np.asarray(handle["max"][:]) flags = np.asarray(handle["flags"][:]) + del handle except Exception: return None if bmin.shape[0] == 0: @@ -779,6 +821,7 @@ def set_filter(self, path: str, expr: str | None) -> int: self._filters.pop(path, None) self._filter_views.pop(path, None) return len(self._get_object(path)) + self.clear_sort(path) # filter and sort are mutually exclusive view = self._get_object(path).where(expr) self._filters[path] = expr self._filter_views[path] = view @@ -788,6 +831,43 @@ def get_filter(self, path: str) -> str | None: """Return the active filter expression for *path*, if any.""" return self._filters.get(self.normalize_path(path)) + def full_index_columns(self, path: str) -> list[str]: + """Names of CTable columns at *path* that carry a FULL index (sortable).""" + obj = self._get_object(path) + return [ix.col_name for ix in getattr(obj, "indexes", []) if getattr(ix, "kind", None) == "full"] + + def set_sort(self, path: str, column: str, reverse: bool) -> None: + """Order the CTable at *path* by *column* using its FULL index (zero-copy view). + + The view streams from the sorted index, so the full table is never + materialised. 
Sorting replaces any active row filter/window for *path*. + """ + path = self.normalize_path(path) + self._window_views.pop(path, None) + self._filters.pop(path, None) + self._filter_views.pop(path, None) + view = self._get_object(path).sort_by(column, ascending=not reverse, view=True) + self._sorts[path] = (column, reverse) + self._sort_views[path] = view + + def clear_sort(self, path: str) -> None: + """Drop any sort order from *path*, restoring the original row order.""" + path = self.normalize_path(path) + self._sorts.pop(path, None) + self._sort_views.pop(path, None) + + def get_sort(self, path: str) -> tuple[str, bool] | None: + """Return the active ``(column, reverse)`` sort for *path*, if any.""" + return self._sorts.get(self.normalize_path(path)) + + def _ordered_object(self, path: str, obj: Any) -> Any: + """CTable read precedence for *path*: window > filter > sort > base *obj*.""" + if path in self._window_views: + return self._window_views[path] + if path in self._filter_views: + return self._filter_views[path] + return self._sort_views.get(path, obj) + def set_row_window(self, path: str, start: int, stop: int) -> int: """Lock the CTable at *path* to live rows ``[start:stop]``; return its length. @@ -796,7 +876,10 @@ def set_row_window(self, path: str, start: int, stop: int) -> int: then cannot leave the range because the view reports only its own rows. 
""" path = self.normalize_path(path) - base = self._filter_views.get(path, self._get_object(path)) + if path in self._filter_views: + base = self._filter_views[path] + else: + base = self._sort_views.get(path, self._get_object(path)) view = base.slice(start, stop, copy=False) self._window_views[path] = view return len(view) diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 5839f254c..1d67c4666 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -2314,6 +2314,91 @@ def _lazy_aggregate_fastpath(self, op: str, *, where=None, dtype=None, ddof: int return NotImplemented return NotImplemented + def _summary_minmax_source(self): + """Return ``(sidecar_path, dtype, nullable)`` for a summary-readable + ``min``/``max``, or ``None`` when the index shortcut is not provably + correct. + + Excluded: a view (its summary describes the base table); a column kind + without numeric/string block extrema; a leaky null sentinel (only a + non-nullable column, or a NaN-sentinel float — whose NaNs the summary + drops — match the nulls-skipped contract of ``min()``); and a stale, + absent, or in-memory-only index. Deletions/appends are covered by the + stale flag (every mutation marks the index stale; a rebuild re-summarises + only the live rows, and capacity padding never enters the summaries). 
+ """ + table = self._table + if table.base is not None: + return None + if ( + self.is_computed + or self.is_ndarray + or self.is_list + or self.is_varlen_scalar + or self.is_dictionary + ): + return None + dtype = self.dtype + if dtype is None or dtype.kind not in "biufUS": + return None + col = table._schema.columns_by_name.get(self._col_name) + spec = col.spec if col is not None else None + nullable = getattr(spec, "nullable", False) + null_value = getattr(spec, "null_value", None) + is_nan_float = dtype.kind == "f" and isinstance(null_value, float) and np.isnan(null_value) + if nullable and not is_nan_float: + return None # non-NaN sentinel leaks into the block extrema + desc = table._root_table._get_index_catalog().get(self._col_name) + if not desc or desc.get("stale", False): + return None + levels = desc.get("levels") or {} + level = "block" if "block" in levels else next(iter(levels), None) + if level is None: + return None + path = levels[level].get("path") + if path is None: + return None # in-memory sidecar: the scan is already fast + return path, dtype, nullable + + def _index_summary_minmax(self, op: str): + """Exact ``min``/``max`` from the column index's block summaries, or + ``NotImplemented`` when that shortcut is not applicable (see + :meth:`_summary_minmax_source`). + + Every index kind (SUMMARY/FULL/PARTIAL/BUCKET/OPSI) persists per-block + ``(min, max, flags)``, so reducing those is decompression-free (~240x + faster than scanning tens of millions of rows). + """ + source = self._summary_minmax_source() + if source is None: + return NotImplemented + path, dtype, nullable = source + try: + from blosc2.indexing import _INDEX_MMAP_MODE, FLAG_ALL_NAN, FLAG_HAS_NAN, _open_sidecar_file + + # Read the tiny (min, max, flags) arrays and drop the handle: unlike + # the cached _open_level_summary_handle, this releases the file + # descriptor immediately (min()/max() must not leak one per table). 
+ handle = _open_sidecar_file(path, _INDEX_MMAP_MODE) + flags = np.asarray(handle["flags"][:]) + vals = np.asarray(handle[op][:]) + del handle + except Exception: + return NotImplemented + if vals.shape[0] == 0: + return NotImplemented + # A non-nullable float with NaN *data* makes numpy min/max return NaN, + # but the summary dropped those NaNs — they would disagree, so bail. + if dtype.kind == "f" and not nullable and bool((flags & (FLAG_HAS_NAN | FLAG_ALL_NAN)).any()): + return NotImplemented + valid = (flags & FLAG_ALL_NAN) == 0 + if not valid.any(): + return NotImplemented # whole column null → let the scan raise + vals = vals[valid] + if dtype.kind in "US": + return min(vals) if op == "min" else max(vals) + return vals.min() if op == "min" else vals.max() + def min(self, axis=None, *, where=None): """Minimum live, non-null value. @@ -2330,6 +2415,12 @@ def min(self, axis=None, *, where=None): self._require_kind("biufUS", "min") where = self._normalize_sum_where(where) if where is None: + # Try the index-summary shortcut first: it returns NotImplemented for + # an empty/all-null column, so the emptiness check (which counts live + # rows — expensive on a nullable column) only runs on the fallback. + fast_idx = self._index_summary_minmax("min") + if fast_idx is not NotImplemented: + return fast_idx self._require_nonempty("min") fast = self._lazy_aggregate_fastpath("min", where=where) if fast is not NotImplemented: @@ -2364,6 +2455,10 @@ def max(self, axis=None, *, where=None): self._require_kind("biufUS", "max") where = self._normalize_sum_where(where) if where is None: + # See min(): shortcut before the live-row count. 
+ fast_idx = self._index_summary_minmax("max") + if fast_idx is not NotImplemented: + return fast_idx self._require_nonempty("max") fast = self._lazy_aggregate_fastpath("max", where=where) if fast is not NotImplemented: @@ -3341,6 +3436,23 @@ def _is_varlen_scalar_column(col: CompiledColumn) -> bool: def _is_dictionary_column(col: CompiledColumn) -> bool: return isinstance(col.spec, DictionarySpec) + def _dict_rank_index_stale(self, name: str, dict_rank_meta: dict) -> bool: + """True if a dict-rank FULL index no longer matches the live dictionary. + + The index encodes alphabetical ranks frozen at build time; if the + dictionary gained/changed entries the ranks are wrong, so callers must + fall back to lexsort until the index is rebuilt. + """ + from blosc2.ctable_indexing import _dict_rank_hash + + col = self._root_table._cols.get(name) + if col is None: + return True + dictionary = list(col.dictionary) + if len(dictionary) != dict_rank_meta.get("dict_len"): + return True + return _dict_rank_hash(dictionary) != dict_rank_meta.get("dict_hash") + @staticmethod def _is_ndarray_column(col: CompiledColumn) -> bool: return isinstance(col.spec, NDArraySpec) @@ -9911,16 +10023,19 @@ def _normalise_sort_keys( raise TypeError( f"Column {name!r} is a varlen scalar column and does not support sort ordering." ) - raise TypeError( - f"Column {name!r} is a list column and does not support sort ordering in V1." - ) + if cc is not None and self._is_dictionary_column(cc): + pass # dictionary columns: sorting supported (decoded strings) + else: + raise TypeError( + f"Column {name!r} is a list column and does not support sort ordering in V1." + ) if np.issubdtype(dtype, np.complexfloating): raise TypeError( f"Column {name!r} has complex dtype {dtype} which does not support ordering." 
) return cols, ascending - def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.ndarray | None: + def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.ndarray | None: # noqa: C901 """Return live physical positions from a matching FULL index, if available. Reads the pre-sorted positions sidecar directly rather than going through @@ -9931,13 +10046,25 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd catalog = root._get_index_catalog() descriptor = None + null_value = None + null_code = None + is_dict_rank = False if name in root._cols: col_info = root._schema.columns_by_name.get(name) - if col_info is not None and getattr(col_info.spec, "null_value", None) is not None: - return None + if col_info is not None: + null_value = getattr(col_info.spec, "null_value", None) + if isinstance(col_info.spec, DictionarySpec): + null_code = col_info.spec.null_code descriptor = catalog.get(name) if descriptor is None or descriptor.get("kind") != "full" or descriptor.get("stale", False): descriptor = None + else: + dict_rank_meta = descriptor.get("full", {}).get("dict_rank") + if dict_rank_meta is not None: + if self._dict_rank_index_stale(name, dict_rank_meta): + descriptor = None # ranks no longer match dictionary → lexsort + else: + is_dict_rank = True elif name in root._computed_cols: cc = root._computed_cols[name] for _lookup_key, candidate in catalog.items(): @@ -9960,8 +10087,12 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd # machinery which is built for selective range queries and is ~70x slower # for full-table streaming. if positions_path is not None: - # Persistent table: positions live in a sidecar .b2nd file. - positions_nd = blosc2.open(positions_path, mode="r") + # Persistent table: positions live in a sidecar .b2nd file. 
Use the + # sidecar opener so .b2z (zip) stores are read at their zip offset — + # blosc2.open() would look for a standalone file that isn't there. + from blosc2.indexing import _open_sidecar_file + + positions_nd = _open_sidecar_file(positions_path) else: # In-memory table: positions live in the sidecar handle cache. from blosc2.indexing import _SIDECAR_HANDLE_CACHE, _sidecar_handle_cache_key @@ -9976,13 +10107,61 @@ def _sorted_positions_from_full_index(self, name: str, ascending: bool) -> np.nd return None positions = np.asarray(positions_nd[:], dtype=np.int64) - valid = root._valid_rows[:] - positions = np.asarray(positions, dtype=np.int64) - positions = positions[(positions >= 0) & (positions < len(valid))] - positions = positions[valid[positions]] + total = len(root._valid_rows) + # Index sidecars can carry padding positions beyond the live range, so + # the bounds clip always runs — but the ``.all()`` check skips the copy + # (and a 24M-element temporary) when there is nothing to clip. + in_bounds = (positions >= 0) & (positions < total) + if not bool(in_bounds.all()): + positions = positions[in_bounds] + del in_bounds + # Validity filtering only matters when the table has gaps (deleted rows); + # for a compact table every clipped position is already live. + if root._n_rows is None or root._n_rows != total: + valid = root._valid_rows[:] + positions = positions[valid[positions]] if self is not root: current_valid = self._valid_rows[:] positions = positions[current_valid[positions]] + + if is_dict_rank: + # Dict-rank index: positions sorted by rank (int32), nulls have sentinel null_rank. + # Partition null rows using codes (int32), not decoded strings. 
+ codes = np.asarray(root._cols[name].codes[:], dtype=np.int32) + null_phys = codes == null_code + del codes + if null_phys.any(): + is_null = null_phys[positions] + del null_phys + nulls = positions[is_null] + nonnull = positions[~is_null] + del is_null, positions + if not ascending: + nonnull = nonnull[::-1] + return np.concatenate([nonnull, nulls]) + # No nulls: fall through to simple reverse + elif null_value is not None: + # The index sorts by raw value, but sort_by's contract is nulls-last. + # Partition explicitly so it holds for any sentinel (NaN sorts last, + # an integer sentinel like INT64_MIN sorts first) and either order. + # Free each 24M-element temporary as soon as it is consumed to keep + # peak memory near the size of the permutation itself. + raw = np.asarray(root._cols[name][:]) + if isinstance(null_value, float) and np.isnan(null_value): + null_phys = np.isnan(raw) + else: + null_phys = raw == null_value + del raw + if null_phys.any(): + is_null = null_phys[positions] + del null_phys + nulls = positions[is_null] + nonnull = positions[~is_null] + del is_null, positions + if not ascending: + nonnull = nonnull[::-1] + return np.concatenate([nonnull, nulls]) + if not ascending: positions = positions[::-1] return positions @@ -10003,23 +10182,27 @@ def _build_lex_keys( lex_keys = [] for name, asc in zip(reversed(cols), reversed(ascending), strict=True): cc = self._computed_cols.get(name) + col_info = self._schema.columns_by_name.get(name) + is_dict_col = False if cc is not None: # Materialise computed column values at live positions raw = np.asarray(self._build_computed_lazy(cc)[:])[live_pos] else: - col_info = self._schema.columns_by_name.get(name) - if col_info is not None and self._is_dictionary_column(col_info): + is_dict_col = col_info is not None and self._is_dictionary_column(col_info) + if is_dict_col: # Sort dictionary columns by decoded string values. 
decoded = self._cols[name][live_pos] raw = np.array(decoded, dtype=object) + # Replace None with placeholder so lexsort never compares None. + # Null indicator key (below) already places nulls last. + raw[raw == None] = "" # noqa: E711 else: raw = self._cols[name][live_pos] - col_info = self._schema.columns_by_name.get(name) nv = getattr(col_info.spec, "null_value", None) if col_info else None # Value key if not asc: - if raw.dtype.kind in "US": + if raw.dtype.kind in "USO": # strings can't be negated — invert via rank rank = np.argsort(np.argsort(raw, kind="stable"), kind="stable") lex_keys.append((n - 1 - rank).astype(np.intp)) @@ -10032,7 +10215,12 @@ def _build_lex_keys( # Null indicator key — more significant than the value key above, # so nulls always sort last (0 before 1 → non-null before null). - if nv is not None: + if is_dict_col and col_info.spec.nullable: + null_code = col_info.spec.null_code + codes_at_pos = np.asarray(self._cols[name].codes[live_pos], dtype=np.int32) + null_ind = (codes_at_pos == null_code).astype(np.intp) + lex_keys.append(null_ind) + elif nv is not None: if isinstance(nv, float) and np.isnan(nv): null_ind = np.isnan(raw).astype(np.intp) else: @@ -10047,8 +10235,15 @@ def sort_by( ascending: bool | list[bool] = True, *, inplace: bool = False, + view: bool = False, ) -> CTable: - """Return a copy of the table sorted by one or more columns. + """Return the table sorted by one or more columns. + + By default this materialises a new in-memory copy of the sorted rows. + Pass ``view=True`` to instead get a lightweight **sorted view** that + shares the parent's column data and gathers rows on demand in sorted + order — no whole-table copy. This is ideal for reading a sorted slice + of a large persistent table (e.g. ``t.sort_by("col", view=True)[:10]``). Parameters ---------- @@ -10069,17 +10264,31 @@ def sort_by( ``self`` (like :meth:`compact` but sorted). If ``False`` (default), return a new in-memory CTable leaving this one untouched. 
+ view: + If ``True``, return a zero-copy sorted **view** over this table + instead of materialising a copy: it shares the parent's columns and + stores only the sort permutation, gathering rows on demand in sorted + order. Slicing the view (``sv[start:stop:step]``) keeps the sorted + order and touches only the rows read. A single-column sort backed by + a non-stale ``FULL`` index reuses its pre-sorted positions (no sort at + read time); otherwise only the sort-key column(s) are materialised to + build the permutation — never the whole table. Mutually exclusive + with ``inplace``. Sorting an existing view is always lazy regardless + of this flag. Raises ------ ValueError - If called on a view or a read-only table when ``inplace=True``. + If called on a view or a read-only table when ``inplace=True``, or if + both ``inplace`` and ``view`` are ``True``. KeyError If any column name is not found. TypeError If a column used as a sort key does not support ordering (e.g. complex numbers). """ + if inplace and view: + raise ValueError("inplace=True and view=True are mutually exclusive.") if self.base is not None and inplace: raise ValueError( "Cannot sort a view inplace (would modify shared column data). Use sort_by(inplace=False) to get a sorted copy." @@ -10120,7 +10329,7 @@ def sort_by( # use those positions directly, so columns are fetched on demand and in # the correct sorted order — identical performance to pre-projecting # with columns= before calling sort_by. - if self.base is not None: + if self.base is not None or view: result = CTable._make_view(self, self._valid_rows) result._cached_live_positions = sorted_pos result._n_rows = n @@ -10128,6 +10337,144 @@ def sort_by( return self._sorted_copy_from_positions(sorted_pos, n) + def sorted_slice(self, col: str, key: slice, *, ascending: bool = True) -> CTable: + """Return rows ``key`` in ``col``-sorted order, reading only the slice window. 
+ + Like ``sort_by(col, ascending=ascending, view=True)[key]`` but, when ``col`` + has a usable FULL index, it reads just the needed window of the index's + position sidecar instead of materialising the whole-table permutation — + ideal for small slices (top/bottom *k*). Falls back to the full sorted + view (same result) whenever the window path does not apply. + """ + if not isinstance(key, slice): + raise TypeError("sorted_slice expects a slice") + pos = self._sorted_slice_positions(col, ascending, key) + if pos is None: + return self.sort_by(col, ascending=ascending, view=True)[key] + return self._view_from_positions(pos) + + def _sorted_slice_positions(self, name: str, ascending: bool, key: slice) -> np.ndarray | None: + """Physical positions for the sorted slice ``key``, reading only the window. + + Returns ``None`` (so the caller falls back to the full path) unless this is + a base table with a non-stale, persistent FULL index over a compact, + unpadded column whose null sentinel (if any) is an int, float, str or bytes. + """ + if self.base is not None: + return None + descriptor = self._get_index_catalog().get(name) + if not descriptor or descriptor.get("kind") != "full" or descriptor.get("stale", False): + return None + full = descriptor.get("full") or {} + positions_path = full.get("positions_path") + if positions_path is None: # in-memory sidecar: not worth a partial-read path + return None + + n = self._n_rows + total = len(self._valid_rows) + if n is None or n != total: # deletions → positions are not a clean permutation + return None + + col_info = self._schema.columns_by_name.get(name) + null_value = getattr(col_info.spec, "null_value", None) if col_info is not None else None + # Dict-rank index: use null_rank (int32) as sentinel for null-block location.
+ dict_rank = full.get("dict_rank") + if dict_rank is not None: + if self._dict_rank_index_stale(name, dict_rank): + return None # ranks no longer match dictionary → lexsort + null_value = dict_rank["null_rank"] + # Numeric / NaN / string sentinels keep the null rows in one contiguous block + # once sorted; other non-numeric sentinels (e.g. object) would need a + # different locator. + if null_value is not None and not isinstance(null_value, (int, float, str, bytes)): + return None + + from blosc2.indexing import _open_sidecar_file + + pnd = _open_sidecar_file(positions_path) + if len(pnd) != total: # capacity padding → window read would be wrong + return None + + result_idx = np.arange(*key.indices(n), dtype=np.int64) + if result_idx.size == 0: + return np.empty(0, dtype=np.int64) + + # Locate the (contiguous) null block [null_lo, null_hi) in the sorted order. + null_lo, null_hi = self._null_block_bounds(full, null_value, n) + + # Map each requested result index to its index in the sorted sidecar. The + # nulls-last order is the non-null rows (forward or reversed) followed by + # the null block, where the non-null rows are everything outside the block. 
+ sidecar_idx = np.empty_like(result_idx) + if ascending: + len_below = null_lo # non-null rows sorted below the null block + len_above = n - null_hi # non-null rows sorted above it + below = result_idx < len_below + above = (result_idx >= len_below) & (result_idx < len_below + len_above) + nulls = result_idx >= len_below + len_above + sidecar_idx[below] = result_idx[below] + sidecar_idx[above] = null_hi + (result_idx[above] - len_below) + sidecar_idx[nulls] = null_lo + (result_idx[nulls] - len_below - len_above) + else: + len_above = n - null_hi # largest non-null rows come first + len_below = null_lo + above = result_idx < len_above + below = (result_idx >= len_above) & (result_idx < len_above + len_below) + nulls = result_idx >= len_above + len_below + sidecar_idx[above] = (n - 1) - result_idx[above] + sidecar_idx[below] = (null_lo - 1) - (result_idx[below] - len_above) + sidecar_idx[nulls] = null_lo + (result_idx[nulls] - len_above - len_below) + + lo = int(sidecar_idx.min()) + hi = int(sidecar_idx.max()) + 1 + window = np.asarray(pnd[lo:hi], dtype=np.int64) + return window[sidecar_idx - lo] + + def _null_block_bounds(self, full: dict, null_value, n: int) -> tuple[int, int]: + """Return ``[null_lo, null_hi)``: the null rows' span in the sorted sidecar. + + Empty (``null_lo == null_hi``) when the column is non-nullable or has no + null rows. Reads only a handful of sidecar blocks, never the whole array. + """ + if null_value is None: + return n, n + from blosc2.indexing import _open_sidecar_file + + vnd = _open_sidecar_file(full["values_path"]) + if isinstance(null_value, float) and np.isnan(null_value): + # NaN sorts last and breaks ordered comparisons, so count the trailing + # block directly, one chunk at a time (peak memory = a single chunk). 
+ chunk = int(vnd.chunks[0]) if vnd.chunks else len(vnd) + count = 0 + hi = len(vnd) + while hi > 0: + lo = max(0, hi - chunk) + block = np.isnan(np.asarray(vnd[lo:hi])) + count += int(block.sum()) + if not block.all(): # reached the non-null region + break + hi = lo + return n - count, n + # Ordinary value: the block is wherever the sentinel sorts. Bisect the + # sorted values, reading one block per probe. + return ( + self._sidecar_bisect(vnd, null_value, "left"), + self._sidecar_bisect(vnd, null_value, "right"), + ) + + @staticmethod + def _sidecar_bisect(vnd: blosc2.NDArray, value, side: str) -> int: + """``np.searchsorted`` over an ascending sidecar, reading one element/probe.""" + lo, hi = 0, len(vnd) + while lo < hi: + mid = (lo + hi) // 2 + v = vnd[mid : mid + 1][0] + if (v < value) if side == "left" else (v <= value): + lo = mid + 1 + else: + hi = mid + return lo + def _sorted_small_copy_from_live_positions( self, cols: list[str], ascending: list[bool], live_pos: np.ndarray, n: int ) -> CTable: @@ -10143,8 +10490,11 @@ def _sorted_small_copy_from_live_positions( lex_keys = [] for name, asc in zip(reversed(cols), reversed(ascending), strict=True): col_info = self._schema.columns_by_name.get(name) - if col_info is not None and self._is_dictionary_column(col_info): + is_dict_col = col_info is not None and self._is_dictionary_column(col_info) + if is_dict_col: raw = np.array(self._cols[name][live_pos], dtype=object) + # Replace None with placeholder so lexsort never compares None. 
+ raw[raw == None] = "" # noqa: E711 else: raw = gathered[name] @@ -10159,13 +10509,19 @@ def _sorted_small_copy_from_live_positions( else: lex_keys.append(raw) - nv = getattr(col_info.spec, "null_value", None) if col_info else None - if nv is not None: - if isinstance(nv, float) and np.isnan(nv): - null_ind = np.isnan(raw).astype(np.intp) - else: - null_ind = (raw == nv).astype(np.intp) + if is_dict_col and col_info.spec.nullable: + null_code = col_info.spec.null_code + codes_at_pos = np.asarray(self._cols[name].codes[live_pos], dtype=np.int32) + null_ind = (codes_at_pos == null_code).astype(np.intp) lex_keys.append(null_ind) + else: + nv = getattr(col_info.spec, "null_value", None) if col_info else None + if nv is not None: + if isinstance(nv, float) and np.isnan(nv): + null_ind = np.isnan(raw).astype(np.intp) + else: + null_ind = (raw == nv).astype(np.intp) + lex_keys.append(null_ind) order = np.lexsort(lex_keys) result = self._empty_copy(capacity=n) @@ -11332,6 +11688,12 @@ def _run_row_logic(self, ind: int | slice | str | Iterable) -> CTable: mant_pos = true_pos[ind] + # For an ordered view (sorted view or position view), preserve the row + # order and any duplicates by carrying the positions forward. A boolean + # mask is physical-order and set-like, so it would silently drop both. + if getattr(self, "_cached_live_positions", None) is not None: + return self._view_from_positions(np.asarray(mant_pos)) + new_mask_np = np.zeros(len(self._valid_rows), dtype=bool) new_mask_np[mant_pos] = True diff --git a/src/blosc2/ctable_indexing.py b/src/blosc2/ctable_indexing.py index 90dac675e..a628b0cad 100644 --- a/src/blosc2/ctable_indexing.py +++ b/src/blosc2/ctable_indexing.py @@ -56,6 +56,66 @@ def __init__(self): self.vlmeta = _FakeVlMeta() +def _dict_rank_hash(dictionary) -> str: + """Stable hash of a dictionary's entries (code position + value). 
+ + Used to detect when a rank-based FULL index has gone stale (the alphabetical + ranks it encodes no longer match the live dictionary). Must be stable across + processes — the hash is persisted in the index descriptor and recomputed on a + fresh open; ``hash()`` is PYTHONHASHSEED-salted and would spuriously mismatch, + making the persistent index always fall back. + """ + import hashlib + + h = hashlib.sha1(usedforsecurity=False) + for i, value in enumerate(dictionary): + h.update(repr((i, value)).encode("utf-8")) + return h.hexdigest() + + +class _DictRankWrapper: + """Wrap a dictionary column's codes NDArray, translating codes to alphabetical ranks on read. + + Mirrors the NDArray interface enough for the index builder (dtype, shape, ndim, + chunks, blocks, __getitem__). The rank for code *c* is its position in an + alphabetically-sorted dictionary, so sorting by rank == sorting by decoded string. + Null codes map to a sentinel rank ``null_rank = len(dictionary)`` (largest, so + nulls sort last). + """ + + def __init__( + self, + codes, + code_to_rank: np.ndarray, + null_rank: np.int32, + null_code: int, + nullable: bool, + n_live: int, + ): + self._codes = codes # int32 NDArray + self._code_to_rank = code_to_rank # int32 array mapping code -> rank + self._null_rank = null_rank + self._null_code = null_code + self._nullable = nullable + self.dtype = np.dtype(np.int32) + # The codes array carries capacity padding beyond the live rows; expose only + # the live range so the index sidecars match n_rows (no padding → the + # zero-permutation window read engages instead of falling back). 
+ self.shape = (n_live,) + self.ndim = 1 + chunk0 = codes.chunks[0] if codes.chunks else n_live + block0 = codes.blocks[0] if codes.blocks else n_live + self.chunks = (min(chunk0, n_live),) + self.blocks = (min(block0, n_live),) + + def __getitem__(self, key): + codes_slice = np.asarray(self._codes[key], dtype=np.int32) + ranks = self._code_to_rank[codes_slice] + if self._nullable: + ranks[codes_slice == self._null_code] = self._null_rank + return ranks + + class _CTableBuildProxy: """Minimal shim that lets the ``indexing`` module build sidecars for a CTable column without touching the column's own ``schunk.vlmeta``. @@ -696,10 +756,25 @@ def create_index( # noqa: C901 f"Cannot create an index on variable-length scalar column {col_name!r}: " "indexing for vlstring/vlbytes/struct/object columns is not supported yet." ) - # Dictionary columns: index the underlying int32 codes array. + # Dictionary columns: index by alphabetical rank instead of insertion-order codes. is_dictionary = isinstance(self._schema.columns_by_name[col_name].spec, DictionarySpec) + dict_rank_meta = None if is_dictionary: - col_arr = col_arr.codes # index the int32 codes NDArray + dict_col = col_arr + n_live = self._n_rows if self._n_rows is not None else len(self._valid_rows) + dictionary = list(dict_col.dictionary) + n_entries = len(dictionary) + order = np.argsort(dictionary, kind="stable") + code_to_rank = np.empty(n_entries, dtype=np.int32) + code_to_rank[order] = np.arange(n_entries, dtype=np.int32) + null_code = dict_col.spec.null_code + null_rank = np.int32(n_entries) + # Hash for staleness detection. 
+ dict_hash = _dict_rank_hash(dictionary) + dict_rank_meta = {"null_rank": int(null_rank), "dict_hash": dict_hash, "dict_len": n_entries} + col_arr = _DictRankWrapper( + dict_col.codes, code_to_rank, null_rank, null_code, dict_col.spec.nullable, n_live + ) is_persistent = self._storage.index_anchor_path(col_name) is not None if is_persistent: @@ -718,6 +793,13 @@ def create_index( # noqa: C901 precomputed_summaries=precomputed_summaries if kind_str == "summary" else None, ) else: + # In-memory path: materialise ranks as a proper NDArray (small tables only). + if is_dictionary: + codes = np.asarray(dict_col.codes[:n_live], dtype=np.int32) + ranks_arr = code_to_rank[codes] + if dict_col.spec.nullable: + ranks_arr[codes == null_code] = null_rank + col_arr = blosc2.asarray(ranks_arr) _ix_create_index( col_arr, field=None, @@ -734,6 +816,11 @@ def create_index( # noqa: C901 ) store = _IN_MEMORY_INDEXES[id(col_arr)] descriptor = _copy_descriptor(store["indexes"]["__self__"]) + if dict_rank_meta is not None: + full = descriptor.setdefault("full", {}) + if full is None: + full = descriptor["full"] = {} + full["dict_rank"] = dict_rank_meta value_epoch, _ = self._storage.get_epoch_counters() descriptor["built_value_epoch"] = value_epoch diff --git a/src/blosc2/ctable_storage.py b/src/blosc2/ctable_storage.py index 8fe2f9e46..5b8fa9961 100644 --- a/src/blosc2/ctable_storage.py +++ b/src/blosc2/ctable_storage.py @@ -19,6 +19,7 @@ from __future__ import annotations +import contextlib import copy import json import os @@ -731,6 +732,7 @@ def rename_column(self, old: str, new: str): def close(self) -> None: self._unregister_sidecar_zip_paths() + self._evict_cached_index_handles() if self._store is not None: self._store.close() self._store = None @@ -739,11 +741,20 @@ def close(self) -> None: def discard(self) -> None: """Clean up without repacking the .b2z archive.""" self._unregister_sidecar_zip_paths() + self._evict_cached_index_handles() if self._store is not None: 
self._store.discard() self._store = None self._meta = None + def _evict_cached_index_handles(self) -> None: + """Release process-global index handle/data caches for this table's + files, so closing it does not leak a file descriptor per table.""" + from blosc2.indexing import evict_cached_index_handles + + with contextlib.suppress(Exception): + evict_cached_index_handles(self._root) + def _unregister_sidecar_zip_paths(self) -> None: if not self._registered_sidecar_paths: return @@ -966,6 +977,7 @@ def open_mode(self) -> str | None: def close(self) -> None: self._unregister_sidecar_zip_paths() + self._evict_cached_index_handles() if self._owns_store and self._store is not None: self._store.close() self._store = None @@ -973,11 +985,21 @@ def close(self) -> None: def discard(self) -> None: self._unregister_sidecar_zip_paths() + self._evict_cached_index_handles() if self._owns_store and self._store is not None: self._store.discard() self._store = None self._meta = None + def _evict_cached_index_handles(self) -> None: + """Release process-global index handle/data caches for this table's + subtree, so closing it does not leak a file descriptor per table.""" + from blosc2.indexing import evict_cached_index_handles + + with contextlib.suppress(Exception): + root = os.path.join(self._store.working_dir, self._root_key.lstrip("/")) + evict_cached_index_handles(root) + def _unregister_sidecar_zip_paths(self) -> None: if not self._registered_sidecar_paths: return diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 9d588c728..9f0d45a3b 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -199,6 +199,45 @@ def _purge_stale_persistent_caches() -> None: _hot_cache_clear(scope=scope) +def evict_cached_index_handles(root: str | None) -> None: + """Drop cached sidecar handles/data for the persistent store at *root*. 
+ + Index reads cache file-backed handles in process-global dicts for query + reuse; they are normally only purged once their files are deleted, so a + table closed but left on disk keeps its descriptors open — one per table, + which exhausts the file-descriptor limit over a large session. Closing a + table calls this to pop (and thereby release) the handles it owns; the + caches simply repopulate on the next query. + """ + if not root: + return + try: + resolved = str(Path(root).resolve()) + except Exception: + return + prefix = resolved + os.sep + + def _owned_scope(scope) -> bool: + # scope is an _array_key: ("persistent", path) or ("memory", id). + return ( + isinstance(scope, tuple) + and len(scope) == 2 + and scope[0] == "persistent" + and isinstance(scope[1], str) + and (scope[1] == resolved or scope[1].startswith(prefix)) + ) + + def _owned_path(path) -> bool: + return isinstance(path, str) and (path == resolved or path.startswith(prefix)) + + for cache in (_SIDECAR_HANDLE_CACHE, _DATA_CACHE, _HOT_CACHE): + for key in [k for k in tuple(cache) if _owned_scope(k[0])]: + cache.pop(key, None) + for handles in (_QUERY_CACHE_STORE_HANDLES, _GATHER_MMAP_HANDLES): + for path in [p for p in tuple(handles) if _owned_path(p)]: + handles.pop(path, None) + + def _open_sidecar_file(path: str, mmap_mode=None) -> blosc2.NDArray: """Open an index sidecar file, using zip-offset access when registered.""" reg = _SIDECAR_ZIP_REGISTRY.get(path) @@ -758,7 +797,7 @@ def store_cached_coords( def _supported_index_dtype(dtype: np.dtype) -> bool: - return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M"} + return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M", "S", "U"} def _field_target_descriptor(field: str | None) -> dict: @@ -1064,6 +1103,16 @@ def _segment_summary(segment: np.ndarray, dtype: np.dtype): zero = np.zeros((), dtype=dtype)[()] return zero, zero, flags segment = segment[valid] + if dtype.kind in "US": + # String dtypes: ufunc 'minimum'/'maximum' lack a 
loop. + mn = segment[0] + mx = segment[0] + for v in segment[1:]: + if v < mn: + mn = v + if v > mx: + mx = v + return mn, mx, flags return segment.min(), segment.max(), flags @@ -1106,8 +1155,25 @@ def _fill_summaries_from_2d( mins = np.where(all_nan, zero, mins).astype(dtype) maxs = np.where(all_nan, zero, maxs).astype(dtype) else: - mins = data_2d.min(axis=1) - maxs = data_2d.max(axis=1) + if dtype.kind in "US": + # String dtypes: numpy ufunc 'minimum'/'maximum' lack a loop for mx: + mx = v + mins[i] = mn + maxs[i] = mx + else: + mins = data_2d.min(axis=1) + maxs = data_2d.max(axis=1) flags = np.zeros(n, dtype=np.uint8) summaries_arr["min"][offset : offset + n] = mins summaries_arr["max"][offset : offset + n] = maxs @@ -3081,13 +3147,24 @@ def _merge_run_pair( ) right_cut = right_values.size - merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( - left_values[:left_cut], - left_positions[:left_cut], - right_values[:right_cut], - right_positions[:right_cut], - np.int64, - ) + try: + merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( + left_values[:left_cut], + left_positions[:left_cut], + right_values[:right_cut], + right_positions[:right_cut], + np.int64, + ) + except (TypeError, AttributeError): + # ponytail: fallback for non-numeric dtypes (strings, etc.) that the + # Cython merge doesn't support. _merge_sorted_slices uses np.lexsort. 
+ merged_values, merged_positions = _merge_sorted_slices( + left_values[:left_cut], + left_positions[:left_cut], + right_values[:right_cut], + right_positions[:right_cut], + dtype, + ) take = merged_values.size try: _write_ndarray_linear_span(out_values, out_cursor, merged_values) diff --git a/tests/b2view/test_basics.py b/tests/b2view/test_basics.py index 286298edd..4e8a169e5 100644 --- a/tests/b2view/test_basics.py +++ b/tests/b2view/test_basics.py @@ -749,11 +749,11 @@ async def test_plot_column(store_path): n = screen.n assert (screen.row_start, screen.row_stop) == (0, n) - # '+' zooms in about the centre: the window halves and re-centres. + # '+' zooms in about the left edge: the window halves, start unchanged. await pilot.press("plus") await pilot.pause() assert screen.row_stop - screen.row_start == n // 2 - assert screen.row_start > 0 + assert screen.row_start == 0 assert "rows" in screen.plot_title # '-' zooms back out to the whole series. @@ -819,11 +819,11 @@ async def test_plot_column(store_path): assert app._data_layout.row_window is None assert app.table_page["nrows"] == LEAF1_LEN - # 'p' (like escape) closes the plot again + # 'p' re-opens the plot; 'escape' is the only way to close it await pilot.press("p") await pilot.pause() assert isinstance(app.screen, PlotScreen) - await pilot.press("p") + await pilot.press("escape") await pilot.pause() assert not isinstance(app.screen, PlotScreen) @@ -939,8 +939,8 @@ async def test_plot_hires_view(store_path): assert hires._mode == "envelope" assert "min/max envelope" in hires._current_title() - # 'q' returns to the braille plot with the zoom intact. - await pilot.press("q") + # 'escape' returns to the braille plot with the zoom intact. 
+ await pilot.press("escape") await pilot.pause() assert app.screen is plot assert (plot.row_start, plot.row_stop) == (0, plot.n) @@ -1002,7 +1002,7 @@ async def test_plot_scatter_col_vs_col(store_path): np.testing.assert_allclose(scatter.y, np.arange(100, 140) * 1.5) # 'h' opens a high-res matplotlib scatter over the braille scatter, when - # textual-image + matplotlib are available; 'h' again returns to it. + # textual-image + matplotlib are available; 'escape' returns to it. if importlib.util.find_spec("textual_image") and importlib.util.find_spec("matplotlib"): from blosc2.b2view.app import HiResPlotScreen, TextualImage @@ -1010,12 +1010,12 @@ async def test_plot_scatter_col_vs_col(store_path): await pilot.pause() assert isinstance(app.screen, HiResPlotScreen) assert app.screen.query_one("#hires-image", TextualImage) is not None - await pilot.press("h") + await pilot.press("escape") await pilot.pause() assert app.screen is scatter - # 'q' returns to the braille plot with the zoom intact. - await pilot.press("q") + # 'escape' returns to the braille plot with the zoom intact. + await pilot.press("escape") await pilot.pause() assert app.screen is plot assert (plot.row_start, plot.row_stop) == (100, 140) diff --git a/tests/b2view/test_sort.py b/tests/b2view/test_sort.py new file mode 100644 index 000000000..fb2464a78 --- /dev/null +++ b/tests/b2view/test_sort.py @@ -0,0 +1,333 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""b2view sort-by support: the StoreBrowser model layer and the 'S' key flow. + +Model tests drive ``StoreBrowser`` directly; the TUI tests drive the real +Textual app through a headless ``Pilot``, pressing the same keys a user would. 
+""" + +from __future__ import annotations + +import dataclasses + +import numpy as np +import pytest + +pytest.importorskip("textual") +pytest.importorskip("pytest_asyncio") + +import blosc2 + +if blosc2.IS_WASM: + pytest.skip("Textual apps need a terminal driver (termios)", allow_module_level=True) + +from blosc2.b2view.app import B2ViewApp, SortByScreen +from blosc2.b2view.model import StoreBrowser + +N = 200 +TERM_SIZE = (120, 40) + + +@pytest.fixture(scope="module") +def sort_store(tmp_path_factory): + """Standalone CTable with FULL indexes on a numeric and a dictionary column.""" + path = str(tmp_path_factory.mktemp("sort") / "sort.b2z") + + @dataclasses.dataclass + class Row: + b: int = blosc2.field(blosc2.int64()) + label: str = blosc2.field(blosc2.dictionary()) + + rng = np.random.default_rng(0) + bvals = rng.integers(0, 1000, N).astype(np.int64) + pool = ["delta", "alpha", "charlie", "bravo"] + labels = [pool[i] for i in rng.integers(0, len(pool), N)] + + t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N) + t.extend(list(zip(bvals.tolist(), labels, strict=True))) + t.create_index("b", kind=blosc2.IndexKind.FULL) + t.create_index("label", kind=blosc2.IndexKind.FULL) + t.close() + return path, bvals, labels + + +# ── Model layer (StoreBrowser) ──────────────────────────────────────────── + + +def _head(browser, column, k): + return [browser.read_cell("/", column, i) for i in range(k)] + + +def test_full_index_columns(sort_store): + path, _, _ = sort_store + with StoreBrowser(path) as browser: + assert set(browser.full_index_columns("/")) == {"b", "label"} + + +def test_sort_numeric_ascending_and_reverse(sort_store): + path, bvals, _ = sort_store + expected = sorted(bvals.tolist()) + with StoreBrowser(path) as browser: + browser.set_sort("/", "b", reverse=False) + assert browser.get_sort("/") == ("b", False) + assert _head(browser, "b", 5) == expected[:5] + + browser.set_sort("/", "b", reverse=True) + assert _head(browser, "b", 5) == 
expected[::-1][:5] + + +@pytest.mark.parametrize("kind", ["SUMMARY", "FULL", "PARTIAL", "BUCKET", "OPSI"]) +def test_indexed_column_plots_from_summary(tmp_path, kind): + """Any index kind exposes block-level (min, max) summaries, so a numeric + indexed column plots via method 'summary' — no data decompression.""" + + @dataclasses.dataclass + class Row: + v: float = blosc2.field(blosc2.float64()) + + rng = np.random.default_rng(0) + n = 20000 + vals = rng.standard_normal(n) + path = str(tmp_path / f"{kind}.b2z") + t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=n) + t.extend([(float(x),) for x in vals]) + t.create_index("v", kind=getattr(blosc2.IndexKind, kind)) + t.close() + + with StoreBrowser(path) as browser: + env = browser.plot_series("/", column="v", max_points=64) + assert env["method"] == "summary" + # The block-summary envelope bounds the true data range exactly. + assert np.nanmin(env["ymin"]) == pytest.approx(vals.min()) + assert np.nanmax(env["ymax"]) == pytest.approx(vals.max()) + + +def test_sort_dictionary_by_decoded_string(sort_store): + path, _, labels = sort_store + with StoreBrowser(path) as browser: + browser.set_sort("/", "label", reverse=False) + assert _head(browser, "label", 5) == sorted(labels)[:5] + + +def test_clear_sort_restores_original_order(sort_store): + path, bvals, _ = sort_store + with StoreBrowser(path) as browser: + browser.set_sort("/", "b", reverse=False) + browser.clear_sort("/") + assert browser.get_sort("/") is None + assert _head(browser, "b", 3) == bvals[:3].tolist() # original row order + + +def test_window_composes_over_sort(sort_store): + path, bvals, _ = sort_store + expected = sorted(bvals.tolist()) + with StoreBrowser(path) as browser: + browser.set_sort("/", "b", reverse=False) + assert browser.set_row_window("/", 0, 5) == 5 # locked to first 5 sorted rows + assert _head(browser, "b", 5) == expected[:5] + + +def test_filter_clears_sort(sort_store): + path, _, _ = sort_store + with StoreBrowser(path) as 
browser: + browser.set_sort("/", "b", reverse=False) + browser.set_filter("/", "b > 500") + assert browser.get_sort("/") is None # mutually exclusive + + +# ── End-to-end TUI flow (Pilot) ─────────────────────────────────────────── + + +async def _wait_for_table(pilot) -> None: + for _ in range(100): + await pilot.pause() + if pilot.app.table_page is not None and not pilot.app.loading_table_page: + return + raise AssertionError("data table never loaded") + + +@pytest.mark.asyncio +@pytest.mark.tui +async def test_sort_key_opens_screen_applies_and_escape_clears(sort_store): + path, _, _ = sort_store + app = B2ViewApp(path, start_panel="data") + async with app.run_test(size=TERM_SIZE) as pilot: + await _wait_for_table(pilot) + app.query_one("#data-table").focus() + await pilot.pause() + + # 'S' opens the sort dropdown listing the FULL-indexed columns. + await pilot.press("S") + await pilot.pause() + assert isinstance(app.screen, SortByScreen) + + # Enter applies the highlighted column ascending; grid reorders. + await pilot.press("enter") + await pilot.pause() + assert app.browser.get_sort("/") in {("b", False), ("label", False)} + col = app.browser.get_sort("/")[0] + assert app.table_page["data"][col][0] == min(app.table_page["data"][col]) + # Cursor parks on the sorted column, first row. + table = app.query_one("#data-table") + cur_row, cur_col = table.cursor_coordinate + assert cur_row == 0 + assert app.table_page["columns"][cur_col] == col + + # Escape clears the sort, restoring original order. 
+ await pilot.press("escape") + await pilot.pause() + assert app.browser.get_sort("/") is None + + +@pytest.fixture(scope="module") +def wide_store(tmp_path_factory): + """A CTable wider than the viewport, FULL index only on the LAST column.""" + path = str(tmp_path_factory.mktemp("wide") / "wide.b2z") + ncols = 25 + cols = [f"c{i:02d}" for i in range(ncols)] + Row = dataclasses.make_dataclass( + "WideRow", + [(name, int, blosc2.field(blosc2.int64())) for name in cols], + ) + rng = np.random.default_rng(1) + data = rng.integers(0, 100000, size=(N, ncols)).astype(np.int64) + t = blosc2.CTable(Row, urlpath=path, mode="w", expected_size=N) + t.extend([tuple(int(v) for v in row) for row in data]) + t.create_index(cols[-1], kind=blosc2.IndexKind.FULL) # only the last column + t.close() + return path, cols + + +@pytest.mark.asyncio +@pytest.mark.tui +async def test_sort_last_column_keeps_full_window(wide_store): + path, cols = wide_store + last = cols[-1] + app = B2ViewApp(path, start_panel="data") + async with app.run_test(size=TERM_SIZE) as pilot: + await _wait_for_table(pilot) + app.query_one("#data-table").focus() + await pilot.pause() + + await pilot.press("S") + await pilot.pause() + await pilot.press("enter") # only indexed column is the last one + await pilot.pause() + assert app.browser.get_sort("/") == (last, False) + + page = app.table_page + # The tail window holds several columns ending at the last one — not a + # lone column — and they keep their natural left-to-right order. + assert page["col_stop"] == page["ncols"] + assert len(page["columns"]) > 1 + assert page["columns"] == cols[page["col_start"] : page["col_stop"]] + + # Cursor sits on the sorted (last) column, first row. 
+ table = app.query_one("#data-table") + cur_row, cur_col = table.cursor_coordinate + assert cur_row == 0 + assert page["columns"][cur_col] == last + + # 'R' reverses in place: the horizontal window (same columns, same start) + # stays put, the cursor stays on the sorted column, and the order flips. + cols_before = list(page["columns"]) + col_start_before = page["col_start"] + await pilot.press("R") + await pilot.pause() + page = app.table_page + assert app.browser.get_sort("/") == (last, True) + assert page["col_start"] == col_start_before # window did not re-scroll + assert list(page["columns"]) == cols_before # same columns, none dropped + cur_row, cur_col = app.query_one("#data-table").cursor_coordinate + assert page["columns"][cur_col] == last + assert page["data"][last][0] == max(page["data"][last]) + + +@pytest.mark.asyncio +@pytest.mark.tui +async def test_columns_stable_across_vertical_scroll(wide_store): + """Scrolling down (across buffer reloads) keeps the same visible columns — + the width re-fit is sticky, so a wider lower row block can't drop a column.""" + path, _ = wide_store + app = B2ViewApp(path, start_panel="data") + async with app.run_test(size=TERM_SIZE) as pilot: + await _wait_for_table(pilot) + app.query_one("#data-table").focus() + await pilot.pause() + + key_top = app._col_fit_key() + cols_top = list(app.table_page["columns"]) + + await pilot.press("b") # jump to the last row → buffer reloads far down + await pilot.pause() + assert app._col_fit_key() == key_top # vertical move does not change the fit key + assert list(app.table_page["columns"]) == cols_top # columns unchanged + + +def test_take_n_columns_pins_count(): + """_take_n_columns keeps exactly the first n columns (clamped to range).""" + cols = [f"c{i}" for i in range(5)] + data = { + "source_kind": "ctable", + "nrows": 3, + "ncols": 5, + "col_start": 0, + "hidden_columns": 0, + "columns": cols, + "data": {name: ["x"] * 3 for name in cols}, + } + app = B2ViewApp.__new__(B2ViewApp) 
# no event loop needed for this pure helper + three = app._take_n_columns({**data, "data": dict(data["data"])}, 3) + assert three["columns"] == cols[:3] + assert three["col_stop"] == 3 + assert three["hidden_columns"] == 2 + allcols = app._take_n_columns({**data, "data": dict(data["data"])}, 99) + assert allcols["columns"] == cols # clamped to available + + +@pytest.mark.asyncio +@pytest.mark.tui +async def test_reverse_key_flips_active_sort(sort_store): + path, _, _ = sort_store + app = B2ViewApp(path, start_panel="data") + async with app.run_test(size=TERM_SIZE) as pilot: + await _wait_for_table(pilot) + app.query_one("#data-table").focus() + await pilot.pause() + + await pilot.press("S") + await pilot.pause() + await pilot.press("enter") # ascending + await pilot.pause() + col, reverse = app.browser.get_sort("/") + assert reverse is False + + await pilot.press("R") # flip to descending in place + await pilot.pause() + assert app.browser.get_sort("/") == (col, True) + assert app.table_page["data"][col][0] == max(app.table_page["data"][col]) + + +@pytest.mark.asyncio +@pytest.mark.tui +async def test_sort_reverse_toggle_in_dropdown(sort_store): + path, _, _ = sort_store + app = B2ViewApp(path, start_panel="data") + async with app.run_test(size=TERM_SIZE) as pilot: + await _wait_for_table(pilot) + app.query_one("#data-table").focus() + await pilot.pause() + + await pilot.press("S") + await pilot.pause() + await pilot.press("R") # toggle reverse (descending) before applying + await pilot.press("enter") + await pilot.pause() + col, reverse = app.browser.get_sort("/") + assert reverse is True + assert app.table_page["data"][col][0] == max(app.table_page["data"][col]) diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py index 26e558116..33d922cc0 100644 --- a/tests/ctable/test_column.py +++ b/tests/ctable/test_column.py @@ -511,6 +511,109 @@ class CRow: t["val"].max() +# ------------------------------------------------------------------- +# min / max 
accelerated from index block summaries +# +# Any index kind persists per-block (min, max, flags); reducing those is exact +# and decompression-free when nulls are already excluded by the summary +# (non-nullable, or a NaN-sentinel float). Other null encodings fall back to +# the full scan but must still return the correct live, non-null result. +# ------------------------------------------------------------------- + +MINMAX_N = 50000 +INT_MIN = np.iinfo(np.int64).min + + +@dataclass +class MinMaxRow: + i: int = blosc2.field(blosc2.int64()) # non-nullable int → fast path + f: float = blosc2.field(blosc2.float64(null_value=float("nan"))) # NaN float → fast path + k: int = blosc2.field(blosc2.int64(null_value=INT_MIN)) # INT64_MIN sentinel → fallback + s: str = blosc2.field(blosc2.string(max_length=8)) # non-nullable string → fast path + + +@pytest.fixture(scope="module") +def indexed_minmax(tmp_path_factory): + rng = np.random.default_rng(0) + ivals = rng.integers(-1000, 1000, MINMAX_N).astype(np.int64) + fvals = rng.standard_normal(MINMAX_N) + fvals[rng.choice(MINMAX_N, 500, replace=False)] = np.nan + kvals = rng.integers(0, 1000, MINMAX_N).astype(np.int64) + kvals[rng.choice(MINMAX_N, 500, replace=False)] = INT_MIN + svals = np.array([f"s{x:05d}" for x in rng.integers(0, 99999, MINMAX_N)]) + + path = str(tmp_path_factory.mktemp("mm") / "t.b2z") + t = blosc2.CTable(MinMaxRow, urlpath=path, mode="w", expected_size=MINMAX_N) + t.extend(list(zip(ivals.tolist(), fvals.tolist(), kvals.tolist(), svals.tolist(), strict=True))) + for c in ("i", "f", "k", "s"): + t.create_index(c, kind=blosc2.IndexKind.FULL) + t.close() + + fv = fvals[~np.isnan(fvals)] + kv = kvals[kvals != INT_MIN] + refs = { + "i": (ivals.min(), ivals.max()), + "f": (fv.min(), fv.max()), + "k": (kv.min(), kv.max()), + "s": (min(svals), max(svals)), + } + return path, refs + + +@pytest.mark.parametrize(("col", "fast"), [("i", True), ("f", True), ("s", True), ("k", False)]) +def 
test_minmax_matches_reference(indexed_minmax, col, fast): + """min()/max() equal the live non-null reference, whether or not the + summary fast path is used.""" + path, refs = indexed_minmax + t = blosc2.open(path, mode="r") + try: + assert (t[col]._index_summary_minmax("min") is not NotImplemented) is fast + exp_min, exp_max = refs[col] + got_min, got_max = t[col].min(), t[col].max() + if isinstance(exp_min, float): + assert np.isclose(got_min, exp_min) + assert np.isclose(got_max, exp_max) + else: + assert got_min == exp_min + assert got_max == exp_max + finally: + t.close() + + +def test_minmax_no_index_falls_back(): + """With no index summary, min()/max() fall back and stay correct. + + (Persistent numeric columns auto-build a SUMMARY index; an in-memory table + has none, so it exercises the no-summary fallback branch.)""" + t = blosc2.CTable(MinMaxRow, expected_size=10) # in-memory: no auto index + t.extend([(3, 1.5, 7, "b"), (1, 2.5, 8, "a"), (2, 0.5, 9, "c")]) + assert t["i"]._index_summary_minmax("min") is NotImplemented + assert t["i"].min() == 1 + assert t["i"].max() == 3 + + +def test_minmax_stale_index_falls_back(tmp_path): + """An append marks the index stale → fall back, still correct; rebuild + restores the fast path.""" + path = str(tmp_path / "stale.b2d") + t = blosc2.CTable(MinMaxRow, urlpath=path, mode="w", expected_size=110) + t.extend([(x, float(x), x, f"s{x:05d}") for x in range(100)]) + t.create_index("i", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.open(path, mode="a") + try: + assert t["i"]._index_summary_minmax("min") is not NotImplemented # fresh + t.append({"i": -5, "f": 0.0, "k": 0, "s": "z"}) + assert t["i"]._index_summary_minmax("min") is NotImplemented # stale → fallback + assert t["i"].min() == -5 # still correct via full scan + t.rebuild_index("i") + assert t["i"]._index_summary_minmax("min") is not NotImplemented # restored + assert t["i"].min() == -5 + finally: + t.close() + + # 
------------------------------------------------------------------- # Aggregates: argmin / argmax # ------------------------------------------------------------------- diff --git a/tests/ctable/test_sort_by.py b/tests/ctable/test_sort_by.py index b1423279f..080a90515 100644 --- a/tests/ctable/test_sort_by.py +++ b/tests/ctable/test_sort_by.py @@ -414,5 +414,137 @@ def test_sort_unprojected_view_opens_only_needed_columns(tmp_path): t.close() +def test_sort_view_zero_copy_slice(tmp_path): + """sort_by(view=True) returns a zero-copy view whose slices keep sorted order.""" + rng = np.random.default_rng(0) + n = 1000 + score = rng.integers(0, 50, n).astype(np.float64) # duplicates on purpose + ids = np.arange(n) + data = list(zip(ids.tolist(), score.tolist(), [True] * n, strict=True)) + + urlpath = str(tmp_path / "sort-view.b2z") + t = CTable(Row, new_data=data, urlpath=urlpath, mode="w") + t.create_index("id", kind=blosc2.IndexKind.FULL) # id has a FULL index + + sv = t.sort_by("score", view=True) + assert sv.base is not None # a view, not a materialised copy + + order = np.argsort(score, kind="stable") + for sl in [slice(0, 10), slice(-10, None), slice(None, None, 2), slice(100, 50, -1), slice(5, 25, 3)]: + np.testing.assert_array_equal(np.asarray(sv[sl]["score"][:]), score[order][sl]) + + # Descending, and a FULL-index-backed single-column sort, both stay ordered. 
+ svd = t.sort_by("score", ascending=False, view=True) + np.testing.assert_array_equal(np.asarray(svd[:10]["score"][:]), score[order[::-1]][:10]) + svf = t.sort_by("id", view=True) + np.testing.assert_array_equal(np.asarray(svf[:10]["id"][:]), np.arange(10)) + + +@pytest.mark.parametrize("ascending", [True, False]) +def test_sort_view_full_index_nullable_persistent(tmp_path, ascending): + """A FULL index on a nullable column accelerates sort_by(view=True) on a .b2z, + and the result keeps nulls last (matching the materialised copy path).""" + + @dataclass + class NullRow: + key: int = blosc2.field(blosc2.int64(ge=0)) + val: float = blosc2.field(blosc2.float64(null_value=float("nan")), default=float("nan")) + + rng = np.random.default_rng(1) + n = 2000 + val = rng.integers(0, 100, n).astype(np.float64) + val[rng.choice(n, 50, replace=False)] = np.nan # scattered nulls + data = list(zip(range(n), val.tolist(), strict=True)) + + urlpath = str(tmp_path / "nullable.b2z") + t = CTable(NullRow, new_data=data, urlpath=urlpath, mode="w") + t.create_index("val", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.CTable.open(urlpath, mode="r") + try: + # Reference: copy path (its nulls-last behaviour is the contract). + ref = np.asarray(t.sort_by("val", ascending=ascending)["val"][:]) + got = np.asarray(t.sort_by("val", ascending=ascending, view=True)["val"][:]) + np.testing.assert_array_equal(got, ref) # NaNs compare equal here via positions + # Nulls must be last regardless of direction. 
+ assert np.isnan(got[-50:]).all() + assert not np.isnan(got[:-50]).any() + finally: + t.close() + + +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("sentinel", ["nan_back", "intmin_front", "mid_middle", "no_nulls"]) +def test_sorted_slice_window_matches_full_view(tmp_path, ascending, sentinel): + """sorted_slice reads only the index window yet matches the full sorted view, + for a null block at the back (NaN), front (INT64_MIN), middle (-999), or absent.""" + intmin = np.iinfo(np.int64).min + rng = np.random.default_rng(2) + n = 5000 + + if sentinel == "intmin_front" or sentinel == "no_nulls": + + @dataclass + class R: + key: int = blosc2.field(blosc2.int64(ge=0)) + val: int = blosc2.field(blosc2.int64(null_value=intmin), default=intmin) + + val = rng.integers(0, 1000, n).astype(np.int64) + if sentinel == "intmin_front": + val[rng.choice(n, 50, replace=False)] = intmin + else: + null_value = float("nan") if sentinel == "nan_back" else -999.0 + + @dataclass + class R: + key: int = blosc2.field(blosc2.int64(ge=0)) + val: float = blosc2.field(blosc2.float64(null_value=null_value), default=null_value) + + lo = 0 if sentinel == "nan_back" else -500 # -999 lands in the middle of [-500, 500) + val = rng.integers(lo, 1000 if sentinel == "nan_back" else 500, n).astype(np.float64) + val[rng.choice(n, 50, replace=False)] = null_value + + data = list(zip(range(n), val.tolist(), strict=True)) + urlpath = str(tmp_path / "ss.b2z") + # expected_size=n leaves no capacity padding in the index sidecar, so the + # partial-read window path engages (rather than falling back). 
+ t = CTable(R, new_data=data, urlpath=urlpath, mode="w", expected_size=n) + t.create_index("val", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.CTable.open(urlpath, mode="r") + try: + full = t.sort_by("val", ascending=ascending, view=True) + for sl in [slice(0, 10), slice(n - 10, n), slice(2400, 2600), slice(5, 80, 7), slice(-12, None)]: + assert t._sorted_slice_positions("val", ascending, sl) is not None # window path, not fallback + got = np.asarray(t.sorted_slice("val", sl, ascending=ascending)["val"][:]) + ref = np.asarray(full[sl]["val"][:]) + np.testing.assert_array_equal(got, ref) + finally: + t.close() + + +def test_sorted_slice_falls_back_for_unindexed_column(): + """Without a FULL index, sorted_slice still returns the correct sorted slice.""" + t = CTable(Row, new_data=DATA) + got = np.asarray(t.sorted_slice("score", slice(0, 3))["score"][:]) + ref = np.asarray(t.sort_by("score")[0:3]["score"][:]) + np.testing.assert_array_equal(got, ref) + + +def test_sort_view_false_returns_copy(): + """The default (view=False) still returns an independent in-memory copy.""" + t = CTable(Row, new_data=DATA) + cp = t.sort_by("score") + assert cp.base is None + + +def test_sort_view_inplace_mutually_exclusive(): + t = CTable(Row, new_data=DATA) + with pytest.raises(ValueError, match="mutually exclusive"): + t.sort_by("score", inplace=True, view=True) + + if __name__ == "__main__": pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_sort_by_strings.py b/tests/ctable/test_sort_by_strings.py new file mode 100644 index 000000000..df6f9ed35 --- /dev/null +++ b/tests/ctable/test_sort_by_strings.py @@ -0,0 +1,196 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""sort_by / sorted_slice on string-typed columns. 
+ +Covers the two string column kinds the FULL-index window path now supports: +- ``dictionary[str]``: indexed by alphabetical *rank* (int32), so it reuses the + numeric window machinery; the index goes stale when the dictionary changes. +- fixed ``blosc2.string``: indexed directly on the (lexicographic) values. +""" + +import os +import subprocess +import sys +import textwrap +from dataclasses import dataclass + +import numpy as np +import pytest + +import blosc2 +from blosc2 import CTable + +POOL = ["delta", "alpha", "charlie", "bravo", "echo", "foxtrot", "golf"] +SLICES = [slice(0, 10), slice(-10, None), slice(5, 80, 7), slice(-12, None)] + + +def _expected_sorted(labels, ascending, null): + """Canonical nulls-last sorted sequence of labels (ties are identical strings).""" + nonnull = sorted(x for x in labels if x != null) + if not ascending: + nonnull = nonnull[::-1] + return nonnull + [null] * labels.count(null) + + +@dataclass +class DictRow: + key: int = blosc2.field(blosc2.int64(ge=0)) + label: str = blosc2.field(blosc2.dictionary()) + + +@dataclass +class StrRow: + key: int = blosc2.field(blosc2.int64(ge=0)) + s: str = blosc2.field(blosc2.string(max_length=8, null_value="")) + + +@pytest.mark.parametrize("ascending", [True, False]) +def test_dict_rank_sort_and_window(tmp_path, ascending): + """dictionary[str]: sort_by orders by decoded string (nulls last) and + sorted_slice matches the full sorted view via the window path (not fallback).""" + rng = np.random.default_rng(0) + n = 400 + labels = [POOL[i] for i in rng.integers(0, len(POOL), n)] + for i in rng.choice(n, 40, replace=False): + labels[i] = None # dictionary null + data = [(i, labels[i]) for i in range(n)] + + urlpath = str(tmp_path / "dict.b2z") + # expected_size=n trims capacity padding so the window read engages. 
+ t = CTable(DictRow, new_data=data, urlpath=urlpath, mode="w", expected_size=n) + t.create_index("label", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.CTable.open(urlpath, mode="r") + try: + exp = _expected_sorted(labels, ascending, None) + full = list(t.sort_by("label", ascending=ascending, view=True)["label"][:]) + assert full == exp + for sl in SLICES: + assert t._sorted_slice_positions("label", ascending, sl) is not None # window, not fallback + got = list(t.sorted_slice("label", sl, ascending=ascending)["label"][:]) + assert got == exp[sl] + finally: + t.close() + + +@pytest.mark.parametrize("ascending", [True, False]) +def test_fixed_string_sort_and_window(tmp_path, ascending): + """fixed string with null_value="": FULL index builds, sort_by keeps nulls last, + sorted_slice matches the full sorted view via the window path.""" + rng = np.random.default_rng(1) + n = 400 + labels = [POOL[i] for i in rng.integers(0, len(POOL), n)] + for i in rng.choice(n, 40, replace=False): + labels[i] = "" # null sentinel + data = [(i, labels[i]) for i in range(n)] + + urlpath = str(tmp_path / "str.b2z") + t = CTable(StrRow, new_data=data, urlpath=urlpath, mode="w", expected_size=n) + t.create_index("s", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.CTable.open(urlpath, mode="r") + try: + exp = _expected_sorted(labels, ascending, "") + full = [str(x) for x in t.sort_by("s", ascending=ascending, view=True)["s"][:]] + assert full == exp + for sl in SLICES: + assert t._sorted_slice_positions("s", ascending, sl) is not None # window, not fallback + got = [str(x) for x in t.sorted_slice("s", sl, ascending=ascending)["s"][:]] + assert got == exp[sl] + finally: + t.close() + + +def test_dict_rank_index_stale_on_rank_shift(tmp_path): + """Appending a value that shifts alphabetical ranks invalidates the rank index: + sorted_slice falls back (correct result), and rebuild_index restores the window.""" + rng = np.random.default_rng(3) + n = 300 + pool = ["delta", 
"charlie", "echo", "foxtrot"] # no "alpha" yet + labels = [pool[i] for i in rng.integers(0, len(pool), n)] + data = [(i, labels[i]) for i in range(n)] + + urlpath = str(tmp_path / "stale.b2d") + t = CTable(DictRow, new_data=data, urlpath=urlpath, mode="w", expected_size=n + 10) + t.create_index("label", kind=blosc2.IndexKind.FULL) + t.close() + + t = blosc2.open(urlpath, mode="a") + try: + assert t._sorted_slice_positions("label", True, slice(0, 5)) is not None # window engaged + + # "alpha" becomes the new smallest → every stored rank is now off by one. + t.append({"key": n, "label": "alpha"}) + labels2 = labels + ["alpha"] + + assert t._sorted_slice_positions("label", True, slice(0, 5)) is None # stale → fallback + full = list(t.sort_by("label", ascending=True, view=True)["label"][:]) + assert full == sorted(labels2) # still correct via lexsort + + t.rebuild_index("label") + finally: + t.close() + + t = blosc2.open(urlpath, mode="r") + try: + assert t._sorted_slice_positions("label", True, slice(0, 5)) is not None # window restored + assert list(t.sort_by("label", ascending=True, view=True)["label"][:]) == sorted(labels2) + finally: + t.close() + + +_XPROC_SCRIPT = textwrap.dedent( + """ + import sys + from dataclasses import dataclass + import blosc2 + from blosc2 import CTable + + @dataclass + class DictRow: + key: int = blosc2.field(blosc2.int64(ge=0)) + label: str = blosc2.field(blosc2.dictionary()) + + mode, urlpath = sys.argv[1], sys.argv[2] + if mode == "build": + data = [(i, ["delta", "alpha", "charlie", "bravo"][i % 4]) for i in range(200)] + t = CTable(DictRow, new_data=data, urlpath=urlpath, mode="w", expected_size=200) + t.create_index("label", kind=blosc2.IndexKind.FULL) + t.close() + print("BUILT") + else: # query + t = blosc2.open(urlpath, mode="r") + engaged = t._sorted_slice_positions("label", True, slice(0, 5)) is not None + t.close() + print("WINDOW_OK" if engaged else "WINDOW_FALLBACK") + """ +) + + +@pytest.mark.skipif(blosc2.IS_WASM, 
reason="emscripten does not support subprocesses") +def test_dict_rank_hash_stable_across_processes(tmp_path): + """The persisted dict-rank index must engage in a *fresh* process with a + different PYTHONHASHSEED — i.e. the staleness hash is not hash()-salted.""" + urlpath = str(tmp_path / "xproc.b2d") + + def run(mode, seed): + env = {**os.environ, "PYTHONHASHSEED": seed} + r = subprocess.run( + [sys.executable, "-c", _XPROC_SCRIPT, mode, urlpath], + capture_output=True, + text=True, + env=env, + check=True, + ) + return r.stdout.strip().splitlines()[-1] + + assert run("build", "0") == "BUILT" + # Different seed → hash() of the same dictionary would differ; sha1 does not. + assert run("query", "1") == "WINDOW_OK"