Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/packaging_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
matrix:
python: [ cp314 ]
platform:
- { os: windows-2025, arch: amd64, cibw_system: win }
- { os: windows-2022, arch: amd64, cibw_system: win }
- { os: windows-11-arm, arch: ARM64, cibw_system: win }
- { os: ubuntu-24.04, arch: x86_64, cibw_system: manylinux }
- { os: ubuntu-24.04-arm, arch: aarch64, cibw_system: manylinux }
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/targeted_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
required: true
type: choice
options:
- 'windows-2025'
- 'windows-2022'
- 'windows-11-arm'
- 'ubuntu-24.04'
- 'ubuntu-24.04-arm'
Expand Down
4 changes: 2 additions & 2 deletions src/duckdb_py/arrow/arrow_array_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

namespace duckdb {

void TransformDuckToArrowChunk(ArrowSchema &arrow_schema, ArrowArray &data, py::list &batches) {
void TransformDuckToArrowChunk(py::object pyarrow_schema, ArrowArray &data, py::list &batches) {
py::gil_assert();
auto pyarrow_lib_module = py::module::import("pyarrow").attr("lib");
auto batch_import_func = pyarrow_lib_module.attr("RecordBatch").attr("_import_from_c");
batches.append(batch_import_func(reinterpret_cast<uint64_t>(&data), reinterpret_cast<uint64_t>(&arrow_schema)));
batches.append(batch_import_func(reinterpret_cast<uint64_t>(&data), pyarrow_schema));
}

void VerifyArrowDatasetLoaded() {
Expand Down
22 changes: 16 additions & 6 deletions src/duckdb_py/arrow/arrow_export_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,28 @@ namespace duckdb {

namespace pyarrow {

py::object ToArrowTable(const vector<LogicalType> &types, const vector<string> &names, const py::list &batches,
ClientProperties &options) {
py::object ToPyArrowSchema(const ArrowSchema &schema) {
py::gil_scoped_acquire acquire;

auto pyarrow_lib_module = py::module::import("pyarrow").attr("lib");
auto from_batches_func = pyarrow_lib_module.attr("Table").attr("from_batches");
auto schema_import_func = pyarrow_lib_module.attr("Schema").attr("_import_from_c");
return schema_import_func(reinterpret_cast<uint64_t>(&schema));
}

py::object ToArrowTable(const py::list &batches, py::object pyarrow_schema) {
py::gil_scoped_acquire acquire;

auto pyarrow_lib_module = py::module::import("pyarrow").attr("lib");
auto from_batches_func = pyarrow_lib_module.attr("Table").attr("from_batches");

return py::cast<duckdb::pyarrow::Table>(from_batches_func(batches, pyarrow_schema));
}

py::object ToArrowTable(const vector<LogicalType> &types, const vector<string> &names, const py::list &batches,
ClientProperties &options) {
ArrowSchema schema;
ArrowConverter::ToArrowSchema(&schema, types, names, options);
auto schema_obj = schema_import_func(reinterpret_cast<uint64_t>(&schema));

return py::cast<duckdb::pyarrow::Table>(from_batches_func(batches, schema_obj));
return ToArrowTable(batches, ToPyArrowSchema(schema));
}

} // namespace pyarrow
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ enum class PyArrowObjectType {
PolarsLazyFrame
};

void TransformDuckToArrowChunk(ArrowSchema &arrow_schema, ArrowArray &data, py::list &batches);
void TransformDuckToArrowChunk(py::object pyarrow_schema, ArrowArray &data, py::list &batches);

PyArrowObjectType GetArrowType(const py::handle &obj);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@ namespace duckdb {

namespace pyarrow {

py::object ToPyArrowSchema(const ArrowSchema &schema);

py::object ToArrowTable(const vector<LogicalType> &types, const vector<string> &names, const py::list &batches,
ClientProperties &options);

py::object ToArrowTable(const py::list &batches, py::object pyarrow_schema);

} // namespace pyarrow

} // namespace duckdb
17 changes: 14 additions & 3 deletions src/duckdb_py/include/duckdb_python/pyresult.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,13 @@ struct DuckDBPyResult {

PandasDataFrame FetchDF(bool date_as_object);

duckdb::pyarrow::Table FetchArrowTable(idx_t rows_per_batch, bool to_polars);

PandasDataFrame FetchDFChunk(const idx_t vectors_per_chunk = 1, bool date_as_object = false);

py::dict FetchPyTorch();

py::dict FetchTF();

ArrowArrayStream FetchArrowArrayStream(idx_t rows_per_batch = 1000000);
duckdb::pyarrow::Table FetchArrowTable(idx_t rows_per_batch, bool to_polars);
duckdb::pyarrow::RecordBatchReader FetchRecordBatchReader(idx_t rows_per_batch = 1000000);
py::object FetchArrowCapsule(idx_t rows_per_batch = 1000000);

Expand All @@ -71,6 +69,19 @@ struct DuckDBPyResult {
unique_ptr<DataChunk> FetchNextRaw(QueryResult &result);
unique_ptr<NumpyResultConversion> InitializeNumpyConversion(bool pandas = false);

//! Re-feed an already-MATERIALIZED result (a ColumnDataCollection, e.g. from
//! rel.execute()) back through the engine on the user's own context. The eager
//! variant installs a PhysicalArrowCollector to produce an ArrowQueryResult
//! (parallel); the stream variant produces a lazy StreamQueryResult that co-owns
//! the context (so it survives `del conn`). Never call these on a StreamQueryResult:
//! a lazy result already has a live context and is converted/wrapped directly.
void PromoteMaterializedToArrow(idx_t batch_size);

template <typename T>
T RunWithArrowSchema(const std::function<T(const ArrowSchema &)> &fun, bool dedup_col_names);
duckdb::pyarrow::Table MaterializedResultToArrowTable(const ArrowSchema &arrow_schema, idx_t rows_per_batch);
ArrowArrayStream FetchArrowArrayStream(idx_t rows_per_batch);

private:
idx_t chunk_offset = 0;

Expand Down
25 changes: 8 additions & 17 deletions src/duckdb_py/pyrelation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -960,11 +960,11 @@ PandasDataFrame DuckDBPyRelation::FetchDFChunk(idx_t vectors_per_chunk, bool dat
return result->FetchDFChunk(vectors_per_chunk, date_as_object);
}

duckdb::pyarrow::Table DuckDBPyRelation::ToArrowTableInternal(idx_t batch_size, bool to_polars) {
pyarrow::Table DuckDBPyRelation::ToArrowTableInternal(idx_t batch_size, bool to_polars) {
if (!result && !rel) {
return py::none();
}
if (!result) {
if (!rel) {
return py::none();
}
auto &config = ClientConfig::GetConfig(*rel->context->GetContext());
ScopedConfigSetting scoped_setting(
config,
Expand All @@ -991,19 +991,9 @@ py::object DuckDBPyRelation::ToArrowCapsule(const py::object &requested_schema)
if (!rel) {
return py::none();
}
// The PyCapsule protocol doesn't allow custom parameters, so we use the same
// default batch size as fetch_arrow_table / fetch_record_batch.
idx_t batch_size = 1000000;
auto &config = ClientConfig::GetConfig(*rel->context->GetContext());
ScopedConfigSetting scoped_setting(
config,
[&batch_size](ClientConfig &config) {
config.get_result_collector = [&batch_size](ClientContext &context, PreparedStatementData &data) {
return PhysicalArrowCollector::Create(context, data, batch_size);
};
},
[](ClientConfig &config) { config.get_result_collector = nullptr; });
ExecuteOrThrow();
// Fresh relation: stream lazily on the user's context (capsule survives `del conn`,
// but shares the single active-stream slot - consume before reusing the connection).
ExecuteOrThrow(true);
}
AssertResultOpen();
auto res = result->FetchArrowCapsule();
Expand Down Expand Up @@ -1049,6 +1039,7 @@ duckdb::pyarrow::RecordBatchReader DuckDBPyRelation::ToRecordBatch(idx_t batch_s
if (!rel) {
return py::none();
}
// Fresh relation: stream lazily on the user's own context (survives `del conn`).
ExecuteOrThrow(true);
}
AssertResultOpen();
Expand Down
Loading
Loading