Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,12 @@ ArgOptions SDGenerationParams::get_options() {
"process vae in tiles to reduce memory usage",
true,
&vae_tiling_params.enabled},
{"",
"--no-vae-tiling-fallback",
"disable the automatic fallback to VAE tiling when an untiled decode would exceed the "
"backend's max buffer size (fail instead of tiling)",
false,
&vae_tiling_params.auto_tile},
{"",
"--temporal-tiling",
"enable temporal tiling for LTX video VAE decode",
Expand Down Expand Up @@ -1808,6 +1814,9 @@ bool SDGenerationParams::from_json_str(
if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
vae_tiling_params.enabled = tiling_json["enabled"];
}
if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) {
vae_tiling_params.auto_tile = tiling_json["auto_tile"];
}
if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
}
Expand Down Expand Up @@ -2621,10 +2630,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
}

if (gen_params.vae_tiling_params.enabled ||
!gen_params.vae_tiling_params.auto_tile ||
gen_params.vae_tiling_params.temporal_tiling ||
!gen_params.extra_tiling_args.empty()) {
root["vae_tiling"] = {
{"enabled", gen_params.vae_tiling_params.enabled},
{"auto_tile", gen_params.vae_tiling_params.auto_tile},
{"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling},
{"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
{"tile_size_y", gen_params.vae_tiling_params.tile_size_y},
Expand Down
2 changes: 1 addition & 1 deletion examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ struct SDGenerationParams {
int video_frames = 1;
int fps = 16;
float vace_strength = 1.f;
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO)
std::string extra_tiling_args;

std::string pm_id_images_dir;
Expand Down
7 changes: 6 additions & 1 deletion include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,19 @@ enum lora_apply_mode_t {
};

// VAE tiling configuration.
//
// `enabled` and `auto_tile` together form a tristate:
//   enabled == true                      -> ON   (always tile)
//   enabled == false, auto_tile == true  -> AUTO (tile only when an untiled VAE compute
//                                           buffer can't be allocated, e.g. it exceeds the
//                                           backend's max buffer size on an iGPU)
//   enabled == false, auto_tile == false -> OFF  (never tile; fail if the untiled buffer
//                                           doesn't fit)
// The default is AUTO.
//
// Callers initialise this struct positionally, e.g.
//   {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}
// so the member order below must not change; new members go at the end.
typedef struct {
    bool enabled;                   // true => always tile (ON)
    bool temporal_tiling;           // temporal tiling for video VAE decode
    int tile_size_x;                // 0 in the default initialiser
    int tile_size_y;                // 0 in the default initialiser
    float target_overlap;           // 0.5f in the default initialiser
    float rel_size_x;               // 0.0f in the default initialiser
    float rel_size_y;               // 0.0f in the default initialiser
    const char* extra_tiling_args;  // may be nullptr; NOTE(review): format not visible here, confirm against the parser
    // Appended (rather than folded into an enum with `enabled`) to keep the struct ABI
    // backward-compatible. See the tristate table above.
    bool auto_tile;
} sd_tiling_params_t;

typedef struct {
Expand Down
43 changes: 42 additions & 1 deletion src/core/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1710,6 +1710,12 @@ struct GGMLRunner {
bool stream_layers_enabled = false;
size_t observed_max_effective_budget_ = 0;

// When set, alloc_compute_buffer first measures the graph's planned compute
// buffer size (no allocation) and bails before allocating if it exceeds the
// backend's max single-buffer size. Used by VAE AUTO tiling to fall back to
// tiling proactively instead of attempting (and failing) a too-large decode.
bool probe_compute_buffer_fits_ = false;

sd::layer_registry::LayerRegistry layer_registry_;

std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
Expand Down Expand Up @@ -1898,7 +1904,34 @@ struct GGMLRunner {
if (compute_allocr != nullptr) {
return true;
}
compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend);

if (probe_compute_buffer_fits_) {
// Measure the planned compute buffer WITHOUT allocating (no_alloc
// planning) and bail before the real reserve if it exceeds the
// backend's max single-buffer size. This lets the caller (VAE AUTO
// tiling) fall back to tiling without the backend ever emitting its
// raw "allocation failed" error on the successful auto path. A
// genuine runtime OOM (planned size <= max, but the device is full)
// is NOT caught here -- it still surfaces from the real reserve
// below, so the reactive fallback remains the backstop.
size_t max_size = ggml_backend_buft_get_max_size(buft);
if (max_size > 0) {
ggml_gallocr* probe = ggml_gallocr_new(buft);
size_t sizes[1] = {0};
ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
ggml_gallocr_free(probe);
if (sizes[0] > max_size) {
LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling",
get_desc().c_str(),
sizes[0] / 1024.0 / 1024.0,
max_size / 1024.0 / 1024.0);
return false;
}
}
}

compute_allocr = ggml_gallocr_new(buft);

if (!ggml_gallocr_reserve(compute_allocr, gf)) {
// failed to allocate the compute buffer
Expand Down Expand Up @@ -3224,6 +3257,14 @@ struct GGMLRunner {
stream_layers_enabled = enabled;
}

// When enabled, the next compute() measures its planned compute buffer and
// declines to allocate (returning failure) if it would exceed the backend's
// max single-buffer size, instead of attempting the allocation and emitting
// the backend's raw error. See probe_compute_buffer_fits_.
void set_probe_compute_buffer_fits(bool enabled) {
probe_compute_buffer_fits_ = enabled;
}

// Accessor for this runner's layer registry. Returns a mutable reference to
// the member itself, not a copy.
sd::layer_registry::LayerRegistry& get_layer_registry() {
    return this->layer_registry_;
}

ggml_backend_t get_runtime_backend() {
Expand Down
47 changes: 47 additions & 0 deletions src/model/vae/vae.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,54 @@ struct VAE : public GGMLRunner {
"vae decode compute failed while processing a tile",
silent);
} else {
// AUTO mode (enabled=false, auto_tile=true): proactively measure the untiled decode's
// compute buffer and, if it would exceed the backend's max single-buffer size, decline
// to allocate so the fallback below kicks in *without* the backend printing its raw
// allocation error. The reactive output.empty() check still backstops genuine runtime
// OOM (planned size fits the max, but the device is out of memory).
const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile;
if (auto_probe) {
set_probe_compute_buffer_fits(true);
}
output = _compute(n_threads, input, true);
if (auto_probe) {
set_probe_compute_buffer_fits(false);
}
if (output.empty() && !tiling_params.enabled && tiling_params.auto_tile) {
// The untiled VAE decode compute buffer can exceed the backend's maximum single
// buffer / allocation size — common on integrated GPUs, where the ceiling is
// per-buffer (e.g. Vulkan maxBufferSize), not total memory. sd.cpp already supports
// tiling that keeps each compute buffer small, so fall back to it automatically
// instead of failing the whole decode. CPU remains the ultimate fallback if even a
// tiled buffer cannot be allocated.
free_compute_buffer();
if (!silent) {
LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling");
}
sd_tiling_params_t auto_tiling = tiling_params;
auto_tiling.enabled = true; // default tile size (32) via get_tile_sizes
set_tiling_params(auto_tiling);
const int scale_factor = get_scale_factor();
int64_t W = input.shape()[0] * scale_factor;
int64_t H = input.shape()[1] * scale_factor;
float tile_overlap;
int tile_size_x, tile_size_y;
get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]);
output = tiled_compute(
input,
n_threads,
static_cast<int>(W),
static_cast<int>(H),
scale_factor,
tile_size_x,
tile_size_y,
tile_overlap,
circular_x,
circular_y,
true,
"vae decode compute failed while processing a tile",
silent);
}
}

free_compute_buffer();
Expand Down
6 changes: 3 additions & 3 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ class StableDiffusionGGML {
bool apply_lora_immediately = false;

std::string taesd_path;
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true}; // auto_tile=true (AUTO)
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool stream_layers = false;
Expand Down Expand Up @@ -2868,7 +2868,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
sd_img_gen_params->batch_count = 1;
sd_img_gen_params->control_strength = 0.9f;
sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO)
sd_cache_params_init(&sd_img_gen_params->cache);
sd_hires_params_init(&sd_img_gen_params->hires);
}
Expand Down Expand Up @@ -2955,7 +2955,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
sd_vid_gen_params->fps = 16;
sd_vid_gen_params->moe_boundary = 0.875f;
sd_vid_gen_params->vace_strength = 1.f;
sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO)
sd_vid_gen_params->hires.enabled = false;
sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT;
sd_vid_gen_params->hires.scale = 2.f;
Expand Down
Loading