diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 3ae5faba7..7bdb2bcc1 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -1064,6 +1064,12 @@ ArgOptions SDGenerationParams::get_options() { "process vae in tiles to reduce memory usage", true, &vae_tiling_params.enabled}, + {"", + "--no-vae-tiling-fallback", + "disable the automatic fallback to VAE tiling when an untiled decode would exceed the " + "backend's max buffer size (fail instead of tiling)", + false, + &vae_tiling_params.auto_tile}, {"", "--temporal-tiling", "enable temporal tiling for LTX video VAE decode", @@ -1808,6 +1814,9 @@ bool SDGenerationParams::from_json_str( if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) { vae_tiling_params.enabled = tiling_json["enabled"]; } + if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) { + vae_tiling_params.auto_tile = tiling_json["auto_tile"]; + } if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) { vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"]; } @@ -2621,10 +2630,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params, } if (gen_params.vae_tiling_params.enabled || + !gen_params.vae_tiling_params.auto_tile || gen_params.vae_tiling_params.temporal_tiling || !gen_params.extra_tiling_args.empty()) { root["vae_tiling"] = { {"enabled", gen_params.vae_tiling_params.enabled}, + {"auto_tile", gen_params.vae_tiling_params.auto_tile}, {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling}, {"tile_size_x", gen_params.vae_tiling_params.tile_size_x}, {"tile_size_y", gen_params.vae_tiling_params.tile_size_y}, diff --git a/examples/common/common.h b/examples/common/common.h index a90a33132..746c97d57 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -223,7 +223,7 @@ struct SDGenerationParams { int video_frames = 1; int fps = 16; float vace_strength = 1.f; - sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr}; + sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO) std::string extra_tiling_args; std::string pm_id_images_dir; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 17596f849..0b315f80e 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -153,7 +153,7 @@ enum lora_apply_mode_t { }; typedef struct { - bool enabled; + bool enabled; // true => always tile (ON) bool temporal_tiling; int tile_size_x; int tile_size_y; @@ -161,6 +161,11 @@ typedef struct { float rel_size_x; float rel_size_y; const char* extra_tiling_args; + // Tristate with `enabled`: enabled => ON (always tile); else auto_tile => AUTO (tile only when + // an untiled VAE compute buffer can't be allocated, e.g. it exceeds the backend's max buffer + // size on an iGPU); else OFF (never tile, fail if the untiled buffer doesn't fit). Default AUTO. + // Appended (rather than folded into an enum) to keep the struct ABI backward-compatible. + bool auto_tile; } sd_tiling_params_t; typedef struct { diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp index 7dc37cb9e..2f340f2d4 100644 --- a/src/core/ggml_extend.hpp +++ b/src/core/ggml_extend.hpp @@ -1710,6 +1710,12 @@ struct GGMLRunner { bool stream_layers_enabled = false; size_t observed_max_effective_budget_ = 0; + // When set, alloc_compute_buffer first measures the graph's planned compute + // buffer size (no allocation) and bails before allocating if it exceeds the + // backend's max single-buffer size. Used by VAE AUTO tiling to fall back to + // tiling proactively instead of attempting (and failing) a too-large decode. + bool probe_compute_buffer_fits_ = false; + sd::layer_registry::LayerRegistry layer_registry_; std::shared_ptr weight_adapter = nullptr; @@ -1898,7 +1904,34 @@ struct GGMLRunner { if (compute_allocr != nullptr) { return true; } - compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend)); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend); + + if (probe_compute_buffer_fits_) { + // Measure the planned compute buffer WITHOUT allocating (no_alloc + // planning) and bail before the real reserve if it exceeds the + // backend's max single-buffer size. This lets the caller (VAE AUTO + // tiling) fall back to tiling without the backend ever emitting its + // raw "allocation failed" error on the successful auto path. A + // genuine runtime OOM (planned size <= max, but the device is full) + // is NOT caught here -- it still surfaces from the real reserve + // below, so the reactive fallback remains the backstop. + size_t max_size = ggml_backend_buft_get_max_size(buft); + if (max_size > 0) { + ggml_gallocr* probe = ggml_gallocr_new(buft); + size_t sizes[1] = {0}; + ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes); + ggml_gallocr_free(probe); + if (sizes[0] > max_size) { + LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling", + get_desc().c_str(), + sizes[0] / 1024.0 / 1024.0, + max_size / 1024.0 / 1024.0); + return false; + } + } + } + + compute_allocr = ggml_gallocr_new(buft); if (!ggml_gallocr_reserve(compute_allocr, gf)) { // failed to allocate the compute buffer @@ -3224,6 +3257,14 @@ struct GGMLRunner { stream_layers_enabled = enabled; } + // When enabled, the next compute() measures its planned compute buffer and + // declines to allocate (returning failure) if it would exceed the backend's + // max single-buffer size, instead of attempting the allocation and emitting + // the backend's raw error. See probe_compute_buffer_fits_. + void set_probe_compute_buffer_fits(bool enabled) { + probe_compute_buffer_fits_ = enabled; + } + sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; } ggml_backend_t get_runtime_backend() { diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp index bd0ce6c4e..8212bc62d 100644 --- a/src/model/vae/vae.hpp +++ b/src/model/vae/vae.hpp @@ -194,7 +194,54 @@ struct VAE : public GGMLRunner { "vae decode compute failed while processing a tile", silent); } else { + // AUTO mode (enabled=false, auto_tile=true): proactively measure the untiled decode's + // compute buffer and, if it would exceed the backend's max single-buffer size, decline + // to allocate so the fallback below kicks in *without* the backend printing its raw + // allocation error. The reactive output.empty() check still backstops genuine runtime + // OOM (planned size fits the max, but the device is out of memory). + const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile; + if (auto_probe) { + set_probe_compute_buffer_fits(true); + } output = _compute(n_threads, input, true); + if (auto_probe) { + set_probe_compute_buffer_fits(false); + } + if (output.empty() && !tiling_params.enabled && tiling_params.auto_tile) { + // The untiled VAE decode compute buffer can exceed the backend's maximum single + // buffer / allocation size — common on integrated GPUs, where the ceiling is + // per-buffer (e.g. Vulkan maxBufferSize), not total memory. sd.cpp already supports + // tiling that keeps each compute buffer small, so fall back to it automatically + // instead of failing the whole decode. CPU remains the ultimate fallback if even a + // tiled buffer cannot be allocated. + free_compute_buffer(); + if (!silent) { + LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling"); + } + sd_tiling_params_t auto_tiling = tiling_params; + auto_tiling.enabled = true; // default tile size (32) via get_tile_sizes + set_tiling_params(auto_tiling); + const int scale_factor = get_scale_factor(); + int64_t W = input.shape()[0] * scale_factor; + int64_t H = input.shape()[1] * scale_factor; + float tile_overlap; + int tile_size_x, tile_size_y; + get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]); + output = tiled_compute( + input, + n_threads, + static_cast(W), + static_cast(H), + scale_factor, + tile_size_x, + tile_size_y, + tile_overlap, + circular_x, + circular_y, + true, + "vae decode compute failed while processing a tile", + silent); + } } free_compute_buffer(); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 8ba4a463a..da9f5a542 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -188,7 +188,7 @@ class StableDiffusionGGML { bool apply_lora_immediately = false; std::string taesd_path; - sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr}; + sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true}; // auto_tile=true (AUTO) bool offload_params_to_cpu = false; float max_vram = 0.f; bool stream_layers = false; @@ -2868,7 +2868,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->batch_count = 1; sd_img_gen_params->control_strength = 0.9f; sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f}; - sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr}; + sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO) sd_cache_params_init(&sd_img_gen_params->cache); sd_hires_params_init(&sd_img_gen_params->hires); } @@ -2955,7 +2955,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { sd_vid_gen_params->fps = 16; sd_vid_gen_params->moe_boundary = 0.875f; sd_vid_gen_params->vace_strength = 1.f; - sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr}; + sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true}; // auto_tile=true (AUTO) sd_vid_gen_params->hires.enabled = false; sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT; sd_vid_gen_params->hires.scale = 2.f;