leejet · RapidMark · Jun 8, 2026 · Jun 8, 2026 · Jun 9, 2026
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -1064,6 +1064,12 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--no-vae-tiling-fallback",
+         "disable the automatic fallback to VAE tiling when an untiled decode would exceed the "
+         "backend's max buffer size (fail instead of tiling)",
+         false,
+         &vae_tiling_params.auto_tile},
         {"",
          "--temporal-tiling",
          "enable temporal tiling for LTX video VAE decode",
@@ -1808,6 +1814,9 @@ bool SDGenerationParams::from_json_str(
         if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
             vae_tiling_params.enabled = tiling_json["enabled"];
         }
+        if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) {
+            vae_tiling_params.auto_tile = tiling_json["auto_tile"];
+        }
         if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
             vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
         }
@@ -2621,10 +2630,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
     }
 
     if (gen_params.vae_tiling_params.enabled ||
+        !gen_params.vae_tiling_params.auto_tile ||
         gen_params.vae_tiling_params.temporal_tiling ||
         !gen_params.extra_tiling_args.empty()) {
         root["vae_tiling"] = {
             {"enabled", gen_params.vae_tiling_params.enabled},
+            {"auto_tile", gen_params.vae_tiling_params.auto_tile},
             {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling},
             {"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
             {"tile_size_y", gen_params.vae_tiling_params.tile_size_y},

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -223,7 +223,7 @@ struct SDGenerationParams {
     int video_frames                     = 1;
     int fps                              = 16;
     float vace_strength                  = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     std::string extra_tiling_args;
 
     std::string pm_id_images_dir;

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -153,14 +153,19 @@ enum lora_apply_mode_t {
 };
 
 typedef struct {
-    bool enabled;
+    bool enabled;  // true => always tile (ON)
     bool temporal_tiling;
     int tile_size_x;
     int tile_size_y;
     float target_overlap;
     float rel_size_x;
     float rel_size_y;
     const char* extra_tiling_args;
+    // Tristate with `enabled`: enabled => ON (always tile); else auto_tile => AUTO (tile only when
+    // an untiled VAE compute buffer can't be allocated, e.g. it exceeds the backend's max buffer
+    // size on an iGPU); else OFF (never tile, fail if the untiled buffer doesn't fit). Default AUTO.
+    // Appended last (not an enum) so older 8-value initializers still compile; sizeof grows, so this is source- but not ABI-compatible.
+    bool auto_tile;
 } sd_tiling_params_t;
 
 typedef struct {

diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
@@ -1710,6 +1710,12 @@ struct GGMLRunner {
     bool stream_layers_enabled            = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // When set, alloc_compute_buffer first measures the graph's planned compute
+    // buffer size (no allocation) and bails before allocating if it exceeds the
+    // backend's max single-buffer size. Used by VAE AUTO tiling to fall back to
+    // tiling proactively instead of attempting (and failing) a too-large decode.
+    bool probe_compute_buffer_fits_ = false;
+
     sd::layer_registry::LayerRegistry layer_registry_;
 
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
@@ -1898,7 +1904,34 @@ struct GGMLRunner {
         if (compute_allocr != nullptr) {
             return true;
         }
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend);
+
+        if (probe_compute_buffer_fits_) {
+            // Measure the planned compute buffer WITHOUT allocating (no_alloc
+            // planning) and bail before the real reserve if it exceeds the
+            // backend's max single-buffer size. This lets the caller (VAE AUTO
+            // tiling) fall back to tiling without the backend ever emitting its
+            // raw "allocation failed" error on the successful auto path. A
+            // genuine runtime OOM (planned size <= max, but the device is full)
+            // is NOT caught here -- it still surfaces from the real reserve
+            // below, so the reactive fallback remains the backstop.
+            size_t max_size = ggml_backend_buft_get_max_size(buft);
+            if (max_size > 0) {
+                ggml_gallocr* probe = ggml_gallocr_new(buft);
+                size_t sizes[1]     = {0};
+                ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                ggml_gallocr_free(probe);
+                if (sizes[0] > max_size) {
+                    LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling",
+                              get_desc().c_str(),
+                              sizes[0] / 1024.0 / 1024.0,
+                              max_size / 1024.0 / 1024.0);
+                    return false;
+                }
+            }
+        }
+
+        compute_allocr = ggml_gallocr_new(buft);
 
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -3224,6 +3257,14 @@ struct GGMLRunner {
         stream_layers_enabled = enabled;
     }
 
+    // Toggles probe_compute_buffer_fits_. The flag is sticky: it stays in effect until the caller
+    // clears it again, and it is only consulted when a compute buffer still has to be allocated.
+    // While set, an allocation whose planned size exceeds the backend's max single-buffer size is
+    // declined (failure is returned) instead of being attempted and emitting the backend's raw error.
+    void set_probe_compute_buffer_fits(bool enabled) {
+        probe_compute_buffer_fits_ = enabled;
+    }
+
     sd::layer_registry::LayerRegistry& get_layer_registry() { return layer_registry_; }
 
     ggml_backend_t get_runtime_backend() {

diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
@@ -194,7 +194,54 @@ struct VAE : public GGMLRunner {
                 "vae decode compute failed while processing a tile",
                 silent);
         } else {
+            // AUTO mode (enabled=false, auto_tile=true): turn on the runner's size probe for this one
+            // untiled attempt, so a compute buffer planned larger than the backend's max
+            // single-buffer size is declined up front and the fallback below runs *without* the
+            // backend printing its raw allocation error. The output.empty() check further down still
+            // backstops a genuine runtime OOM (planned size fits the max, but the device is full).
+            const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile;
+            if (auto_probe) {
+                set_probe_compute_buffer_fits(true);
+            }
             output = _compute(n_threads, input, true);
+            if (auto_probe) {
+                set_probe_compute_buffer_fits(false);
+            }
+            if (output.empty() && !tiling_params.enabled && tiling_params.auto_tile) {
+                // Reached in AUTO mode whenever the untiled attempt yielded no output: the probe
+                // declined the allocation, or _compute failed otherwise (the warning below names only
+                // the buffer-limit case). The untiled buffer can exceed a backend's per-buffer ceiling
+                // (e.g. Vulkan maxBufferSize on integrated GPUs) even when total memory suffices, so
+                // retry with tiling, which keeps each compute buffer small. CPU fallback, if any, is
+                // outside this hunk. NOTE(review): set_tiling_params() is not undone here -- confirm.
+                free_compute_buffer();
+                if (!silent) {
+                    LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling");
+                }
+                sd_tiling_params_t auto_tiling = tiling_params;
+                auto_tiling.enabled            = true;  // default tile size (32) via get_tile_sizes
+                set_tiling_params(auto_tiling);
+                const int scale_factor = get_scale_factor();
+                int64_t W              = input.shape()[0] * scale_factor;
+                int64_t H              = input.shape()[1] * scale_factor;
+                float tile_overlap;
+                int tile_size_x, tile_size_y;
+                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]);
+                output = tiled_compute(
+                    input,
+                    n_threads,
+                    static_cast<int>(W),
+                    static_cast<int>(H),
+                    scale_factor,
+                    tile_size_x,
+                    tile_size_y,
+                    tile_overlap,
+                    circular_x,
+                    circular_y,
+                    true,
+                    "vae decode compute failed while processing a tile",
+                    silent);
+            }
         }
 
         free_compute_buffer();

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -188,7 +188,7 @@ class StableDiffusionGGML {
     bool apply_lora_immediately = false;
 
     std::string taesd_path;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true};  // auto_tile=true (AUTO)
     bool offload_params_to_cpu           = false;
     float max_vram                       = 0.f;
     bool stream_layers                   = false;
@@ -2868,7 +2868,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
     sd_img_gen_params->batch_count       = 1;
     sd_img_gen_params->control_strength  = 0.9f;
     sd_img_gen_params->pm_params         = {nullptr, 0, nullptr, 20.f};
-    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_cache_params_init(&sd_img_gen_params->cache);
     sd_hires_params_init(&sd_img_gen_params->hires);
 }
@@ -2955,7 +2955,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     sd_vid_gen_params->fps                                   = 16;
     sd_vid_gen_params->moe_boundary                          = 0.875f;
     sd_vid_gen_params->vace_strength                         = 1.f;
-    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_vid_gen_params->hires.enabled                         = false;
     sd_vid_gen_params->hires.upscaler                        = SD_HIRES_UPSCALER_LATENT;
     sd_vid_gen_params->hires.scale                           = 2.f;