From 82c7c1e5eff838e94f5679847be069b88101feb1 Mon Sep 17 00:00:00 2001
From: Mark Caldwell
Date: Wed, 10 Jun 2026 07:36:29 -0700
Subject: [PATCH] feat: optional sequential component loading (--sequential-load)

Load the conditioner, run it, free it, then allocate and load the
diffusion model -- instead of holding all components resident at once.
Lowers peak device memory from ~sum(conditioner, diffusion, VAE) to
~max(conditioner, diffusion + VAE), so the fast "text encoder on GPU"
path fits memory-constrained cards that otherwise cannot hold all three
simultaneously.

Opt-in via --sequential-load (default off; no behavior change
otherwise). Single diffusion model only (skipped when a
high-noise/refiner model is also present). Backend-agnostic --
implemented in stable-diffusion.cpp using the existing
alloc_params_buffer() / ModelLoader::load_tensors(), with no backend
patches.

Validated on Strix Halo 8060S (Vulkan, LTX-2) and RX 6700 XT (RDNA2,
12GB): bit-identical output to the default path at a fixed seed, and
the flag-off path is byte-identical to before. Peak device memory on
RDNA2 (Flux Schnell Q4, 512^2) drops 9.78 -> 7.22 GB with no perf
regression.
--- examples/common/common.cpp | 5 +++ examples/common/common.h | 1 + include/stable-diffusion.h | 1 + src/stable-diffusion.cpp | 88 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 92 insertions(+), 3 deletions(-) diff --git a/examples/common/common.cpp b/examples/common/common.cpp index 3ae5faba7..54421eac3 100644 --- a/examples/common/common.cpp +++ b/examples/common/common.cpp @@ -473,6 +473,10 @@ ArgOptions SDContextParams::get_options() { "--vae-on-cpu", "keep vae in cpu (for low vram)", true, &vae_on_cpu}, + {"", + "--sequential-load", + "load the conditioner, run it, free it, then load the diffusion model; lowers peak device memory so the text-encoder-on-GPU path fits smaller cards (single diffusion model only)", + true, &sequential_load}, {"", "--fa", "use flash attention", @@ -817,6 +821,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f stream_layers, backend.c_str(), params_backend.c_str(), + sequential_load, }; return sd_ctx_params; } diff --git a/examples/common/common.h b/examples/common/common.h index a90a33132..cec604eb5 100644 --- a/examples/common/common.h +++ b/examples/common/common.h @@ -152,6 +152,7 @@ struct SDContextParams { bool control_net_cpu = false; bool clip_on_cpu = false; bool vae_on_cpu = false; + bool sequential_load = false; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 17596f849..882c703b0 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -226,6 +226,7 @@ typedef struct { bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram) const char* backend; const char* params_backend; + bool sequential_load; // load conditioner -> run -> free -> then load the diffusion model (lowers peak device memory) } sd_ctx_params_t; typedef struct { diff --git a/src/stable-diffusion.cpp 
b/src/stable-diffusion.cpp index 8ba4a463a..8e8745129 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -200,6 +200,18 @@ class StableDiffusionGGML { std::map tensors; + // --- Sequential (lazy) component loading ------------------------------- + // Load the conditioner, run it, free it, THEN allocate + load the diffusion + // model. Cuts peak device memory from sum(cond, DiT, VAE) to ~max(cond, + // DiT+VAE), so the fast text-encoder-on-GPU recipe fits cards that can't + // hold all three at once. Opt-in (--sequential-load), single diffusion model + // only, and backend-agnostic (Vulkan/CPU/CUDA). + bool seq_load_requested = false; // requested via --sequential-load + bool seq_load = false; // effective (requested && single DiT) + bool dit_params_loaded = true; // false while DiT load is deferred + std::set deferred_dit_keys; // diffusion-model tensor keys to load later + ModelLoader model_loader; // retained so the deferred DiT load can read the file + // lora_name => multiplier std::unordered_map curr_lora_state; @@ -293,7 +305,9 @@ class StableDiffusionGGML { } max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION)); - ModelLoader model_loader; + // model_loader is retained as a member so the deferred diffusion-model + // load can read the file after the conditioner has run + been freed. + seq_load_requested = sd_ctx_params->sequential_load; if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); @@ -774,6 +788,21 @@ class StableDiffusionGGML { get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION)); } + // Sequential load only applies to the single-diffusion-model case (no + // high-noise model). Capture the DiT tensor keys so we can skip its + // up-front alloc/load and bring it in after the conditioner is freed. 
+ seq_load = seq_load_requested && diffusion_model && !high_noise_diffusion_model; + if (seq_load) { + std::map dit_temp; + diffusion_model->get_param_tensors(dit_temp); + for (const auto& [k, t] : dit_temp) { + deferred_dit_keys.insert(k); + } + dit_params_loaded = false; + LOG_INFO("sequential load: deferring %zu diffusion-model tensors until after conditioning", + deferred_dit_keys.size()); + } + if (!ensure_backend_pair(SDBackendModule::VAE)) { return false; } @@ -1048,7 +1077,7 @@ class StableDiffusionGGML { ggml_free(ctx); return false; } - if (diffusion_model && !diffusion_model->alloc_params_buffer()) { + if (!seq_load && diffusion_model && !diffusion_model->alloc_params_buffer()) { LOG_ERROR("Diffusion model params buffer allocation failed"); ggml_free(ctx); return false; @@ -1081,7 +1110,19 @@ class StableDiffusionGGML { } } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + bool success; + if (seq_load) { + // First pass: load everything except the deferred diffusion-model tensors. + std::map first_tensors = tensors; + for (const auto& k : deferred_dit_keys) { + first_tensors.erase(k); + } + std::set first_ignore = ignore_tensors; + first_ignore.insert("model.diffusion_model."); // deferred — loaded after conditioning + success = model_loader.load_tensors(first_tensors, first_ignore, n_threads, sd_ctx_params->enable_mmap); + } else { + success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + } if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -1890,6 +1931,40 @@ class StableDiffusionGGML { *controls = std::move(*control_result); } + // Allocate + load the diffusion-model params that sequential loading deferred. + // No-op unless seq_load deferred them. Reads the retained model_loader, so the + // model file(s) are re-opened for the DiT tensors only (conditioner/VAE ignored). 
+ bool ensure_diffusion_model_loaded() { + if (dit_params_loaded) { + return true; + } + int64_t t0 = ggml_time_ms(); + if (!diffusion_model->alloc_params_buffer()) { + LOG_ERROR("sequential load: diffusion model params buffer allocation failed"); + return false; + } + std::map dit_tensors; + for (const auto& k : deferred_dit_keys) { + auto it = tensors.find(k); + if (it != tensors.end()) { + dit_tensors[k] = it->second; + } + } + // Ignore the (already-loaded) non-DiT components so they don't log as unknown. + std::set ignore = { + "text_encoders.", "cond_stage_model.", "first_stage_model.", + "vae.", "audio_vae", "alphas_cumprod", + }; + if (!model_loader.load_tensors(dit_tensors, ignore, n_threads, false)) { + LOG_ERROR("sequential load: deferred diffusion model tensor load failed"); + return false; + } + dit_params_loaded = true; + LOG_INFO("sequential load: diffusion model allocated + loaded in %.2fs", + (ggml_time_ms() - t0) * 1.0f / 1000); + return true; + } + sd::Tensor sample(const std::shared_ptr& work_diffusion_model, bool inverse_noise_scaling, const sd::Tensor& init_latent, @@ -1915,6 +1990,12 @@ class StableDiffusionGGML { float frame_rate, const sd_cache_params_t* cache_params, const sd::Tensor& video_positions = {}) { + // Sequential load: bring in the diffusion model now (after the conditioner + // has run and freed its buffer), just before the first denoise step. 
+ if (work_diffusion_model == diffusion_model && !ensure_diffusion_model_loaded()) { + LOG_ERROR("sequential load: diffusion model not available for sampling"); + return {}; + } std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; float img_cfg_scale = guidance.img_cfg; @@ -2703,6 +2784,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO; sd_ctx_params->backend = nullptr; sd_ctx_params->params_backend = nullptr; + sd_ctx_params->sequential_load = false; } char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {