Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,10 @@ ArgOptions SDContextParams::get_options() {
"--vae-on-cpu",
"keep vae in cpu (for low vram)",
true, &vae_on_cpu},
{"",
"--sequential-load",
"load the conditioner, run it, free it, then load the diffusion model; lowers peak device memory so the text-encoder-on-GPU path fits smaller cards (single diffusion model only)",
true, &sequential_load},
{"",
"--fa",
"use flash attention",
Expand Down Expand Up @@ -817,6 +821,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
stream_layers,
backend.c_str(),
params_backend.c_str(),
sequential_load,
};
return sd_ctx_params;
}
Expand Down
1 change: 1 addition & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ struct SDContextParams {
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
bool sequential_load = false;
bool flash_attn = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
Expand Down
1 change: 1 addition & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ typedef struct {
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
const char* backend;
const char* params_backend;
bool sequential_load; // load conditioner -> run -> free -> then load the diffusion model (lowers peak device memory)
} sd_ctx_params_t;

typedef struct {
Expand Down
88 changes: 85 additions & 3 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,18 @@ class StableDiffusionGGML {

std::map<std::string, ggml_tensor*> tensors;

// --- Sequential (lazy) component loading -------------------------------
// Load the conditioner, run it, free it, THEN allocate + load the diffusion
// model. Cuts peak device memory from sum(cond, DiT, VAE) to ~max(cond,
// DiT+VAE), so the fast text-encoder-on-GPU recipe fits cards that can't
// hold all three at once. Opt-in (--sequential-load), single diffusion model
// only, and backend-agnostic (Vulkan/CPU/CUDA).
bool seq_load_requested = false; // requested via --sequential-load
bool seq_load = false; // effective (requested && single DiT)
bool dit_params_loaded = true; // false while DiT load is deferred
std::set<std::string> deferred_dit_keys; // diffusion-model tensor keys to load later
ModelLoader model_loader; // retained so the deferred DiT load can read the file

// lora_name => multiplier
std::unordered_map<std::string, float> curr_lora_state;

Expand Down Expand Up @@ -293,7 +305,9 @@ class StableDiffusionGGML {
}
max_vram = sd::ggml_graph_cut::resolve_max_vram_gib(max_vram, backend_for(SDBackendModule::DIFFUSION));

ModelLoader model_loader;
// model_loader is retained as a member so the deferred diffusion-model
// load can read the file after the conditioner has run + been freed.
seq_load_requested = sd_ctx_params->sequential_load;

if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
Expand Down Expand Up @@ -774,6 +788,21 @@ class StableDiffusionGGML {
get_param_tensors(high_noise_diffusion_model, module_can_mmap(SDBackendModule::DIFFUSION));
}

// Sequential load only applies to the single-diffusion-model case (no
// high-noise model). Capture the DiT tensor keys so we can skip its
// up-front alloc/load and bring it in after the conditioner is freed.
seq_load = seq_load_requested && diffusion_model && !high_noise_diffusion_model;
if (seq_load) {
std::map<std::string, ggml_tensor*> dit_temp;
diffusion_model->get_param_tensors(dit_temp);
for (const auto& [k, t] : dit_temp) {
deferred_dit_keys.insert(k);
}
dit_params_loaded = false;
LOG_INFO("sequential load: deferring %zu diffusion-model tensors until after conditioning",
deferred_dit_keys.size());
}

if (!ensure_backend_pair(SDBackendModule::VAE)) {
return false;
}
Expand Down Expand Up @@ -1048,7 +1077,7 @@ class StableDiffusionGGML {
ggml_free(ctx);
return false;
}
if (diffusion_model && !diffusion_model->alloc_params_buffer()) {
if (!seq_load && diffusion_model && !diffusion_model->alloc_params_buffer()) {
LOG_ERROR("Diffusion model params buffer allocation failed");
ggml_free(ctx);
return false;
Expand Down Expand Up @@ -1081,7 +1110,19 @@ class StableDiffusionGGML {
}
}

bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
bool success;
if (seq_load) {
// First pass: load everything except the deferred diffusion-model tensors.
std::map<std::string, ggml_tensor*> first_tensors = tensors;
for (const auto& k : deferred_dit_keys) {
first_tensors.erase(k);
}
std::set<std::string> first_ignore = ignore_tensors;
first_ignore.insert("model.diffusion_model."); // deferred — loaded after conditioning
success = model_loader.load_tensors(first_tensors, first_ignore, n_threads, sd_ctx_params->enable_mmap);
} else {
success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
}
if (!success) {
LOG_ERROR("load tensors from model loader failed");
ggml_free(ctx);
Expand Down Expand Up @@ -1890,6 +1931,40 @@ class StableDiffusionGGML {
*controls = std::move(*control_result);
}

// Allocate + load the diffusion-model params that sequential loading deferred.
// No-op unless seq_load deferred them. Reads the retained model_loader, so the
// model file(s) are re-opened for the DiT tensors only (conditioner/VAE ignored).
bool ensure_diffusion_model_loaded() {
if (dit_params_loaded) {
return true;
}
int64_t t0 = ggml_time_ms();
if (!diffusion_model->alloc_params_buffer()) {
LOG_ERROR("sequential load: diffusion model params buffer allocation failed");
return false;
}
std::map<std::string, ggml_tensor*> dit_tensors;
for (const auto& k : deferred_dit_keys) {
auto it = tensors.find(k);
if (it != tensors.end()) {
dit_tensors[k] = it->second;
}
}
// Ignore the (already-loaded) non-DiT components so they don't log as unknown.
std::set<std::string> ignore = {
"text_encoders.", "cond_stage_model.", "first_stage_model.",
"vae.", "audio_vae", "alphas_cumprod",
};
if (!model_loader.load_tensors(dit_tensors, ignore, n_threads, false)) {
LOG_ERROR("sequential load: deferred diffusion model tensor load failed");
return false;
}
dit_params_loaded = true;
LOG_INFO("sequential load: diffusion model allocated + loaded in %.2fs",
(ggml_time_ms() - t0) * 1.0f / 1000);
return true;
}

sd::Tensor<float> sample(const std::shared_ptr<DiffusionModelRunner>& work_diffusion_model,
bool inverse_noise_scaling,
const sd::Tensor<float>& init_latent,
Expand All @@ -1915,6 +1990,12 @@ class StableDiffusionGGML {
float frame_rate,
const sd_cache_params_t* cache_params,
const sd::Tensor<float>& video_positions = {}) {
// Sequential load: bring in the diffusion model now (after the conditioner
// has run and freed its buffer), just before the first denoise step.
if (work_diffusion_model == diffusion_model && !ensure_diffusion_model_loaded()) {
LOG_ERROR("sequential load: diffusion model not available for sampling");
return {};
}
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
float cfg_scale = guidance.txt_cfg;
float img_cfg_scale = guidance.img_cfg;
Expand Down Expand Up @@ -2703,6 +2784,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
sd_ctx_params->vae_format = SD_VAE_FORMAT_AUTO;
sd_ctx_params->backend = nullptr;
sd_ctx_params->params_backend = nullptr;
sd_ctx_params->sequential_load = false;
}

char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
Expand Down
Loading