diff --git a/examples/common/common.cpp b/examples/common/common.cpp
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -423,6 +423,10 @@ ArgOptions SDContextParams::get_options() {
          "--params-backend",
          "parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
          &params_backend},
+        {"",
+         "--dit-split",
+         "comma-separated GPU names to row-split DiT weights across (e.g. cuda2,cuda3). Uses GGML tensor parallelism",
+         &dit_split_devices},
     };
 
     options.int_options = {
@@ -449,6 +453,10 @@ ArgOptions SDContextParams::get_options() {
          "--stream-layers",
          "enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
          true, &stream_layers},
+        {"",
+         "--dit-layer-split",
+         "use layer-split (assign whole DiT blocks to different GPUs) instead of row-split; requires --dit-split",
+         true, &dit_layer_split},
         {"",
          "--force-sdxl-vae-conv-scale",
          "force use of conv scale on sdxl vae",
@@ -815,6 +823,8 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         str_to_vae_format(vae_format),
         max_vram,
         stream_layers,
+        dit_split_devices.empty() ? nullptr : dit_split_devices.c_str(),
+        dit_layer_split,
         backend.c_str(),
         params_backend.c_str(),
     };
diff --git a/examples/common/common.h b/examples/common/common.h
index a90a33132..e1e6d3af8 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -146,6 +146,8 @@ struct SDContextParams {
     bool offload_params_to_cpu = false;
     float max_vram = 0.f;
     bool stream_layers = false;
+    std::string dit_split_devices;
+    bool dit_layer_split = false;
     std::string backend;
     std::string params_backend;
     bool enable_mmap = false;
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -224,6 +224,8 @@ typedef struct {
     enum sd_vae_format_t vae_format;
     float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
     bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
+    const char* dit_split_devices; // Comma-separated GPU names (e.g. "cuda2,cuda3")
+    bool dit_layer_split; // true = layer-split (whole DiT blocks per GPU), false = row-split (tensor parallel)
     const char* backend;
     const char* params_backend;
 } sd_ctx_params_t;
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index d0326a192..5654e5a2c 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1675,6 +1675,8 @@ struct GGMLRunnerContext {
 };
 
 struct GGMLRunner {
+public:
+    void set_dit_split_buft(ggml_backend_buffer_type_t buft) { dit_split_buft = buft; }
 protected:
     typedef std::function get_graph_cb_t;
     using GraphCutSegment = sd::ggml_graph_cut::Segment;
@@ -1687,6 +1689,8 @@ struct GGMLRunner {
     ggml_backend_buffer_t params_buffer = nullptr;
     ggml_context* offload_ctx = nullptr;
     ggml_backend_buffer_t runtime_params_buffer = nullptr;
+    ggml_backend_buffer_type_t dit_split_buft = nullptr; // optional tensor-split buffer for DiT weights
+    ggml_backend_sched_t dit_sched = nullptr; // optional multi-backend sched for layer-split
     bool params_on_runtime_backend = false;
 
     ggml_context* cache_ctx = nullptr;
@@ -1895,6 +1899,15 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        // Multi-backend sched path
+        if (dit_sched != nullptr) {
+            if (!ggml_backend_sched_reserve(dit_sched, gf)) {
+                LOG_ERROR("sched reserve failed");
+                return false;
+            }
+            return true;
+        }
+        // Single-backend path
         if (compute_allocr != nullptr) {
             return true;
         }
@@ -2121,7 +2134,9 @@ struct GGMLRunner {
         num_tensors = ggml_tensor_num(offload_ctx);
         GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));
 
-        runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
+        runtime_params_buffer = dit_split_buft
+            ? ggml_backend_alloc_ctx_tensors_from_buft(offload_ctx, dit_split_buft)
+            : ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
 
         if (runtime_params_buffer == nullptr) {
             LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
@@ -2676,7 +2691,12 @@ struct GGMLRunner {
         }
 
         int64_t t_compute_begin = ggml_time_ms();
-        ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
+        ggml_status status;
+        if (dit_sched != nullptr) {
+            status = ggml_backend_sched_graph_compute(dit_sched, gf);
+        } else {
+            status = ggml_backend_graph_compute(runtime_backend, gf);
+        }
         int64_t t_compute_end = ggml_time_ms();
         if (status != GGML_STATUS_SUCCESS) {
             LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
diff --git a/src/model/diffusion/ltxv.hpp b/src/model/diffusion/ltxv.hpp
--- a/src/model/diffusion/ltxv.hpp
+++ b/src/model/diffusion/ltxv.hpp
@@ -1673,6 +1673,18 @@ namespace LTXV {
         }
     };
 
+    // Multi-backend layer-split support (from PR #1470, simplified for LTXAVRunner)
+    enum class MultiBackendMode { LAYER_SPLIT, ROW_SPLIT };
+    struct MultiBackendSpec {
+        MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT;
+        std::vector<ggml_backend_t> additional_backends;
+        std::function<ggml_backend_t(const std::string&)> tensor_backend_fn;
+    };
+    inline MultiBackendSpec*& g_pending_multi_backend_spec() {
+        static MultiBackendSpec* spec = nullptr;
+        return spec;
+    }
+
     struct LTXAVRunner : public DiffusionModelRunner {
         LTXAVConfig config;
         LTXAVModelBlock model;
@@ -1693,8 +1705,40 @@ namespace LTXV {
               config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
               model(config) {
             model.init(params_ctx, tensor_storage_map, prefix);
+            // Pick up the layer-split spec if one was staged right before this ctor.
+            auto* spec = g_pending_multi_backend_spec();
+            if (spec != nullptr) {
+                multi_kind     = spec->mode;
+                extra_backends = std::move(spec->additional_backends);
+                tensor_fn      = std::move(spec->tensor_backend_fn);
+                delete spec;
+                g_pending_multi_backend_spec() = nullptr;
+                // Create the multi-backend sched: primary backend first, then the extra GPUs, CPU last.
+                if (!extra_backends.empty()) {
+                    multi_mode = true;
+                    std::vector<ggml_backend_t> backends;
+                    backends.push_back(runtime_backend);
+                    for (auto* b : extra_backends) backends.push_back(b);
+                    // NOTE(review): the CPU backend, the extra backends and the sched are not released in this change.
+                    auto* cpu_be = sd_backend_cpu_init();
+                    backends.push_back(cpu_be);
+                    std::vector<ggml_backend_buffer_type_t> bufts;
+                    for (auto* b : backends) bufts.push_back(ggml_backend_get_default_buffer_type(b));
+                    // Stored in the inherited GGMLRunner::dit_sched, the member the base-class sched branches test.
+                    dit_sched = ggml_backend_sched_new(backends.data(), bufts.data(), (int)backends.size(), 8192, false, false);
+                    LOG_INFO("ltxav layer-split sched: %zu backends", backends.size());
+                }
+            }
         }
 
+        // Layer-split state (only for DiT). The scheduler handle itself is the inherited
+        // GGMLRunner::dit_sched; declaring another dit_sched here would shadow it and the
+        // base-class checks (dit_sched != nullptr) would never observe the scheduler.
+        bool multi_mode = false;
+        MultiBackendMode multi_kind = MultiBackendMode::LAYER_SPLIT;
+        std::vector<ggml_backend_t> extra_backends;
+        std::function<ggml_backend_t(const std::string&)> tensor_fn = nullptr;
+
         std::string get_desc() override {
             return "ltxav";
         }
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -5,6 +5,7 @@
 
 #include "core/ggml_extend.hpp"
 #include "core/ggml_graph_cut.h"
+#include "ggml-cuda.h"
 #include "core/rng.hpp"
 #include "core/rng_mt19937.hpp"
 
@@ -623,10 +624,116 @@ class StableDiffusionGGML {
                 cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
                                                     params_backend_for(SDBackendModule::TE),
                                                     tensor_storage_map);
+
+                // Parse "cuda<N>[,cuda<M>...]" into CUDA device ordinals, in listed order.
+                // Tokens that do not start with "cuda" are skipped silently; a "cuda" token whose
+                // suffix is not an ordinal in [0, GGML_CUDA_MAX_DEVICES) is skipped with a warning
+                // so that std::stoi cannot throw and ts[] below cannot be indexed out of range.
+                auto parse_cuda_ordinals = [](const char* list) -> std::vector<int> {
+                    std::vector<int> ordinals;
+                    if (list == nullptr) {
+                        return ordinals;
+                    }
+                    const std::string text(list);
+                    for (size_t pos = 0; pos < text.size();) {
+                        const size_t comma      = text.find(',', pos);
+                        const std::string token = text.substr(pos, comma == std::string::npos ? comma : comma - pos);
+                        pos                     = (comma == std::string::npos) ? text.size() : comma + 1;
+                        if (token.find("cuda") != 0) {
+                            continue;
+                        }
+                        const std::string digits = token.substr(4);
+                        const bool numeric       = !digits.empty() && digits.size() <= 4 &&
+                                                   digits.find_first_not_of("0123456789") == std::string::npos;
+                        const int ordinal        = numeric ? std::stoi(digits) : -1;
+                        if (ordinal < 0 || ordinal >= GGML_CUDA_MAX_DEVICES) {
+                            LOG_WARN("--dit-split: ignoring invalid device '%s'", token.c_str());
+                            continue;
+                        }
+                        ordinals.push_back(ordinal);
+                    }
+                    return ordinals;
+                };
+                const std::vector<int> dit_devices = parse_cuda_ordinals(sd_ctx_params->dit_split_devices);
+                const bool dit_split_requested     = sd_ctx_params->dit_split_devices != nullptr &&
+                                                     sd_ctx_params->dit_split_devices[0] != '\0';
+                if (dit_split_requested && dit_devices.size() < 2) {
+                    LOG_WARN("--dit-split needs >=2 GPUs, got '%s'", sd_ctx_params->dit_split_devices);
+                }
+                const bool dit_split_enabled = dit_devices.size() >= 2;
+
+                // Layer-split setup: stage the spec before the LTXAVRunner ctor below consumes it.
+                if (sd_ctx_params->dit_layer_split && dit_split_enabled) {
+                    auto* spec = new LTXV::MultiBackendSpec();
+                    // The first listed device is treated as the primary one (same choice as the
+                    // row-split main device) instead of a hard-coded ordinal 2.
+                    // NOTE(review): assumes the diffusion runtime backend lives on that device - confirm.
+                    const int primary_device = dit_devices.front();
+                    for (int d : dit_devices) {
+                        if (d == primary_device) {
+                            continue;
+                        }
+                        // NOTE(review): ggml_backend_dev_get() takes a device-registry index; this assumes
+                        // CUDA devices are registered first so that index == CUDA ordinal - confirm.
+                        auto* devp           = ggml_backend_dev_get(d);
+                        ggml_backend_t extra = devp ? ggml_backend_dev_init(devp, nullptr) : nullptr;
+                        if (extra == nullptr) {
+                            LOG_WARN("--dit-split: failed to init device cuda%d, skipping it", d);
+                            continue;
+                        }
+                        spec->additional_backends.push_back(extra);
+                    }
+                    const std::vector<ggml_backend_t> extras = spec->additional_backends;
+                    const int n_targets                      = 1 + (int)extras.size();
+                    // Maps a tensor name to the extra backend owning its transformer block.
+                    // nullptr means "no explicit assignment" (names without a block index, or the first slice).
+                    spec->tensor_backend_fn = [extras, n_targets](const std::string& n) -> ggml_backend_t {
+                        const std::string marker = "transformer_blocks.";
+                        const size_t at          = n.find(marker);
+                        if (at == std::string::npos || extras.empty()) {
+                            return nullptr;
+                        }
+                        // Block index starts right after the marker (previously a hard-coded +21, two chars too far).
+                        const size_t first = at + marker.size();
+                        const size_t dot   = n.find('.', first);
+                        if (dot == std::string::npos || dot == first || dot - first > 4 ||
+                            n.find_first_not_of("0123456789", first) < dot) {
+                            return nullptr;
+                        }
+                        const int blk = std::stoi(n.substr(first, dot - first));
+                        // 24 is the block count hard-coded by the original change; clamp so the divisor is never 0.
+                        int per_target = 24 / n_targets;
+                        if (per_target < 1) {
+                            per_target = 1;
+                        }
+                        const int slot = blk / per_target;
+                        if (slot <= 0) {
+                            return nullptr;
+                        }
+                        const size_t last = extras.size() - 1;
+                        const size_t idx  = (size_t)(slot - 1) < last ? (size_t)(slot - 1) : last;
+                        return extras[idx];
+                    };
+                    LTXV::g_pending_multi_backend_spec() = spec;
+                    LOG_INFO("DiT layer-split across %zu GPUs", dit_devices.size());
+                }
                 diffusion_model = std::make_shared(backend_for(SDBackendModule::DIFFUSION),
                                                    params_backend_for(SDBackendModule::DIFFUSION),
                                                    tensor_storage_map,
                                                    "model.diffusion_model");
+
+                // === DiT Tensor Split across GPUs ===
+                // Row-split only: dit_layer_split selects between the two modes, so both are never applied together.
+                if (!sd_ctx_params->dit_layer_split && dit_split_enabled) {
+                    float ts[GGML_CUDA_MAX_DEVICES] = {};
+                    const float frac                = 1.0f / (float)dit_devices.size();
+                    for (int d : dit_devices) {
+                        ts[d] = frac;
+                    }
+                    diffusion_model->set_dit_split_buft(ggml_backend_cuda_split_buffer_type(dit_devices.front(), ts));
+                    LOG_INFO("DiT tensor split across GPUs: %s (%.0f%% each)",
+                             sd_ctx_params->dit_split_devices, frac * 100.0f);
+                }
             } else if (sd_version_is_wan(version)) {
                 cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE),
                                                     params_backend_for(SDBackendModule::TE),