Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,10 @@ ArgOptions SDContextParams::get_options() {
"--params-backend",
"parameter backend assignment, e.g. cpu or diffusion=cpu,clip=cpu",
&params_backend},
{"",
"--dit-split",
"comma-separated GPU names to row-split DiT weights across (e.g. cuda2,cuda3). Uses GGML tensor parallelism",
&dit_split_devices},
};

options.int_options = {
Expand All @@ -449,6 +453,10 @@ ArgOptions SDContextParams::get_options() {
"--stream-layers",
"enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram; defaults to false)",
true, &stream_layers},
{"",
"--dit-layer-split",
"use layer-split (true tensor parallelism assigning DiT blocks to different GPUs) with --dit-split",
true, &dit_layer_split},
{"",
"--force-sdxl-vae-conv-scale",
"force use of conv scale on sdxl vae",
Expand Down Expand Up @@ -815,6 +823,8 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
str_to_vae_format(vae_format),
max_vram,
stream_layers,
dit_split_devices.empty() ? nullptr : dit_split_devices.c_str(),
dit_layer_split,
backend.c_str(),
params_backend.c_str(),
};
Expand Down Expand Up @@ -2170,7 +2180,7 @@ bool SDGenerationParams::validate(SDMode mode) {
return false;
}
if (hires_denoising_strength <= 0.f || hires_denoising_strength > 1.f) {
LOG_ERROR("error: hires denoising strength must be in (0.0, 1.0]");
LOG_ERROR("error: hires denoising strength must be in [0.0, 1.0]");
return false;
}
if (!hires_custom_sigmas.empty() && hires_custom_sigmas.size() < 2) {
Expand Down
2 changes: 2 additions & 0 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ struct SDContextParams {
bool offload_params_to_cpu = false;
float max_vram = 0.f;
bool stream_layers = false;
std::string dit_split_devices;
bool dit_layer_split = false;
std::string backend;
std::string params_backend;
bool enable_mmap = false;
Expand Down
2 changes: 2 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ typedef struct {
enum sd_vae_format_t vae_format;
float max_vram; // GiB budget for graph-cut segmented param offload (0 = disabled, -1 = auto free VRAM minus 1 GiB)
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
const char* dit_split_devices; // Comma-separated GPU names to split DiT weights across (e.g. "cuda2,cuda3"); NULL when no split was requested
bool dit_layer_split; // true = layer-split (DiT blocks assigned to different GPUs), false = row-split; used together with dit_split_devices
const char* backend;
const char* params_backend;
} sd_ctx_params_t;
Expand Down
Binary file added output/dit_split_161f.webm
Binary file not shown.
Binary file added output/dit_split_test.webm
Binary file not shown.
24 changes: 22 additions & 2 deletions src/core/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1675,6 +1675,8 @@ struct GGMLRunnerContext {
};

struct GGMLRunner {
public:
// Stores the optional split buffer type for this runner. When it is non-null the
// runtime params buffer is allocated from this buffer type instead of from the
// runtime backend's default allocation; pass nullptr to keep the default.
void set_dit_split_buft(ggml_backend_buffer_type_t buft) {
    this->dit_split_buft = buft;
}
protected:
typedef std::function<ggml_cgraph*()> get_graph_cb_t;
using GraphCutSegment = sd::ggml_graph_cut::Segment;
Expand All @@ -1687,6 +1689,8 @@ struct GGMLRunner {
ggml_backend_buffer_t params_buffer = nullptr;
ggml_context* offload_ctx = nullptr;
ggml_backend_buffer_t runtime_params_buffer = nullptr;
ggml_backend_buffer_type_t dit_split_buft = nullptr; // optional tensor-split buffer for DiT weights
ggml_backend_sched_t dit_sched = nullptr; // optional multi-backend sched for layer-split
bool params_on_runtime_backend = false;

ggml_context* cache_ctx = nullptr;
Expand Down Expand Up @@ -1895,6 +1899,15 @@ struct GGMLRunner {
}

bool alloc_compute_buffer(ggml_cgraph* gf) {
// Multi-backend sched path
if (dit_sched != nullptr) {
if (!ggml_backend_sched_reserve(dit_sched, gf)) {
LOG_ERROR("sched reserve failed");
return false;
}
return true;
}
// Single-backend path
if (compute_allocr != nullptr) {
return true;
}
Expand Down Expand Up @@ -2121,7 +2134,9 @@ struct GGMLRunner {
num_tensors = ggml_tensor_num(offload_ctx);
GGML_ASSERT(num_tensors == ggml_tensor_num(params_ctx));

runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);
runtime_params_buffer = dit_split_buft
? ggml_backend_alloc_ctx_tensors_from_buft(offload_ctx, dit_split_buft)
: ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend);

if (runtime_params_buffer == nullptr) {
LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i",
Expand Down Expand Up @@ -2676,7 +2691,12 @@ struct GGMLRunner {
}

int64_t t_compute_begin = ggml_time_ms();
ggml_status status = ggml_backend_graph_compute(runtime_backend, gf);
ggml_status status;
if (dit_sched != nullptr) {
status = ggml_backend_sched_graph_compute(dit_sched, gf);
} else {
status = ggml_backend_graph_compute(runtime_backend, gf);
}
int64_t t_compute_end = ggml_time_ms();
if (status != GGML_STATUS_SUCCESS) {
LOG_ERROR("%s compute failed: %s", get_desc().c_str(), ggml_status_to_string(status));
Expand Down
38 changes: 38 additions & 0 deletions src/model/diffusion/ltxv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1673,6 +1673,18 @@ namespace LTXV {
}
};

// Multi-backend layer-split support (from PR #1470, simplified for LTXAVRunner).

// Selects how a model is distributed across several backends.
enum class MultiBackendMode {
    LAYER_SPLIT,
    ROW_SPLIT
};

// Describes a multi-backend setup that is handed over to a runner at construction time.
struct MultiBackendSpec {
    // Distribution strategy; layer-split unless set otherwise.
    MultiBackendMode mode = MultiBackendMode::LAYER_SPLIT;
    // Backends to use in addition to the runner's own runtime backend.
    std::vector<ggml_backend_t> additional_backends;
    // Optional callback that maps a tensor name to a backend.
    std::function<ggml_backend_t(const std::string&)> tensor_backend_fn;
};

// Returns a reference to a single shared "pending spec" pointer slot (initially nullptr).
// The slot stores a raw pointer and performs no cleanup of its own: whoever takes the
// spec out of the slot is responsible for deleting it and resetting the slot to nullptr.
inline MultiBackendSpec*& g_pending_multi_backend_spec() {
    static MultiBackendSpec* pending_spec = nullptr;
    return pending_spec;
}

struct LTXAVRunner : public DiffusionModelRunner {
LTXAVConfig config;
LTXAVModelBlock model;
Expand All @@ -1693,8 +1705,34 @@ namespace LTXV {
config(LTXAVConfig::detect_from_weights(tensor_storage_map, prefix)),
model(config) {
model.init(params_ctx, tensor_storage_map, prefix);
// Pick up layer-split spec if set right before this ctor
auto* spec = g_pending_multi_backend_spec();
if (spec != nullptr) {
extra_backends = std::move(spec->additional_backends);
delete spec;
g_pending_multi_backend_spec() = nullptr;
// Create multi-backend sched
if (!extra_backends.empty()) {
std::vector<ggml_backend_t> backends;
backends.push_back(runtime_backend);
for (auto* b : extra_backends) backends.push_back(b);
auto* cpu_be = sd_backend_cpu_init();
backends.push_back(cpu_be);
std::vector<ggml_backend_buffer_type_t> bufts;
for (auto* b : backends) bufts.push_back(ggml_backend_get_default_buffer_type(b));
dit_sched = ggml_backend_sched_new(backends.data(), bufts.data(), (int)backends.size(), 8192, false, false);
LOG_INFO("ltxav layer-split sched: %zu backends", backends.size());
}
}
}

// Layer-split state (only for DiT).
//
// The scheduler handle is intentionally NOT declared here: GGMLRunner already owns a
// `dit_sched` member, and that is the one its alloc_compute_buffer()/compute paths read.
// Re-declaring `dit_sched` in this class would shadow the base member, so the scheduler
// created in the constructor would be stored in the derived copy and never be used.
bool multi_mode = false;                                          // currently not written by the visible code
MultiBackendMode multi_kind = MultiBackendMode::LAYER_SPLIT;      // split strategy, defaults to layer-split
std::vector<ggml_backend_t> extra_backends;                       // additional backends taken from the pending MultiBackendSpec
std::function<ggml_backend_t(const std::string&)> tensor_fn = nullptr;  // optional tensor-name -> backend callback

std::string get_desc() override {
return "ltxav";
}
Expand Down
59 changes: 59 additions & 0 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "core/ggml_extend.hpp"
#include "core/ggml_graph_cut.h"
#include "ggml-cuda.h"

#include "core/rng.hpp"
#include "core/rng_mt19937.hpp"
Expand Down Expand Up @@ -623,10 +624,68 @@ class StableDiffusionGGML {
cond_stage_model = std::make_shared<LTXAVEmbedder>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
tensor_storage_map);

// Layer-split setup (before LTXAVRunner ctor).
// Parses the comma-separated device list ("cudaN,cudaM,...") and, when at least two
// devices are named, publishes a heap-allocated MultiBackendSpec through
// LTXV::g_pending_multi_backend_spec() so the LTXAVRunner constructor below can pick it up.
if (sd_ctx_params->dit_layer_split && sd_ctx_params->dit_split_devices) {
    // Parses a non-negative decimal integer. Returns -1 for empty, non-numeric or
    // overly long text instead of throwing like std::stoi does (e.g. on a bare "cuda").
    auto parse_index = [](const std::string& text) -> int {
        if (text.empty() || text.size() > 6) {
            return -1;
        }
        int value = 0;
        for (char ch : text) {
            if (ch < '0' || ch > '9') {
                return -1;
            }
            value = value * 10 + (ch - '0');
        }
        return value;
    };

    std::vector<int> devs;
    std::string s(sd_ctx_params->dit_split_devices);
    for (size_t p = 0; p < s.size();) {
        auto c = s.find(',', p);
        auto t = s.substr(p, c == std::string::npos ? c : c - p);
        p      = (c == std::string::npos) ? s.size() : c + 1;
        if (t.find("cuda") != 0) {
            continue;  // only "cuda<N>" tokens are recognised
        }
        int idx = parse_index(t.substr(4));
        if (idx < 0) {
            LOG_WARN("--dit-split: ignoring invalid device name '%s'", t.c_str());
            continue;
        }
        devs.push_back(idx);
    }
    if (devs.size() >= 2) {
        auto* spec = new LTXV::MultiBackendSpec();
        for (int d : devs) {
            // NOTE(review): device 2 is skipped unconditionally; this looks like it assumes
            // the main diffusion backend is always cuda2 -- confirm and derive it from the
            // actual runtime backend instead of hard-coding it.
            if (d == 2) {
                continue;
            }
            // ggml_backend_dev_get() takes an index into the global device registry, so make
            // sure the index is in range before calling it.
            // NOTE(review): the registry index is not necessarily the CUDA ordinal -- confirm.
            if (static_cast<size_t>(d) >= ggml_backend_dev_count()) {
                LOG_WARN("--dit-split: device index %d is out of range, skipping", d);
                continue;
            }
            auto* devp = ggml_backend_dev_get(d);
            if (devp == nullptr) {
                continue;
            }
            ggml_backend_t extra = ggml_backend_dev_init(devp, nullptr);
            if (extra == nullptr) {
                // Never hand a null backend to the scheduler.
                LOG_WARN("--dit-split: failed to initialise device %d, skipping", d);
                continue;
            }
            spec->additional_backends.push_back(extra);
        }
        const int num_devs = (int)devs.size();
        spec->tensor_backend_fn = [num_devs, parse_index](const std::string& n) -> ggml_backend_t {
            static const char block_prefix[] = "transformer_blocks.";
            auto bp = n.find(block_prefix);
            if (bp != std::string::npos) {
                // Skip exactly the prefix (19 characters) so the block number that follows is
                // what gets parsed; a larger offset would jump past the number.
                bp += sizeof(block_prefix) - 1;
                auto dot = n.find('.', bp);
                if (dot != std::string::npos) {
                    int blk = parse_index(n.substr(bp, dot - bp));
                    // NOTE(review): 24 is a hard-coded block count -- confirm against the model config.
                    int mid = 24 / num_devs;
                    // mid is 0 when more than 24 devices are listed; guard the division.
                    if (blk >= 0 && mid > 0 && blk / mid > 0) return nullptr;  // assign to extra
                }
            }
            // NOTE(review): both branches return nullptr, so this callback never selects an
            // extra backend yet; the real block -> backend mapping still has to be wired in.
            return nullptr;
        };
        LTXV::g_pending_multi_backend_spec() = spec;
        LOG_INFO("DiT layer-split across %zu GPUs", devs.size());
    }
}
diffusion_model = std::make_shared<LTXV::LTXAVRunner>(backend_for(SDBackendModule::DIFFUSION),
params_backend_for(SDBackendModule::DIFFUSION),
tensor_storage_map,
"model.diffusion_model");

// === DiT Tensor Split across GPUs ===
// Parses the comma-separated "cudaN" device list and, when it names at least two distinct
// valid devices, installs a CUDA split buffer type on the diffusion model with an equal
// share per device. The first valid device in the list becomes the main device.
// NOTE(review): this runs whether or not dit_layer_split is set -- confirm that both
// split modes are meant to be active at the same time.
if (sd_ctx_params->dit_split_devices && strlen(sd_ctx_params->dit_split_devices) > 0) {
    std::string spec(sd_ctx_params->dit_split_devices);
    std::vector<int> split_devices;
    int main_device = -1;
    for (size_t pos = 0; pos < spec.size();) {
        auto comma        = spec.find(',', pos);
        std::string token = spec.substr(pos, comma == std::string::npos ? comma : comma - pos);
        pos               = (comma == std::string::npos) ? spec.size() : comma + 1;
        if (token.find("cuda") != 0) {
            continue;  // only "cuda<N>" tokens are recognised
        }

        // Parse the numeric suffix by hand: std::stoi would throw on a bare "cuda" and would
        // accept negative values, which must never be used as an index into `ts` below.
        const std::string digits = token.substr(4);
        int idx                  = (digits.empty() || digits.size() > 6) ? -1 : 0;
        for (size_t i = 0; idx >= 0 && i < digits.size(); ++i) {
            const char ch = digits[i];
            idx           = (ch >= '0' && ch <= '9') ? idx * 10 + (ch - '0') : -1;
        }
        if (idx < 0 || idx >= GGML_CUDA_MAX_DEVICES) {
            LOG_WARN("--dit-split: ignoring invalid device name '%s'", token.c_str());
            continue;
        }

        // A device listed twice must not count as two GPUs.
        bool already_listed = false;
        for (int d : split_devices) {
            if (d == idx) {
                already_listed = true;
                break;
            }
        }
        if (already_listed) {
            continue;
        }

        split_devices.push_back(idx);
        if (main_device < 0) main_device = idx;
    }
    if (split_devices.size() >= 2) {
        // Every index was range-checked above, so the writes into `ts` stay in bounds.
        float ts[GGML_CUDA_MAX_DEVICES] = {};
        float frac                      = 1.0f / (float)split_devices.size();
        for (int d : split_devices) ts[d] = frac;
        diffusion_model->set_dit_split_buft(ggml_backend_cuda_split_buffer_type(main_device, ts));
        LOG_INFO("DiT tensor split across GPUs: %s (%.0f%% each)",
                 sd_ctx_params->dit_split_devices, frac * 100.0f);
    } else {
        LOG_WARN("--dit-split needs >=2 GPUs, got '%s'", sd_ctx_params->dit_split_devices);
    }
}
} else if (sd_version_is_wan(version)) {
cond_stage_model = std::make_shared<T5CLIPEmbedder>(backend_for(SDBackendModule::TE),
params_backend_for(SDBackendModule::TE),
Expand Down