Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 19 additions & 14 deletions koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,10 +1136,11 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
showmultigpuwarning = False
print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
fsize *= total_parts
sdquantsavings = sdquanted
if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
mem -= 1024*1024*1024*(6 if sdquanted else 9)
mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6
elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25)
mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
Expand Down Expand Up @@ -1656,22 +1657,25 @@ def sd_convdirect_option(value):
return 'full'
raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.")

# Display labels for the image-model weight quantization levels.
# Each label starts with its numeric level digit, which sd_quant_option() parses back out.
sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']

def sd_quant_option(value):
    """Convert an image-quantization selection into its integer level.

    Accepts the forms this setting takes in the file:

    * a label from ``sd_quant_choices`` such as ``'1 (q8_0)'`` -- the
      leading character is parsed as the level;
    * an empty string or ``None`` -- treated as level 0 (off);
    * an integer level -- returned unchanged;
    * a legacy boolean flag -- ``True`` maps to 2 and ``False`` to 0,
      matching the legacy-config conversion done elsewhere in this file.

    Raises:
        ValueError: if a non-empty string does not start with a digit.
    """
    # bool is a subclass of int, so it must be checked first.
    if isinstance(value, bool):
        return 2 if value else 0
    # Integers cannot be subscripted; pass them straight through.
    if isinstance(value, int):
        return value
    # Tolerate surrounding whitespace and fall back to '0' when nothing is left.
    text = str(value or '0').strip() or '0'
    # Only the first character carries the level (labels look like '2 (q4_0)').
    return int(text[0])

def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
global args
inputs = sd_load_model_inputs()
inputs.model_filename = model_filename.encode("UTF-8")
thds = args.threads
quant = 0

if args.sdthreads and args.sdthreads > 0:
sdt = int(args.sdthreads)
if sdt > 0:
thds = sdt
if args.sdquant:
quant = 1

inputs.threads = thds
inputs.quant = quant
sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t
inputs.quant = sd_quant_types[args.sdquant]
inputs.flash_attention = args.sdflashattention
sdconvdirect = sd_convdirect_option(args.sdconvdirect)
inputs.diffusion_conv_direct = sdconvdirect == 'full'
Expand Down Expand Up @@ -4596,7 +4600,7 @@ def hide_tooltip(event):
sd_clamped_var = ctk.StringVar(value="0")
sd_clamped_soft_var = ctk.StringVar(value="0")
sd_threads_var = ctk.StringVar(value=str(default_threads))
sd_quant_var = ctk.IntVar(value=0)
sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])

whisper_model_var = ctk.StringVar()
tts_model_var = ctk.StringVar()
Expand Down Expand Up @@ -4939,7 +4943,7 @@ def gui_changed_modelfile(*args):
pass

def changed_gpulayers_estimate(*args):
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
Expand Down Expand Up @@ -5341,7 +5345,7 @@ def togglehorde(a,b,c):
makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250)
makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.")
sd_model_var.trace_add("write", gui_changed_modelfile)
makecheckbox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10,tooltiptxt="Quantizes the SD model weights to save memory. May degrade quality.")
makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
sd_quant_var.trace_add("write", changed_gpulayers_estimate)

makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
Expand Down Expand Up @@ -5626,8 +5630,7 @@ def export_vars():
args.sdclipg = sd_clipg_var.get()
if sd_photomaker_var.get() != "":
args.sdphotomaker = sd_photomaker_var.get()
if sd_quant_var.get()==1:
args.sdquant = True
args.sdquant = sd_quant_option(sd_quant_var.get())
if sd_lora_var.get() != "":
args.sdlora = sd_lora_var.get()
args.sdloramult = float(sd_loramult_var.get())
Expand Down Expand Up @@ -5837,7 +5840,7 @@ def import_vars(dict):
sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0)
sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads))
sd_quant_var.set(1 if ("sdquant" in dict and dict["sdquant"]) else 0)
sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)])
sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
Expand Down Expand Up @@ -6190,7 +6193,7 @@ def convert_invalid_args(args):
if dict["sdconfig"] and len(dict["sdconfig"]) > 2:
dict["sdthreads"] = int(dict["sdconfig"][2])
if dict["sdconfig"] and len(dict["sdconfig"]) > 3:
dict["sdquant"] = (True if dict["sdconfig"][3]=="quant" else False)
dict["sdquant"] = (2 if dict["sdconfig"][3]=="quant" else 0)
if "hordeconfig" in dict and dict["hordeconfig"] and dict["hordeconfig"][0]!="":
dict["hordemodelname"] = dict["hordeconfig"][0]
if len(dict["hordeconfig"]) > 1:
Expand All @@ -6216,6 +6219,8 @@ def convert_invalid_args(args):
dict["model_param"] = model_value[0] # Take the first file in the list
if "sdnotile" in dict and "sdtiledvae" not in dict:
dict["sdtiledvae"] = (0 if (dict["sdnotile"]) else default_vae_tile_threshold) # convert legacy option
if 'sdquant' in dict and type(dict['sdquant']) is bool:
dict['sdquant'] = 2 if dict['sdquant'] else 0
return args

def setuptunnel(global_memory, has_sd):
Expand Down Expand Up @@ -7648,7 +7653,7 @@ def range_checker(arg: str):
sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')
sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group()
sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true')
sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0)
sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="")
sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0)
sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
Expand Down
8 changes: 6 additions & 2 deletions otherarch/sdcpp/sdtype_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
{
printf("Conv2D Direct for VAE model is enabled\n");
}
if(inputs.quant)
if(inputs.quant > 0)
{
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
}
Expand Down Expand Up @@ -253,7 +253,11 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {

sd_params = new SDParams();
sd_params->model_path = inputs.model_filename;
sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
sd_params->wtype = SD_TYPE_COUNT;
if (inputs.quant > 0 && inputs.quant < SD_TYPE_COUNT) {
sd_params->wtype = (sd_type_t) inputs.quant;
printf("\nDiffusion Model quantized to %s", sd_type_name(sd_params->wtype));
}
sd_params->n_threads = inputs.threads; //if -1 use physical cores
sd_params->diffusion_flash_attn = inputs.flash_attention;
sd_params->diffusion_conv_direct = inputs.diffusion_conv_direct;
Expand Down