Skip to content

Commit 60948d7

Browse files
authored
Merge branch 'leejet:master' into master
2 parents ccf5aa3 + fd693ac commit 60948d7

33 files changed

+1715
-1246
lines changed

‎.github/workflows/build.yml‎

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,15 +254,15 @@ jobs:
254254
255255
- name: Copy and pack Cuda runtime
256256
id: pack_cuda_runtime
257-
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
257+
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
258258
run: |
259259
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
260260
$dst='.\build\bin\cudart\'
261261
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
262262
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
263263
264264
- name: Upload Cuda runtime
265-
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
265+
if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
266266
uses: actions/upload-artifact@v4
267267
with:
268268
name: sd-cudart-sd-bin-win-cu12-x64.zip
@@ -288,6 +288,11 @@ jobs:
288288
- windows-latest-cmake
289289

290290
steps:
291+
- name: Clone
292+
uses: actions/checkout@v3
293+
with:
294+
fetch-depth: 0
295+
291296
- name: Download artifacts
292297
id: download-artifact
293298
uses: actions/download-artifact@v4
@@ -296,20 +301,27 @@ jobs:
296301
pattern: sd-*
297302
merge-multiple: true
298303

304+
- name: Get commit count
305+
id: commit_count
306+
run: |
307+
echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
308+
299309
- name: Get commit hash
300310
id: commit
301311
uses: pr-mpt/actions-commit-hash@v2
302312

303313
- name: Create release
304314
id: create_release
315+
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
305316
uses: anzz1/action-create-release@v1
306317
env:
307318
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
308319
with:
309-
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
320+
tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
310321

311322
- name: Upload release
312323
id: upload_release
324+
if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
313325
uses: actions/github-script@v3
314326
with:
315327
github-token: ${{secrets.GITHUB_TOKEN}}

‎Dockerfile.sycl‎

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
ARG SYCL_VERSION=2025.1.0-0
2+
3+
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
4+
5+
RUN apt-get update && apt-get install -y cmake
6+
7+
WORKDIR /sd.cpp
8+
9+
COPY . .
10+
11+
RUN mkdir build && cd build && \
12+
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
13+
cmake --build . --config Release -j$(nproc)
14+
15+
FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
16+
17+
COPY --from=build /sd.cpp/build/bin/sd /sd
18+
19+
ENTRYPOINT [ "/sd" ]

‎README.md‎

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,6 @@ API and command-line option may change frequently.***
6060
- Windows
6161
- Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
6262

63-
### TODO
64-
65-
- [ ] More sampling methods
66-
- [ ] Make inference faster
67-
- The current implementation of ggml_conv_2d is slow and has high memory usage
68-
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
69-
- [ ] Implement Inpainting support
70-
7163
## Usage
7264

7365
For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
@@ -307,9 +299,6 @@ arguments:
307299
--taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
308300
--control-net [CONTROL_PATH] path to control net model
309301
--embd-dir [EMBEDDING_PATH] path to embeddings
310-
--stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
311-
--input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
312-
--normalize-input normalize PHOTOMAKER input id images
313302
--upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
314303
--upscale-repeats Run the ESRGAN upscaler this many times (default 1)
315304
--type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
@@ -321,6 +310,9 @@ arguments:
321310
-i, --end-img [IMAGE] path to the end image, required by flf2v
322311
--control-image [IMAGE] path to image condition, control net
323312
-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
313+
--control-video [PATH] path to control video frames. It must be a directory path.
314+
The video frames inside should be stored as images in lexicographical (character) order
315+
For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, … etc.
324316
--increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
325317
-o, --output OUTPUT path to write result image to (default: ./output.png)
326318
-p, --prompt [PROMPT] the prompt to render
@@ -334,9 +326,10 @@ arguments:
334326
--skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
335327
--skip-layer-start START SLG enabling point: (default: 0.01)
336328
--skip-layer-end END SLG disabling point: (default: 0.2)
337-
--scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
329+
--scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
338330
--sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
339-
sampling method (default: "euler_a")
331+
sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
332+
--timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
340333
--steps STEPS number of sample steps (default: 20)
341334
--high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
342335
--high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
@@ -347,23 +340,25 @@ arguments:
347340
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
348341
--high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
349342
--high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
350-
--high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
343+
--high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
351344
--high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
352345
(high noise) sampling method (default: "euler_a")
353346
--high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
354347
SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
355348
--strength STRENGTH strength for noising/unnoising (default: 0.75)
356-
--style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
357349
--control-strength STRENGTH strength to apply Control Net (default: 0.9)
358350
1.0 corresponds to full destruction of information in init image
359351
-H, --height H image height, in pixel space (default: 512)
360352
-W, --width W image width, in pixel space (default: 512)
361353
--rng {std_default, cuda} RNG (default: cuda)
362354
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
363355
-b, --batch-count COUNT number of images to generate
364-
--clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
356+
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
365357
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
366358
--vae-tiling process vae in tiles to reduce memory usage
359+
--vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
360+
--vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
361+
--vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
367362
--vae-on-cpu keep vae in cpu (for low vram)
368363
--clip-on-cpu keep clip in cpu (for low vram)
369364
--diffusion-fa use flash attention in the diffusion model (for low vram)
@@ -384,6 +379,11 @@ arguments:
384379
--moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
385380
only enabled if `--high-noise-steps` is set to -1
386381
--flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
382+
--vace-strength wan vace strength
383+
--photo-maker path to PHOTOMAKER model
384+
--pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir
385+
--pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
386+
--pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
387387
-v, --verbose print extra info
388388
```
389389
@@ -393,9 +393,9 @@ arguments:
393393
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
394394
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
395395
# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
396-
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
397-
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
398-
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
396+
# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
397+
# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
398+
# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
399399
```
400400
401401
Using formats of different precisions will yield results of varying quality.
158 KB
Binary file not shown.
297 KB
Binary file not shown.
287 KB
Binary file not shown.

‎assets/wan/Wan2.1_14B_vace_r2v.mp4‎

152 KB
Binary file not shown.

‎assets/wan/Wan2.1_14B_vace_t2v.mp4‎

176 KB
Binary file not shown.

‎assets/wan/Wan2.1_14B_vace_v2v.mp4‎

347 KB
Binary file not shown.

‎clip.hpp‎

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock {
548548
int64_t embed_dim;
549549
int64_t vocab_size;
550550
int64_t num_positions;
551+
bool force_clip_f32;
551552

552553
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
553-
enum ggml_type token_wtype = GGML_TYPE_F32;
554+
enum ggml_type token_wtype = GGML_TYPE_F32;
555+
if (!force_clip_f32) {
556+
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
557+
if (tensor_type != tensor_types.end())
558+
token_wtype = tensor_type->second;
559+
}
554560
enum ggml_type position_wtype = GGML_TYPE_F32;
555561

556562
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock {
560566
public:
561567
CLIPEmbeddings(int64_t embed_dim,
562568
int64_t vocab_size = 49408,
563-
int64_t num_positions = 77)
569+
int64_t num_positions = 77,
570+
bool force_clip_f32 = false)
564571
: embed_dim(embed_dim),
565572
vocab_size(vocab_size),
566-
num_positions(num_positions) {
573+
num_positions(num_positions),
574+
force_clip_f32(force_clip_f32) {
567575
}
568576

569577
struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ class CLIPTextModel : public GGMLBlock {
678686
int32_t n_head = 12;
679687
int32_t n_layer = 12; // num_hidden_layers
680688
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
681-
int32_t clip_skip = -1;
682689
bool with_final_ln = true;
683690

684691
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
685692
bool with_final_ln = true,
686-
int clip_skip_value = -1)
693+
bool force_clip_f32 = false)
687694
: version(version), with_final_ln(with_final_ln) {
688695
if (version == OPEN_CLIP_VIT_H_14) {
689696
hidden_size = 1024;
@@ -696,20 +703,12 @@ class CLIPTextModel : public GGMLBlock {
696703
n_head = 20;
697704
n_layer = 32;
698705
}
699-
set_clip_skip(clip_skip_value);
700706

701-
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
707+
blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
702708
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
703709
blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
704710
}
705711

706-
void set_clip_skip(int skip) {
707-
if (skip <= 0) {
708-
skip = -1;
709-
}
710-
clip_skip = skip;
711-
}
712-
713712
struct ggml_tensor* get_token_embed_weight() {
714713
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
715714
return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ class CLIPTextModel : public GGMLBlock {
720719
struct ggml_tensor* input_ids,
721720
struct ggml_tensor* tkn_embeddings,
722721
size_t max_token_idx = 0,
723-
bool return_pooled = false) {
722+
bool return_pooled = false,
723+
int clip_skip = -1) {
724724
// input_ids: [N, n_token]
725725
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
726726
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,19 +889,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
889889
const std::string prefix,
890890
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
891891
bool with_final_ln = true,
892-
int clip_skip_value = -1)
893-
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
892+
bool force_clip_f32 = false)
893+
: GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
894894
model.init(params_ctx, tensor_types, prefix);
895895
}
896896

897897
std::string get_desc() {
898898
return "clip";
899899
}
900900

901-
void set_clip_skip(int clip_skip) {
902-
model.set_clip_skip(clip_skip);
903-
}
904-
905901
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
906902
model.get_param_tensors(tensors, prefix);
907903
}
@@ -911,22 +907,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
911907
struct ggml_tensor* input_ids,
912908
struct ggml_tensor* embeddings,
913909
size_t max_token_idx = 0,
914-
bool return_pooled = false) {
910+
bool return_pooled = false,
911+
int clip_skip = -1) {
915912
size_t N = input_ids->ne[1];
916913
size_t n_token = input_ids->ne[0];
917914
if (input_ids->ne[0] > model.n_token) {
918915
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
919916
input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
920917
}
921918

922-
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
919+
return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
923920
}
924921

925922
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
926923
int num_custom_embeddings = 0,
927924
void* custom_embeddings_data = NULL,
928925
size_t max_token_idx = 0,
929-
bool return_pooled = false) {
926+
bool return_pooled = false,
927+
int clip_skip = -1) {
930928
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
931929

932930
input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
945943
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
946944
}
947945

948-
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
946+
struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
949947

950948
ggml_build_forward_expand(gf, hidden_states);
951949

@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
958956
void* custom_embeddings_data,
959957
size_t max_token_idx,
960958
bool return_pooled,
959+
int clip_skip,
961960
ggml_tensor** output,
962961
ggml_context* output_ctx = NULL) {
963962
auto get_graph = [&]() -> struct ggml_cgraph* {
964-
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
963+
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
965964
};
966965
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
967966
}

0 commit comments

Comments
 (0)