Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for APG (adaptive projected guidance) #593

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 72 additions & 18 deletions examples/cli/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,14 @@ struct SDParams {
int upscale_repeats = 1;

std::vector<int> skip_layers = {7, 8, 9};
float slg_scale = 0.;
float skip_layer_start = 0.01;
float skip_layer_end = 0.2;
float slg_scale = 0.0f;
float skip_layer_start = 0.01f;
float skip_layer_end = 0.2f;

float apg_eta = 1.0f;
float apg_momentum = 0.0f;
float apg_norm_threshold = 0.0f;
float apg_norm_smoothing = 0.0f;
};

void print_params(SDParams params) {
Expand Down Expand Up @@ -207,6 +212,11 @@ void print_usage(int argc, const char* argv[]) {
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --apg-eta VALUE parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n");
printf(" --apg-momentum VALUE CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n");
printf(" --apg-nt, --apg-rescale VALUE CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n");
printf(" --apg-nt-smoothing VALUE EXPERIMENTAL! Norm threshold smoothing for APG (default: 0 = disabled)\n");
printf(" (replaces saturation with a smooth approximation)\n");
printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n");
Expand Down Expand Up @@ -616,6 +626,30 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.skip_layer_end = std::stof(argv[i]);
} else if (arg == "--apg-eta") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.apg_eta = std::stof(argv[i]);
} else if (arg == "--apg-momentum") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.apg_momentum = std::stof(argv[i]);
} else if (arg == "--apg-nt" || arg == "--apg-rescale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.apg_norm_threshold = std::stof(argv[i]);
} else if (arg == "--apg-nt-smoothing") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.apg_norm_smoothing = std::stof(argv[i]);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
Expand Down Expand Up @@ -706,6 +740,18 @@ std::string get_image_params(SDParams params, int64_t seed) {
}
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
// Record APG settings in the image metadata only when they differ from their
// defaults (eta = 1, momentum = 0, norm threshold = 0, smoothing = 0).
if (params.apg_eta != 1) {
    parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", ";
}
if (params.apg_momentum != 0) {
    parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", ";
}
if (params.apg_norm_threshold != 0) {
    parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", ";
    // Smoothing is reported only together with a non-zero threshold. It gets its
    // own label so the two values can be told apart when the metadata is read back.
    if (params.apg_norm_smoothing != 0) {
        parameter_string += "CFG normalization smoothing: " + std::to_string(params.apg_norm_smoothing) + ", ";
    }
}
if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
// Report the skip-layer-guidance scale itself (not the CFG scale) under the SLG label.
parameter_string += "SLG scale: " + std::to_string(params.slg_scale) + ", ";
parameter_string += "Skip layers: [";
Expand Down Expand Up @@ -948,11 +994,15 @@ int main(int argc, const char* argv[]) {
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
sd_slg_params_t{params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end},
sd_apg_params_t{params.apg_eta,
params.apg_momentum,
params.apg_norm_threshold,
params.apg_norm_smoothing});
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
Expand Down Expand Up @@ -1016,11 +1066,15 @@ int main(int argc, const char* argv[]) {
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str(),
params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end);
sd_slg_params_t{params.skip_layers.data(),
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
params.skip_layer_end},
sd_apg_params_t{params.apg_eta,
params.apg_momentum,
params.apg_norm_threshold,
params.apg_norm_smoothing});
}
}

Expand Down Expand Up @@ -1059,19 +1113,19 @@ int main(int argc, const char* argv[]) {

std::string dummy_name, ext, lc_ext;
bool is_jpg;
size_t last = params.output_path.find_last_of(".");
size_t last = params.output_path.find_last_of(".");
size_t last_path = std::min(params.output_path.find_last_of("/"),
params.output_path.find_last_of("\\"));
if (last != std::string::npos // filename has extension
&& (last_path == std::string::npos || last > last_path)) {
if (last != std::string::npos // filename has extension
&& (last_path == std::string::npos || last > last_path)) {
dummy_name = params.output_path.substr(0, last);
ext = lc_ext = params.output_path.substr(last);
std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe";
} else {
dummy_name = params.output_path;
ext = lc_ext = "";
is_jpg = false;
is_jpg = false;
}
// appending ".png" to absent or unknown extension
if (!is_jpg && lc_ext != ".png") {
Expand All @@ -1083,7 +1137,7 @@ int main(int argc, const char* argv[]) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
if(is_jpg) {
if (is_jpg) {
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 90, get_image_params(params, params.seed + i).c_str());
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
Expand Down
121 changes: 82 additions & 39 deletions stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -796,11 +796,11 @@ class StableDiffusionGGML {
const std::vector<float>& sigmas,
int start_merge_step,
SDCondition id_cond,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* noise_mask = nullptr) {
sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
sd_apg_params_t apg_params = {1, 0, 0, 0},
ggml_tensor* noise_mask = nullptr) {
std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);

LOG_DEBUG("Sample");
struct ggml_init_params params;
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
Expand All @@ -823,7 +823,7 @@ class StableDiffusionGGML {
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);

bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
bool has_skiplayer = slg_params.scale != 0.0 && skip_layers.size() > 0;

// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
Expand All @@ -843,6 +843,10 @@ class StableDiffusionGGML {
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);

std::vector<float> apg_momentum_buffer;
if (apg_params.momentum != 0)
apg_momentum_buffer.resize((size_t)ggml_nelements(denoised));

auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
if (step == 1) {
pretty_progress(0, (int)steps, 0);
Expand Down Expand Up @@ -923,7 +927,7 @@ class StableDiffusionGGML {
}

int step_count = sigmas.size();
bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count);
float* skip_layer_data = NULL;
if (is_skiplayer_step) {
LOG_DEBUG("Skipping layers at step %d\n", step);
Expand All @@ -947,6 +951,58 @@ class StableDiffusionGGML {
float* vec_input = (float*)input->data;
float* positive_data = (float*)out_cond->data;
int ne_elements = (int)ggml_nelements(denoised);

// Adaptive projected guidance (APG): https://arxiv.org/pdf/2410.02416
// Computes the per-element guidance direction `deltas` that the final blend
// below uses in place of the raw (cond - uncond) difference.
// NOTE(review): deltas aliases vec_denoised, which is declared above this hunk;
// confirm it is safe to use as scratch space before the final write.
float* deltas = vec_denoised;

float apg_scale_factor = 1.;
float diff_norm        = 0;
float cond_norm_sq     = 0;
float dot              = 0;
if (has_unconditioned) {
    const bool use_momentum   = apg_params.momentum != 0;
    const bool use_norm_limit = apg_params.norm_treshold > 0;
    const bool use_projection = apg_params.eta != 1.0f;

    // Pass 1: build the (optionally momentum-accumulated) update direction and
    // gather the reductions needed for rescaling and projection.
    for (int i = 0; i < ne_elements; i++) {
        float delta = positive_data[i] - negative_data[i];
        if (use_momentum) {
            // Running accumulation across steps: delta_t = diff_t + momentum * delta_{t-1}
            delta += apg_params.momentum * apg_momentum_buffer[i];
            apg_momentum_buffer[i] = delta;
        }
        if (use_norm_limit) {
            diff_norm += delta * delta;
        }
        if (use_projection) {
            cond_norm_sq += positive_data[i] * positive_data[i];
            dot += positive_data[i] * delta;
        }
        deltas[i] = delta;
    }

    if (use_norm_limit) {
        diff_norm = sqrtf(diff_norm);
        // A zero-length update needs no rescaling. Skipping it also avoids the
        // inf / inf = NaN that the smoothing formula would otherwise produce.
        if (diff_norm > 0.0f) {
            const float ratio = apg_params.norm_treshold / diff_norm;
            if (apg_params.norm_treshold_smoothing <= 0) {
                // Hard saturation: only ever shrink the update, never amplify it.
                apg_scale_factor = std::min(1.0f, ratio);
            } else {
                // Experimental: smooth saturate
                apg_scale_factor = ratio / std::pow(1 + std::pow(ratio, 1.0 / apg_params.norm_treshold_smoothing), apg_params.norm_treshold_smoothing);
            }
        }
    }

    if (use_projection) {
        if (cond_norm_sq > 0.0f) {
            dot *= apg_scale_factor;
            // pre-normalize (avoids one square root and ne_elements extra divs)
            dot /= cond_norm_sq;
        } else {
            // The conditional prediction is all zeros, so the parallel component
            // is zero too; dividing would give 0 / 0 = NaN.
            dot = 0.0f;
        }
    }

    // Pass 2: apply the rescale, then reweight the component parallel to the
    // conditional prediction by eta while keeping the orthogonal component.
    for (int i = 0; i < ne_elements; i++) {
        deltas[i] *= apg_scale_factor;
        if (use_projection) {
            const float apg_parallel   = dot * positive_data[i];
            const float apg_orthogonal = deltas[i] - apg_parallel;

            // tweak deltas
            deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
        }
    }
}

for (int i = 0; i < ne_elements; i++) {
float latent_result = positive_data[i];
if (has_unconditioned) {
Expand All @@ -956,11 +1012,13 @@ class StableDiffusionGGML {
int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
} else {
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
float delta = deltas[i];

latent_result = positive_data[i] + (cfg_scale - 1) * delta;
}
}
if (is_skiplayer_step) {
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale;
}
// v = latent_result, eps = latent_result
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
Expand Down Expand Up @@ -1000,7 +1058,8 @@ class StableDiffusionGGML {
}

// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
ggml_tensor*
get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
Expand Down Expand Up @@ -1204,11 +1263,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
float style_ratio,
bool normalize_input,
std::string input_id_images_path,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
ggml_tensor* masked_image = NULL) {
sd_slg_params_t slg_params,
sd_apg_params_t apg_params,
ggml_tensor* masked_image = NULL) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
Expand Down Expand Up @@ -1460,10 +1517,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
sigmas,
start_merge_step,
id_cond,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end,
slg_params,
apg_params,
noise_mask);

// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
Expand Down Expand Up @@ -1532,12 +1587,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
sd_slg_params_t slg_params,
sd_apg_params_t apg_params) {
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
Expand Down Expand Up @@ -1610,10 +1661,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
style_ratio,
normalize_input,
input_id_images_path_c_str,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
slg_params,
apg_params);

size_t t1 = ggml_time_ms();

Expand Down Expand Up @@ -1642,12 +1691,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
float style_ratio,
bool normalize_input,
const char* input_id_images_path_c_str,
int* skip_layers = NULL,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
sd_slg_params_t slg_params,
sd_apg_params_t apg_params) {
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
return NULL;
Expand Down Expand Up @@ -1788,10 +1833,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
style_ratio,
normalize_input,
input_id_images_path_c_str,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end,
slg_params,
apg_params,
masked_image);

size_t t2 = ggml_time_ms();
Expand Down
Loading
Loading