Merge pull request #716 from bghira/main
final flux updates for release
bghira authored Aug 11, 2024
2 parents f50f929 + 80a574f commit 6e6385c
Showing 6 changed files with 83 additions and 212 deletions.
81 changes: 33 additions & 48 deletions OPTIONS.md
@@ -242,7 +242,9 @@ usage: train.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--soft_min_snr_sigma_data SOFT_MIN_SNR_SIGMA_DATA]
[--model_type {full,lora,deepfloyd-full,deepfloyd-lora,deepfloyd-stage2,deepfloyd-stage2-lora}]
[--legacy] [--kolors] [--flux]
[--flux_lora_target {mmdit,all}] [--flux_fast_schedule]
[--flux_lora_target {mmdit,context,all,all+ffs}]
[--flow_matching_sigmoid_scale FLOW_MATCHING_SIGMOID_SCALE]
[--flux_fast_schedule]
[--flux_guidance_mode {constant,random-range}]
[--flux_guidance_value FLUX_GUIDANCE_VALUE]
[--flux_guidance_min FLUX_GUIDANCE_MIN]
@@ -251,10 +253,8 @@ usage: train.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--flow_matching_loss {diffusers,compatible,diffusion}]
[--pixart_sigma] [--sd3]
[--sd3_t5_mask_behaviour {do-nothing,mask}]
[--weighting_scheme {sigma_sqrt,logit_normal,mode,cosmap,none}]
[--logit_mean LOGIT_MEAN] [--logit_std LOGIT_STD]
[--mode_scale MODE_SCALE] [--lora_type {Standard}]
[--lora_init_type {default,gaussian,loftq}]
[--lora_type {Standard}]
[--lora_init_type {default,gaussian,loftq,olora,pissa}]
[--lora_rank LORA_RANK] [--lora_alpha LORA_ALPHA]
[--lora_dropout LORA_DROPOUT] [--controlnet]
[--controlnet_model_name_or_path]
@@ -340,6 +340,7 @@ usage: train.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--adam_epsilon ADAM_EPSILON] [--adam_bfloat16]
[--max_grad_norm MAX_GRAD_NORM] [--push_to_hub]
[--push_checkpoints_to_hub] [--hub_model_id HUB_MODEL_ID]
[--model_card_note MODEL_CARD_NOTE]
[--logging_dir LOGGING_DIR]
[--validation_seed_source {gpu,cpu}]
[--validation_torch_compile VALIDATION_TORCH_COMPILE]
@@ -373,8 +374,8 @@ usage: train.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--noise_offset_probability NOISE_OFFSET_PROBABILITY]
[--validation_guidance VALIDATION_GUIDANCE]
[--validation_guidance_real VALIDATION_GUIDANCE_REAL]
[--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE]
[--validation_no_cfg_until_timestep VALIDATION_NO_CFG_UNTIL_TIMESTEP]
[--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE]
[--validation_randomize] [--validation_seed VALIDATION_SEED]
[--fully_unload_text_encoder]
[--freeze_encoder_before FREEZE_ENCODER_BEFORE]
@@ -420,14 +421,19 @@ options:
model.
--flux This option must be provided when training a Flux
model.
--flux_lora_target {mmdit,all}
Flux has single and joint attention blocks. The single
attention blocks deal with text inputs and are not
transformed by LoRA by default. All attention blocks
are trained by default. If 'mmdit' is provided, the
text input layers will not be trained. This is roughly
equivalent to not training the text encoder(s) in
earlier models.
--flux_lora_target {mmdit,context,all,all+ffs}
Flux has single and joint attention blocks. Only the
multimodal 'dual stream' attention blocks are trained
by default. If 'mmdit' is provided, the text input
layers will not be trained. If 'context' is provided,
the mmdit layers will not be trained. If 'all' is
provided, all layers will be trained, minus feed-
forward and norms. If 'all+ffs' is provided, all
layers will be trained including feed-forward and
norms.
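As a purely illustrative sketch of how these target choices might map onto LoRA module selections (the module-name patterns below are assumptions for illustration, not SimpleTuner's actual lists):

```python
# Hypothetical mapping from --flux_lora_target to module-name patterns a LoRA
# adapter could attach to; the authoritative lists live in the trainer itself.
FLUX_LORA_TARGET_PATTERNS = {
    "mmdit": ["to_q", "to_k", "to_v", "to_out"],             # dual-stream image attention only
    "context": ["add_q_proj", "add_k_proj", "add_v_proj",
                "to_add_out"],                                # text/context projections only
    "all": ["to_q", "to_k", "to_v", "to_out", "add_q_proj",
            "add_k_proj", "add_v_proj", "to_add_out"],        # all attention, minus feed-forward and norms
    "all+ffs": ["to_q", "to_k", "to_v", "to_out", "add_q_proj",
                "add_k_proj", "add_v_proj", "to_add_out",
                "ff.net", "norm"],                            # attention plus feed-forward and norms
}
```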
--flow_matching_sigmoid_scale FLOW_MATCHING_SIGMOID_SCALE
Scale factor for sigmoid timestep sampling for flow-
matching models.
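A minimal sketch of what sigmoid timestep sampling with this scale factor could look like (the helper name and the exact mapping onto discrete timesteps are assumptions, not the trainer's code):

```python
import torch

def sample_sigmoid_timesteps(batch_size, num_train_timesteps=1000, sigmoid_scale=1.0):
    # Squash standard-normal draws through a scaled sigmoid: smaller scales
    # concentrate samples near the middle of the schedule, larger scales push
    # them toward the ends.
    u = torch.sigmoid(sigmoid_scale * torch.randn(batch_size))
    # Map the continuous [0, 1) values onto discrete training timesteps.
    return (u * num_train_timesteps).long().clamp(0, num_train_timesteps - 1)
```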
--flux_fast_schedule An experimental feature to train Flux.1S using a noise
schedule closer to what it was trained with, which has
improved results in short experiments. Thanks to
@@ -442,7 +448,10 @@ options:
and --flux_guidance_max.
--flux_guidance_value FLUX_GUIDANCE_VALUE
When using --flux_guidance_mode=constant, this value
will be used for every input sample.
will be used for every input sample. Using a value of
1.0 seems to preserve the CFG distillation for the Dev
model, while any other value will produce a LoRA that
requires CFG at inference time.
--flux_guidance_min FLUX_GUIDANCE_MIN
--flux_guidance_max FLUX_GUIDANCE_MAX
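As a rough illustration of the two guidance modes above, a hedged sketch (the trainer's actual sampling code may differ):

```python
import torch

def sample_guidance_values(batch_size, mode="constant", value=1.0, low=1.0, high=4.0):
    # 'constant' feeds the same guidance value to every sample in the batch;
    # 'random-range' draws a uniform value per sample between the min and max.
    if mode == "constant":
        return torch.full((batch_size,), value)
    if mode == "random-range":
        return torch.empty(batch_size).uniform_(low, high)
    raise ValueError(f"unknown guidance mode: {mode}")
```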
--smoldit Use the experimental SmolDiT model architecture.
@@ -474,32 +483,12 @@ options:
prevents expansion of SD3 Medium's prompt length, as
it will unnecessarily attend to every token in the
prompt embed, even masked positions.
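A toy sketch of what the 'mask' behaviour describes - zeroing the embedding positions that the tokenizer's attention mask marks as padding (assumed tensor shapes; not the trainer's actual implementation):

```python
import torch

def mask_t5_prompt_embeds(prompt_embeds, attention_mask):
    # prompt_embeds: (batch, seq_len, dim); attention_mask: (batch, seq_len) of 0/1.
    # Zeroing padded positions keeps the model from attending to meaningless tokens.
    return prompt_embeds * attention_mask.unsqueeze(-1).to(prompt_embeds.dtype)
```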
--weighting_scheme {sigma_sqrt,logit_normal,mode,cosmap,none}
Stable Diffusion 3 used either uniform sampling of
timesteps with post-prediction loss weighting, or a
weighted timestep selection by mode or log-normal
distribution. The default for SD3 is logit_normal,
though upstream Diffusers training examples use
sigma_sqrt. The mode option is experimental, as it is
the most difficult to implement cleanly. In
experiments, logit_normal produced the best results
for large-scale finetuning across many nodes. For
small scale tuning, 'none' returns the best results.
The default is 'none'.
--logit_mean LOGIT_MEAN
As outlined in the Stable Diffusion 3 paper, using a
logit_mean of -0.5 produced the highest quality FID
results. The default here is 0.0.
--logit_std LOGIT_STD
Stable Diffusion 3-specific training parameters.
--mode_scale MODE_SCALE
Stable Diffusion 3-specific training parameters.
--lora_type {Standard}
When training using --model_type=lora, you may specify
a different type of LoRA to train here. Currently,
only 'Standard' type is supported. This option exists
for compatibility with Kohya configuration files.
--lora_init_type {default,gaussian,loftq}
--lora_init_type {default,gaussian,loftq,olora,pissa}
The initialization type for the LoRA model. 'default'
will use Microsoft's initialization method, 'gaussian'
will use a Gaussian scaled distribution, and 'loftq'
@@ -1006,6 +995,9 @@ options:
--hub_model_id HUB_MODEL_ID
The name of the repository to keep in sync with the
local `output_dir`.
--model_card_note MODEL_CARD_NOTE
Add a string to the top of your model card to provide
users with some additional context.
--logging_dir LOGGING_DIR
[TensorBoard](https://www.tensorflow.org/tensorboard)
log directory. Will default to
@@ -1106,12 +1098,8 @@ options:
validations with a single prompt on slower systems, or
if you are not interested in unconditional space
generations.
--disable_compel If provided, validation pipeline prompts will be
handled using the typical prompt encoding strategy.
Otherwise, the default behaviour is to use Compel for
prompt embed generation. Note that the training input
text embeds are not generated using Compel, and will
be truncated to 77 tokens.
--disable_compel This option does nothing. It is deprecated and will be
removed in a future release.
--enable_watermark The SDXL 0.9 and 1.0 licenses both require a watermark
be used to identify any images created to be shared.
Since the images created during validation typically
@@ -1195,14 +1183,11 @@ options:
--validation_guidance VALIDATION_GUIDANCE
CFG value for validation images. Default: 7.5
--validation_guidance_real VALIDATION_GUIDANCE_REAL
For flux, for any >1.0 value the validation will use
classifier free guidance instead of the distilled
sampling.
Use real CFG sampling for Flux validation images.
Default: 1.0
--validation_no_cfg_until_timestep VALIDATION_NO_CFG_UNTIL_TIMESTEP
When using real CFG with flux, do not use CFG until this
sampling timestep.
Default: 2
When using real CFG sampling for Flux validation
images, do not apply CFG until this sampling timestep.
Default: 2
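A rough sketch of the intent behind these two validation options inside a sampling loop (purely illustrative; not the project's validation code):

```python
def use_cfg_at_step(step_index, guidance_real=1.0, no_cfg_until_timestep=2):
    # Real CFG needs a second, unconditional forward pass per step; skip it
    # entirely when guidance_real is 1.0, and for the first few steps otherwise.
    return guidance_real > 1.0 and step_index >= no_cfg_until_timestep
```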
--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE
CFG rescale value for validation images. Default: 0.0,
max 1.0
4 changes: 3 additions & 1 deletion README.md
@@ -60,7 +60,9 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP

Preliminary training support for Flux.1 is included:

- Low loss training using SD3 style loss calculations
- Low-loss training using an optimised approach
- Preserve the Dev model's distillation qualities
- Or, reintroduce CFG to the model and improve its creativity at the cost of inference speed
- LoRA or full tuning via DeepSpeed ZeRO
- ControlNet training is not yet supported
- Train either Schnell or Dev models
17 changes: 13 additions & 4 deletions documentation/quickstart/FLUX.md
@@ -312,8 +312,17 @@ In ComfyUI, you'll need to put Flux through another node called AdaptiveGuider.
### Classifier-free guidance

#### Problem

The Dev model arrives guidance-distilled out of the box, which means it does a very straight shot trajectory to the teacher model outputs - this isn't as extreme as what was done to the Schnell model, but it noticeably impacts training by re-introducing the classifier-free guidance objective into the model. Interestingly, this occurs whether caption dropout is set to 0.0 (disabled) or 0.1 (default).
The Dev model arrives guidance-distilled out of the box, which means it takes a very straight-shot trajectory to the teacher model's outputs. This is done through a guidance vector that is fed into the model at training and inference time - the value of this vector greatly impacts what kind of LoRA you end up with:
- A value of 1.0 will preserve the initial distillation done to the Dev model
- This is the most compatible mode
- Inference is just as fast as the original model
- Flow-matching distillation reduces the creativity and output variability of the model, as with the original Flux Dev model (everything keeps the same composition/look)
- A higher value (tested around 3.5-4.5) will reintroduce the CFG objective into the model
- This requires the inference pipeline to have support for CFG
- Inference is roughly 50% slower with no VRAM increase, **or** about 20% slower with a 20% VRAM increase when batched CFG inference is used
- However, this style of training improves creativity and model output variability, which might be required for certain training tasks

It's not yet clear whether the distillation can be restored to a de-distilled model by continuing to tune it with a guidance vector value of 1.0.

#### Solution
The solution for this is already enabled in the main branch; it is necessary to enable true CFG sampling at inference time when using LoRAs on Dev.
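For a LoRA trained with a guidance value of 1.0, ordinary distilled inference still works; the snippet below is a hedged sketch using diffusers' `FluxPipeline` with a placeholder LoRA path. A LoRA trained with higher guidance values instead needs a pipeline that supports true CFG sampling (a negative prompt and a second, unconditional pass).

```python
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Placeholder path to a LoRA trained with --flux_guidance_value=1.0.
pipe.load_lora_weights("path/to/your/flux-lora")

# guidance_scale here is the distilled guidance vector, not classifier-free guidance.
image = pipe(
    "a photograph of a lighthouse at dusk",
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("validation.png")
```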
@@ -417,6 +426,6 @@ export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --base_model_default_dtype=bf16

The users of [Terminus Research](https://huggingface.co/terminusresearch), who probably worked on this more than their day jobs to figure it out

Lambda Labs for generous compute allocations that were used for tests and verifications for large scale training runs
[Lambda Labs](https://lambdalabs.com) for generous compute allocations that were used for tests and verifications for large scale training runs

Especially [@JimmyCarter](https://huggingface.co/jimmycarter) and [kaibioinfo](https://github.com/kaibioinfo) for coming up with some of the best ideas and putting them into action, offering pull requests and running exhaustive tests for analysis - even daring to use _their own faces_ for DreamBooth experimentation.
Especially [@JimmyCarter](https://huggingface.co/jimmycarter) and [@kaibioinfo](https://github.com/kaibioinfo) for coming up with some of the best ideas and putting them into action, offering pull requests and running exhaustive tests for analysis - even daring to use _their own faces_ for DreamBooth experimentation.
61 changes: 6 additions & 55 deletions helpers/arguments.py
@@ -119,10 +119,10 @@ def parse_args(input_args=None):
),
)
parser.add_argument(
"--flux_sigmoid_scale",
"--flow_matching_sigmoid_scale",
type=float,
default=1.0,
help='Scale factor for sigmoid timestep sampling (only used when timestep_scheme is "flux").',
help="Scale factor for sigmoid timestep sampling for flow-matching models..",
)
parser.add_argument(
"--flux_fast_schedule",
@@ -147,9 +147,11 @@
parser.add_argument(
"--flux_guidance_value",
type=float,
default=4.0,
default=1.0,
help=(
"When using --flux_guidance_mode=constant, this value will be used for every input sample."
" Using a value of 1.0 seems to preserve the CFG distillation for the Dev model,"
" and using any other value will result in the resulting LoRA requiring CFG at inference time."
),
)
parser.add_argument(
@@ -189,17 +191,6 @@
" Additionally, 'diffusion' is offered as an option to reparameterise a model to v_prediction loss."
),
)
parser.add_argument(
"--timestep_scheme",
type=str,
choices=["sd3", "flux"],
default=None,
help=(
"When training flow-matching models like SD3 or Flux, we can select timesteps based on an approximated continuous schedule"
" that takes the 1000 timesteps and derives pseudo-sigmas from them. This is the default behaviour."
" Flux training seems to benefit from a sigma schedule, and is recommended to use the 'flux' option."
),
)
parser.add_argument(
"--pixart_sigma",
action="store_true",
@@ -228,40 +219,6 @@
" even masked positions."
),
)
parser.add_argument(
"--weighting_scheme",
type=str,
default="cosmap",
choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "none"],
help=(
"Stable Diffusion 3 used either uniform sampling of timesteps with post-prediction loss weighting, or"
" a weighted timestep selection by mode or log-normal distribution. The default for SD3 is logit_normal, though"
" upstream Diffusers training examples use sigma_sqrt. The mode option is experimental,"
" as it is the most difficult to implement cleanly. In experiments, logit_normal produced the best results"
" for large-scale finetuning across many nodes. For small scale tuning, 'none' returns the best results."
" The default is 'none'."
),
)
parser.add_argument(
"--logit_mean",
type=float,
default=0.0,
help=(
"As outlined in the Stable Diffusion 3 paper, using a logit_mean of -0.5 produced the highest quality FID results. The default here is 0.0."
),
)
parser.add_argument(
"--logit_std",
type=float,
default=1.0,
help=("Stable Diffusion 3-specific training parameters."),
)
parser.add_argument(
"--mode_scale",
type=float,
default=1.29,
help=("Stable Diffusion 3-specific training parameters."),
)
parser.add_argument(
"--lora_type",
type=str,
@@ -1523,7 +1480,7 @@ def parse_args(input_args=None):
"--validation_guidance_real",
type=float,
default=1.0,
help="Use real CFG sampling for Flux validation images. Default: 1.0",
help="Use real CFG sampling for Flux validation images. Default: 1.0 (no CFG)",
)
parser.add_argument(
"--validation_no_cfg_until_timestep",
@@ -1992,9 +1949,6 @@ def parse_args(input_args=None):
if args.sd3:
args.pretrained_vae_model_name_or_path = None
args.disable_compel = True
if args.timestep_scheme is None:
args.timestep_scheme = "sd3"
logger.info(f"Using {args.timestep_scheme} timestep scheme.")

t5_max_length = 77
if args.sd3 and (
@@ -2020,9 +1974,6 @@
elif "dev" in args.pretrained_model_name_or_path.lower():
model_max_seq_length = 512
if args.flux:
if args.timestep_scheme is None:
args.timestep_scheme = "flux"
logger.info(f"Using {args.timestep_scheme} timestep scheme.")
if (
args.tokenizer_max_length is None
or int(args.tokenizer_max_length) > model_max_seq_length
2 changes: 0 additions & 2 deletions helpers/models/flux/__init__.py
@@ -5,7 +5,6 @@
def update_flux_schedule_to_fast(args, noise_scheduler_to_copy):
if args.flux_fast_schedule and args.flux:
# 4-step noise schedule [0.7, 0.1, 0.1, 0.1] from SD3-Turbo paper
print(f"sigmas before: {noise_scheduler_to_copy.sigmas}")
for i in range(0, 250):
noise_scheduler_to_copy.sigmas[i] = 1.0
for i in range(250, 500):
@@ -14,7 +13,6 @@ def update_flux_schedule_to_fast(args, noise_scheduler_to_copy):
noise_scheduler_to_copy.sigmas[i] = 0.2
for i in range(750, 1000):
noise_scheduler_to_copy.sigmas[i] = 0.1
print(f"sigmas after: {noise_scheduler_to_copy.sigmas}")
return noise_scheduler_to_copy
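# Illustrative usage sketch (not part of the original module): the helper only
# touches the scheduler's `sigmas` tensor, so a stand-in object is enough to
# demonstrate the fast schedule. The names below are placeholders.
def _demo_fast_schedule():
    import torch
    from types import SimpleNamespace

    fake_args = SimpleNamespace(flux_fast_schedule=True, flux=True)
    fake_scheduler = SimpleNamespace(sigmas=torch.linspace(1.0, 0.0, 1000))
    fake_scheduler = update_flux_schedule_to_fast(fake_args, fake_scheduler)
    # The 1000 per-timestep sigmas are now flattened into a few discrete buckets.
    return fake_scheduler.sigmas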

