Merge pull request #714 from bghira/main
fixed flux training
bghira authored Aug 10, 2024
2 parents eb40dc9 + fdc2b1d commit f50f929
Showing 21 changed files with 1,692 additions and 1,119 deletions.
11 changes: 11 additions & 0 deletions OPTIONS.md
@@ -372,7 +372,9 @@ usage: train.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--set_grads_to_none] [--noise_offset NOISE_OFFSET]
[--noise_offset_probability NOISE_OFFSET_PROBABILITY]
[--validation_guidance VALIDATION_GUIDANCE]
[--validation_guidance_real VALIDATION_GUIDANCE_REAL]
[--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE]
[--validation_no_cfg_until_timestep VALIDATION_NO_CFG_UNTIL_TIMESTEP]
[--validation_randomize] [--validation_seed VALIDATION_SEED]
[--fully_unload_text_encoder]
[--freeze_encoder_before FREEZE_ENCODER_BEFORE]
@@ -1192,6 +1194,15 @@ options:
to be applied 25 percent of the time.
--validation_guidance VALIDATION_GUIDANCE
CFG value for validation images. Default: 7.5
--validation_guidance_real VALIDATION_GUIDANCE_REAL
For flux, any value >1.0 makes validation use real
classifier-free guidance instead of the distilled
sampling.
Default: 1.0
--validation_no_cfg_until_timestep VALIDATION_NO_CFG_UNTIL_TIMESTEP
When using real CFG with flux, do not use CFG until this
sampling timestep.
Default: 2
--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE
CFG rescale value for validation images. Default: 0.0,
max 1.0
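
As a rough sketch of how these validation options fit together on a flux run (every argument below other than the documented validation flags, such as the model path and output directory, is an illustrative placeholder, not part of this change):

```bash
# Illustrative flux run: only the validation flags are the options documented
# above; the remaining arguments are placeholders for your own setup.
python train.py \
  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
  --output_dir="output/flux-lora" \
  --validation_guidance_real=3.5 \
  --validation_no_cfg_until_timestep=2 \
  --validation_guidance_rescale=0.0
```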
2 changes: 1 addition & 1 deletion README.md
@@ -66,7 +66,7 @@ Preliminary training support for Flux.1 is included:
- Train either Schnell or Dev models
- Quantise the base model by setting `--base_model_precision` to `int8-quanto` or `fp8-quanto` for major memory savings, as sketched below
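
A minimal sketch of a quantised invocation; only `--base_model_precision` is the option described above, and the other arguments are placeholders:

```bash
# Placeholder invocation; only --base_model_precision is the flag discussed above.
python train.py \
  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
  --base_model_precision=int8-quanto
```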

See [hardware requirements](#flux1-dev-schnell).
See [hardware requirements](#flux1-dev-schnell) or the [quickstart guide](/documentation/quickstart/FLUX.md).

### PixArt Sigma

6 changes: 6 additions & 0 deletions config/config.env.example
@@ -109,6 +109,12 @@ export VALIDATION_PROMPT="ethnographic photography of teddy bear at a picnic"
export VALIDATION_GUIDANCE=7.5
# You'll want to set this to 0.7 if you are training a terminal SNR model.
export VALIDATION_GUIDANCE_RESCALE=0.0
# For flux training, you may want to run validation with real classifier-free guidance.
# Set this to a value >1.0 to enable it; without CFG, flux validation results may be unreliable or look very bad.
# Flux LoRAs have the side-effect of requiring a blank negative prompt, so be sure to set VALIDATION_NEGATIVE_PROMPT="" as well.
export VALIDATION_GUIDANCE_REAL=1.0
# When using real CFG for flux validation, skip CFG until this sampling timestep (flux only, default=2).
export VALIDATION_NO_CFG_UNTIL_TIMESTEP=2
# How frequently we will save and run a pipeline for validations.
export VALIDATION_STEPS=100
export VALIDATION_NUM_INFERENCE_STEPS=30
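
Putting the flux-specific validation settings together, a minimal sketch might look like the block below. The guidance value of 3.5 is illustrative, and `VALIDATION_NEGATIVE_PROMPT` is the variable the comment above refers to:

```bash
# Example flux validation block: enable real CFG, skip CFG for the first two
# sampling timesteps, and use the blank negative prompt that flux LoRAs expect.
export VALIDATION_GUIDANCE_REAL=3.5
export VALIDATION_NO_CFG_UNTIL_TIMESTEP=2
export VALIDATION_NEGATIVE_PROMPT=""
```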
2 changes: 2 additions & 0 deletions documentation/data_presets/README.md
@@ -6,3 +6,5 @@ To add a new preset, use [this template](/documentation/data_presets/preset.md)

- [DALLE-3 1M](/documentation/data_presets/preset_dalle3.md)
- [ptx0/photo-concept-bucket](/documentation/data_presets/preset_pexels.md)
- [Midjourney v6 520k](/documentation/data_presets/preset_midjourney.md)
- [Nijijourney v6 520k](/documentation/data_presets/preset_nijijourney.md)
59 changes: 59 additions & 0 deletions documentation/data_presets/preset_midjourney.md
@@ -0,0 +1,59 @@
# Midjourney v6 520k

## Details

- **Hub link**: [terminusresearch/midjourney-v6-520k-raw](https://huggingface.co/datasets/terminusresearch/midjourney-v6-520k-raw)
- **Description**: ~520,000 high-quality outputs, with any Japanese user prompts re-captioned with GPT-3.5-Turbo.
- **Caption format(s)**: Parquet

## Required storage

This dataset contains all of the image data, so extracting it requires substantial free disk space. **Ensure you have at least 1.5TB of disk space available to extract it.**

T5-XXL text embeds for this dataset will consume ~520GB even with `--compress_disk_cache` enabled.
The VAE embeds will consume between 80 and 100GB of space, depending on the model being trained and the resolution of the embeds.
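
Adding these figures up, budget on the order of 2TB if the dataset and both caches live on one volume. A quick check before downloading (the path is a placeholder):

```bash
# Verify the volume that will hold the dataset and caches has enough free space:
# ~1.5TB for extraction plus ~520GB of text embeds and ~100GB of VAE embeds.
df -h /path/to/datasets
```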


## Download

```bash
huggingface-cli download --repo-type=dataset terminusresearch/midjourney-v6-520k-raw --local-dir=midjourney-v6-520k-raw
```

This downloads the chunked tar segments from the Hugging Face Hub in parallel.

## Extract

```bash
cd midjourney-v6-520k-raw
# concatenate the chunked segments and extract them as a single tar stream
cat *.tar | tar -xf -
```

This extracts all of the samples into a folder inside the current directory.
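
A rough sanity check after extraction; the extracted folder layout is assumed, so adjust the patterns to whatever the archive actually produces:

```bash
# Confirm a sensible directory layout and count the extracted image files.
find . -maxdepth 2 -type d | head
find . -type f \( -iname '*.png' -o -iname '*.jpg' -o -iname '*.webp' \) | wc -l
```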

## Dataloader configuration example

```json
{
"id": "midjourney-v6-520k-raw",
"type": "local",
"cache_dir_vae": "cache/vae-mj-520k/",
"crop": true,
"crop_aspect": "square",
"resolution": 1.0,
"maximum_image_size": 1.0,
"minimum_image_size": 0.75,
"target_downsample_size": 1.00,
"resolution_type": "area",
"caption_strategy": "parquet",
"metadata_backend": "parquet",
"parquet": {
"path": "/path/to/midjourney-v6-520k-raw/train.parquet",
"caption_column": "gpt_caption",
"filename_column": "id",
"width_column": "width",
"height_column": "height",
"identifier_includes_extension": false
}
}
```
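
The parquet metadata backend relies on the columns named above (`gpt_caption`, `id`, `width`, `height`) existing in `train.parquet`. A quick way to confirm that, assuming `pyarrow` is available in your environment:

```bash
# Print the parquet schema and check the columns referenced by the dataloader config.
python -c "import pyarrow.parquet as pq; print(pq.read_schema('/path/to/midjourney-v6-520k-raw/train.parquet'))"
```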
58 changes: 58 additions & 0 deletions documentation/data_presets/preset_nijijourney.md
@@ -0,0 +1,58 @@
# Niji v6 520k

## Details

- **Hub link**: [terminusresearch/nijijourney-v6-520k-raw](https://huggingface.co/datasets/terminusresearch/nijijourney-v6-520k-raw)
- **Description**: ~520,000 high-quality outputs, with any Japanese user prompts re-captioned with GPT-3.5-Turbo.
- **Caption format(s)**: Parquet

## Required storage

This dataset contains all of the image data, so extracting it requires substantial free disk space. **Ensure you have at least 1.5TB of disk space available to extract it.**

T5-XXL text embeds for this dataset will consume ~520GB even with `--compress_disk_cache` enabled.
The VAE embeds will consume between 80 and 100GB of space, depending on the model being trained and the resolution of the embeds.

## Download

```bash
huggingface-cli download --repo-type=dataset terminusresearch/nijijourney-v6-520k-raw --local-dir=nijijourney-v6-520k-raw
```

This downloads the chunked tar segments from the Hugging Face Hub in parallel.

## Extract

```bash
cd nijijourney-v6-520k-raw
# concatenate the chunked segments and extract them as a single tar stream
cat *.tar | tar -xf -
```

This extracts all of the samples into a folder inside the current directory.

## Dataloader configuration example

```json
{
"id": "nijijourney-v6-520k-raw",
"type": "local",
"cache_dir_vae": "cache/vae-nj-520k/",
"crop": true,
"crop_aspect": "square",
"resolution": 1.0,
"maximum_image_size": 1.0,
"minimum_image_size": 0.75,
"target_downsample_size": 1.00,
"resolution_type": "area",
"caption_strategy": "parquet",
"metadata_backend": "parquet",
"parquet": {
"path": "/path/to/nijijourney-v6-520k-raw/train.parquet",
"caption_column": "gpt_caption",
"filename_column": "id",
"width_column": "width",
"height_column": "height",
"identifier_includes_extension": false
}
}
```
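
Like the Midjourney preset, this entry is one element of your dataloader configuration file, which in SimpleTuner's examples is a JSON array of dataset definitions; the filename below is an assumption, so adjust it to your setup. After editing, a quick validity check:

```bash
# Confirm the dataloader config is still valid JSON after adding the entry.
python -m json.tool multidatabackend.json > /dev/null && echo "config OK"
```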