Releases: chengzeyi/ParaAttention

v0.3.13

02 Feb 16:31
Use download-artifact v4

v0.3.12

29 Jan 05:31
fix

v0.3.11

29 Jan 04:28
Fix HunyuanVideo context parallel with torch.compile

v0.3.10

09 Jan 08:52
fix cache accuracy

v0.3.9: Fastest FLUX.1-dev Inference
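
As a rough illustration of what this release advertises, here is a minimal sketch of running FLUX.1-dev with ParaAttention's context parallelism, mirroring the HunyuanVideo example below. The FluxPipeline usage is standard diffusers; the prompt and step count are placeholders, and the exact speedup is release-specific:

```python
import torch
import torch.distributed as dist
from diffusers import FluxPipeline

dist.init_process_group()

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to(f"cuda:{dist.get_rank()}")

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe

# Split attention computation across all participating GPUs
mesh = init_context_parallel_mesh(pipe.device.type)
parallelize_pipe(pipe, mesh=mesh)

image = pipe(
    "A cat holding a sign that says hello world",
    num_inference_steps=28,
    output_type="pil" if dist.get_rank() == 0 else "pt",
).images[0]

if dist.get_rank() == 0:
    image.save("flux.png")

dist.destroy_process_group()
```

Launch it with torchrun, the same way as the HunyuanVideo example below.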

v0.3.8

03 Jan 09:02
Dev first block cache (#12) (see the usage sketch after the commit list)

* implement first block cache

* fix

* fix

* fix

* add doc

* fix

* make flux work

* fix

* fix

* fix

* fix

* refactor

* fix

* fix

* Update fastest_hunyuan_video.md

* Update fastest_hunyuan_video.md

* fix

* fix

* fix

* fix
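
The feature this PR lands can be enabled on a supported diffusers pipeline roughly as follows; a minimal sketch, assuming the adapter module path para_attn.first_block_cache.diffusers_adapters and the apply_cache_on_pipe entry point:

```python
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Assumed entry point: reuse the residual of the first transformer block to
# decide when the remaining blocks can be skipped between denoising steps
from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe

apply_cache_on_pipe(pipe)

image = pipe("A cat walks on the grass, realistic", num_inference_steps=28).images[0]
image.save("flux_fbc.png")
```

The idea, roughly: when consecutive denoising steps produce similar first-block residuals, later blocks are skipped, trading a small amount of accuracy for fewer transformer evaluations.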

v0.3.7

25 Dec 16:10
Make hunyuan_video more robust

v0.3.6

25 Dec 15:01
Update README.md

v0.3.5

19 Dec 05:40
remove unnecessary assert

v0.3.4

19 Dec 02:09

Run HunyuanVideo🚀 with Parallel Inference

NOTE: To run HunyuanVideo, you need to install diffusers from its latest master branch.
It is suggested to run HunyuanVideo on GPUs with 80GB of memory; otherwise you might experience OOM errors,
and performance might suffer due to frequent memory re-allocation.
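
For example, a typical way to install diffusers from the master branch with pip:

```bash
pip install git+https://github.com/huggingface/diffusers.git
```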

```python
import torch
import torch.distributed as dist
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Disable the cuDNN SDP backend to work around:
# RuntimeError: Expected mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good() to be true, but got false.
torch.backends.cuda.enable_cudnn_sdp(False)

dist.init_process_group()

model_id = "tencent/HunyuanVideo"
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision="refs/pr/18",
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision="refs/pr/18",
).to(f"cuda:{dist.get_rank()}")

pipe.vae.enable_tiling(
    # Make it runnable on GPUs with 48GB memory
    tile_sample_min_height=128,
    tile_sample_stride_height=96,
    tile_sample_min_width=128,
    tile_sample_stride_width=96,
    tile_sample_min_num_frames=32,
    tile_sample_stride_num_frames=24,
)

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from para_attn.parallel_vae.diffusers_adapters import parallelize_vae

# Create a device mesh over all ranks; parallelize the transformer's attention
# with context parallelism, and the VAE over the flattened mesh.
mesh = init_context_parallel_mesh(
    pipe.device.type,
)
parallelize_pipe(
    pipe,
    mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())

# Fix OOM caused by inductor's inefficient lowering of the attn_bias of
# _scaled_dot_product_efficient_attention:
# import para_attn
# para_attn.config.attention.force_dispatch_to_custom_ops = True

# Optionally compile the transformer for further speedup:
# torch._inductor.config.reorder_for_compute_comm_overlap = True
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

output = pipe(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    output_type="pil" if dist.get_rank() == 0 else "pt",
).frames[0]

if dist.get_rank() == 0:
    print("Saving video to hunyuan_video.mp4")
    export_to_video(output, "hunyuan_video.mp4", fps=15)

dist.destroy_process_group()
```

Save the above code to run_hunyuan_video.py and run it with torchrun:

```bash
torchrun --nproc_per_node=2 run_hunyuan_video.py
```
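
--nproc_per_node sets the number of processes, one per GPU; set it to the number of GPUs you want to parallelize across (for example, --nproc_per_node=8 on an 8-GPU node).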