update test file and README
rchalamala committed Jun 6, 2024
1 parent 612cb06 commit 4d50c3b
Showing 2 changed files with 48 additions and 62 deletions.
47 changes: 20 additions & 27 deletions README.md
@@ -32,12 +32,6 @@ Recent advances in large multimodal models (LMMs) suggest that higher image resolution

## 💿 Installation

Clone this repository and navigate to the Dragonfly folder
```bash
git clone https://github.com/togethercomputer/Dragonfly.git
cd Dragonfly
```

Create a conda environment and install the necessary packages
```bash
conda env create -f environment.yml
@@ -76,50 +70,49 @@ Question: Summarize the visual content of the image.

Load necessary packages
```python
import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from dragonfly.models.modeling_dragonfly import DragonflyForCausalLM
from dragonfly.models.processing_dragonfly import DragonflyProcessor
from pipeline.train.train_utils import random_seed
```
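`random_seed` is a helper that lives in the repository's training utilities (`pipeline.train.train_utils`), so the snippet assumes you run it from the repository root. If you only need reproducible runs outside the repo, a minimal stand-in (the name `seed_everything` is ours, not part of the Dragonfly API) could look like this:
```python
import random

import numpy as np
import torch


def seed_everything(seed: int = 42) -> None:
    """Hypothetical stand-in for pipeline.train.train_utils.random_seed."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
```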

Instantiate the tokenizer, processor, and model.
```python
device = torch.device("cuda:0")

tokenizer = AutoTokenizer.from_pretrained("togethercomputer/Llama-3-8B-Dragonfly-v1")
clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
image_processor = clip_processor.image_processor
processor = DragonflyProcessor(image_processor=image_processor, tokenizer=tokenizer, image_encoding_style="llava-hd")

model = DragonflyForCausalLM.from_pretrained("togethercomputer/Llama-3-8B-Dragonfly-v1")
model = model.to(torch.bfloat16)
model = model.to(device)
```
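Casting to `torch.bfloat16` roughly halves the memory needed for the weights compared to float32. As an optional sanity check (not part of the original snippet), `transformers` models expose `get_memory_footprint()`, which for an 8B-parameter model in bfloat16 should report on the order of 16 GB:
```python
# Optional: report the size of the loaded weights (bytes -> GB).
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.1f} GB")
```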

Now, let's load the image and process it.
```python
image = Image.open("./test_images/skateboard.png")
image = image.convert("RGB")
images = [image]
# images = [None] # if you do not want to pass any images

text_prompt = "<|start_header_id|>user<|end_header_id|>\n\nSummarize the visual content of the image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

inputs = processor(text=[text_prompt], images=images, max_length=2048, return_tensors="pt", is_generate=True)
inputs = inputs.to(device)
```
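The prompt above spells out the Llama-3 chat format by hand. If the Dragonfly tokenizer ships with the standard Llama-3 chat template (an assumption worth checking; the rendered string may additionally start with `<|begin_of_text|>`), you could build an equivalent prompt with `apply_chat_template` instead:
```python
# Sketch: render the same user turn via the tokenizer's chat template, if one is configured.
messages = [{"role": "user", "content": "Summarize the visual content of the image."}]
text_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
```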

Finally, let us generate the response from the model.
```python
temperature = 0

with torch.inference_mode():
    generation_output = model.generate(**inputs, max_new_tokens=1024, eos_token_id=tokenizer.encode("<|eot_id|>"), do_sample=temperature > 0, temperature=temperature, use_cache=True)

generation_text = processor.batch_decode(generation_output, skip_special_tokens=False)
```
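Because `skip_special_tokens=False`, the decoded text still contains Dragonfly's reserved image tokens; `test_dragonfly.py` strips them before printing, and you can do the same here:
```python
# Remove the reserved image tokens (as test_dragonfly.py does) and print the answer.
response = generation_text[0].replace("<|reserved_special_token_0|>", "").replace("<|reserved_special_token_1|>", "")
print(response)
```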

63 changes: 28 additions & 35 deletions test_dragonfly.py
@@ -1,20 +1,17 @@
""" Testing script """
"""Testing script"""

import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from dragonfly.models.modeling_dragonfly import DragonflyForCausalLM
from dragonfly.models.processing_dragonfly import DragonflyProcessor
from pipeline.train.train_utils import random_seed


def format_text(text, system_prompt=""):
    instruction = f"{system_prompt} {text}" if system_prompt else text
    prompt = f"<|start_header_id|>user<|end_header_id|>\n\n" f"{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    return prompt

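For illustration only (this call is not part of the script), with the default empty system prompt the helper renders the Llama-3 user/assistant header format:
```python
# Hypothetical usage of format_text, shown only to make the rendered prompt concrete.
prompt = format_text("Provide a brief description of the given image.")
# prompt == ("<|start_header_id|>user<|end_header_id|>\n\n"
#            "Provide a brief description of the given image."
#            "<|eot_id|><|start_header_id|>assistant<|end_header_id|>")
```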

@@ -32,53 +29,49 @@ def format_text(text, system_prompt=""):
# question = "Summarize the visual content of the image."

# For biomed
pretrained_model_name_or_path = "togethercomputer/togethercomputer/Llama-3-8B-Dragonfly-Med-v1"
pretrained_model_name_or_path = "togethercomputer/Llama-3-8B-Dragonfly-Med-v1"
image_path = "./test_images/ROCO_04197.jpg"
question = "Provide a brief description of the given image."

# parameters
device = "cuda:0"
seed = 42
temperature = 0


def main():
    random_seed(seed)

    print(f"Loading pretrained model from {pretrained_model_name_or_path}")

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
    image_processor = clip_processor.image_processor
    processor = DragonflyProcessor(image_processor=image_processor, tokenizer=tokenizer, image_encoding_style="llava-hd")

    model = DragonflyForCausalLM.from_pretrained(pretrained_model_name_or_path)
    model = model.to(torch.bfloat16)
    model = model.to(device)

    # load the image
    image = Image.open(image_path)
    image = image.convert("RGB")
    images = [image]

    # prepare inputs for the model
    text_prompt = format_text(question)

    # process the text and image
    inputs = processor(text=[text_prompt], images=images, max_length=2048, return_tensors="pt", is_generate=True)
    inputs = inputs.to(device)

    # generate the response
    with torch.inference_mode():
        generation_output = model.generate(**inputs, max_new_tokens=1024, eos_token_id=tokenizer.encode("<|eot_id|>"), do_sample=temperature > 0, temperature=temperature, use_cache=True)

    generation_text = processor.batch_decode(generation_output, skip_special_tokens=False)
    print(generation_text[0].replace("<|reserved_special_token_0|>", "").replace("<|reserved_special_token_1|>", ""))


if __name__ == "__main__":
main()
