update test file and README
rchalamala committed Jun 6, 2024
1 parent 612cb06 commit 4d50c3b
Showing 2 changed files with 48 additions and 62 deletions.
47 changes: 20 additions & 27 deletions README.md
@@ -32,12 +32,6 @@ Recent advances in large multimodal models (LMMs) suggest that higher image resolution

## 💿 Installation

Clone this repository and navigate to the Dragonfly folder
```bash
git clone https://github.com/togethercomputer/Dragonfly.git
cd Dragonfly
```

Create a conda environment and install the necessary packages
```bash
conda env create -f environment.yml
@@ -76,50 +70,49 @@ Question: Summarize the visual content of the image.

Load necessary packages
```python
import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from dragonfly.models.modeling_dragonfly import DragonflyForCausalLM
from dragonfly.models.processing_dragonfly import DragonflyProcessor
from pipeline.train.train_utils import random_seed
```
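`random_seed` is a helper that lives in the repository's training utilities (`pipeline.train.train_utils`), so the snippet assumes you run it from the repository root. If you only need reproducible runs outside the repo, a minimal stand-in (the name `seed_everything` is ours, not part of the Dragonfly API) could look like this:
```python
import random

import numpy as np
import torch


def seed_everything(seed: int = 42) -> None:
    """Hypothetical stand-in for pipeline.train.train_utils.random_seed."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
```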

Instantiate the tokenizer, processor, and model.
```python
device = torch.device("cuda:0")

tokenizer = AutoTokenizer.from_pretrained("togethercomputer/Llama-3-8B-Dragonfly-v1")
clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
image_processor = clip_processor.image_processor
processor = DragonflyProcessor(image_processor=image_processor, tokenizer=tokenizer, image_encoding_style="llava-hd")

model = DragonflyForCausalLM.from_pretrained("togethercomputer/Llama-3-8B-Dragonfly-v1")
model = model.to(torch.bfloat16)
model = model.to(device)
```
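Casting to `torch.bfloat16` roughly halves the memory needed for the weights compared to float32. As an optional sanity check (not part of the original snippet), `transformers` models expose `get_memory_footprint()`, which for an 8B-parameter model in bfloat16 should report on the order of 16 GB:
```python
# Optional: report the size of the loaded weights (bytes -> GB).
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.1f} GB")
```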

Now, let's load the image and process it.
```python
image = Image.open("./test_images/skateboard.png")
image = image.convert("RGB")
images = [image]
# images = [None] # if you do not want to pass any images

text_prompt = "<|start_header_id|>user<|end_header_id|>\n\nSummarize the visual content of the image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

inputs = processor(text=[text_prompt], images=images, max_length=2048, return_tensors="pt", is_generate=True)
inputs = inputs.to(device)
```
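The prompt above spells out the Llama-3 chat format by hand. If the Dragonfly tokenizer ships with the standard Llama-3 chat template (an assumption worth checking; the rendered string may additionally start with `<|begin_of_text|>`), you could build an equivalent prompt with `apply_chat_template` instead:
```python
# Sketch: render the same user turn via the tokenizer's chat template, if one is configured.
messages = [{"role": "user", "content": "Summarize the visual content of the image."}]
text_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
```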

Finally, let us generate the response from the model.
```python
temperature = 0

with torch.inference_mode():
    generation_output = model.generate(**inputs, max_new_tokens=1024, eos_token_id=tokenizer.encode("<|eot_id|>"), do_sample=temperature > 0, temperature=temperature, use_cache=True)

generation_text = processor.batch_decode(generation_output, skip_special_tokens=False)
```
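Because `skip_special_tokens=False`, the decoded text still contains Dragonfly's reserved image tokens; `test_dragonfly.py` strips them before printing, and you can do the same here:
```python
# Remove the reserved image tokens (as test_dragonfly.py does) and print the answer.
response = generation_text[0].replace("<|reserved_special_token_0|>", "").replace("<|reserved_special_token_1|>", "")
print(response)
```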

63 changes: 28 additions & 35 deletions test_dragonfly.py
@@ -1,20 +1,17 @@
""" Testing script """
"""Testing script"""

import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer

from dragonfly.models.modeling_dragonfly import DragonflyForCausalLM
from dragonfly.models.processing_dragonfly import DragonflyProcessor
from pipeline.train.train_utils import random_seed


def format_text(text, system_prompt=""):
    instruction = f"{system_prompt} {text}" if system_prompt else text
    prompt = f"<|start_header_id|>user<|end_header_id|>\n\n" f"{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    return prompt

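For illustration only (this call is not part of the script), with the default empty system prompt the helper renders the Llama-3 user/assistant header format:
```python
# Hypothetical usage of format_text, shown only to make the rendered prompt concrete.
prompt = format_text("Provide a brief description of the given image.")
# prompt == ("<|start_header_id|>user<|end_header_id|>\n\n"
#            "Provide a brief description of the given image."
#            "<|eot_id|><|start_header_id|>assistant<|end_header_id|>")
```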

@@ -32,53 +29,49 @@ def format_text(text, system_prompt=""):
# question = "Summarize the visual content of the image."

# For biomed
pretrained_model_name_or_path = "togethercomputer/togethercomputer/Llama-3-8B-Dragonfly-Med-v1"
pretrained_model_name_or_path = "togethercomputer/Llama-3-8B-Dragonfly-Med-v1"
image_path = "./test_images/ROCO_04197.jpg"
question = "Provide a brief description of the given image."

# parameters
device = "cuda:0"
seed = 42
temperature = 0


def main():
    random_seed(seed)

    print(f"Loading pretrained model from {pretrained_model_name_or_path}")

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
    image_processor = clip_processor.image_processor
    processor = DragonflyProcessor(image_processor=image_processor, tokenizer=tokenizer, image_encoding_style="llava-hd")

    model = DragonflyForCausalLM.from_pretrained(pretrained_model_name_or_path)
    model = model.to(torch.bfloat16)
    model = model.to(device)

    # load the image
    image = Image.open(image_path)
    image = image.convert("RGB")
    images = [image]

    # prepare inputs for the model
    text_prompt = format_text(question)

    # process the text and image
    inputs = processor(text=[text_prompt], images=images, max_length=2048, return_tensors="pt", is_generate=True)
    inputs = inputs.to(device)

    # generate the response
    with torch.inference_mode():
        generation_output = model.generate(**inputs, max_new_tokens=1024, eos_token_id=tokenizer.encode("<|eot_id|>"), do_sample=temperature > 0, temperature=temperature, use_cache=True)

    generation_text = processor.batch_decode(generation_output, skip_special_tokens=False)
    print(generation_text[0].replace("<|reserved_special_token_0|>", "").replace("<|reserved_special_token_1|>", ""))


if __name__ == "__main__":
main()
