diff --git a/moondream/eval/chartqa.py b/moondream/eval/chartqa.py
index 4d8b5f3a..c8563a08 100644
--- a/moondream/eval/chartqa.py
+++ b/moondream/eval/chartqa.py
@@ -10,22 +10,8 @@
 PREFIX = "Analyze the chart carefully, consider both visual features and data values, and provide a precise answer without any additional explanation or formatting. "
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, required=True)
-    parser.add_argument("--debug", action="store_true")
-    args = parser.parse_args()
-
-    if torch.cuda.is_available():
-        torch.set_default_device("cuda")
-    elif torch.backends.mps.is_available():
-        torch.set_default_device("mps")
-
-    config = MoondreamConfig()
-    model = MoondreamModel(config)
-    load_weights_into_model(args.model, model)
-    model.compile()
+def eval_chartqa(model, debug=False):
     dataset = datasets.load_dataset("vikhyatk/chartqa", split="test")
 
     correct = 0
     total = 0
 
@@ -33,7 +19,7 @@
     human_correct = 0
     human_total = 0
 
-    for row in tqdm(dataset, disable=args.debug):
+    for row in tqdm(dataset, disable=debug):
         image = row["image"]
         encoded_image = model.encode_image(image)
 
@@ -49,11 +35,11 @@
                 correct += 1
                 if qa["source"] == "human":
                     human_correct += 1
-            elif args.debug:
+            elif debug:
                 print(f"Question: {qa['question']}")
                 print(f"Answer: {answer}")
                 print(f"Model Answer: {model_answer}")
-            if args.debug:
+            if debug:
                 print(
                     f"Correct: {correct}, Total: {total}, Human Correct: {human_correct}, Human Total: {human_total}"
                 )
@@ -61,8 +47,28 @@
                 print(f"Total Accuracy: {correct * 100 / total:.2f}")
                 print("---------")
 
-    print(
-        f"Correct: {correct}, Total: {total}, Human Correct: {human_correct}, Human Total: {human_total}"
-    )
-    print(f"Human Accuracy: {human_correct * 100 / human_total:.2f}")
-    print(f"Total Accuracy: {correct * 100 / total:.2f}")
+    return {
+        "human_acc": human_correct * 100 / human_total,
+        "total_acc": correct * 100 / total,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+
+    if torch.cuda.is_available():
+        torch.set_default_device("cuda")
+    elif torch.backends.mps.is_available():
+        torch.set_default_device("mps")
+
+    config = MoondreamConfig()
+    model = MoondreamModel(config)
+    load_weights_into_model(args.model, model)
+    model.compile()
+
+    results = eval_chartqa(model, args.debug)
+    print(f"Human Accuracy: {results['human_acc']:.2f}")
+    print(f"Total Accuracy: {results['total_acc']:.2f}")
diff --git a/moondream/eval/countbenchqa.py b/moondream/eval/countbenchqa.py
index ce7e632e..d500dcf7 100644
--- a/moondream/eval/countbenchqa.py
+++ b/moondream/eval/countbenchqa.py
@@ -10,27 +10,14 @@
 PREFIX = "Look at the image carefully and count the objects. Answer with just a number, without any additional text. "
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, required=True)
-    parser.add_argument("--debug", action="store_true")
-    args = parser.parse_args()
-
-    if torch.cuda.is_available():
-        torch.set_default_device("cuda")
-    elif torch.backends.mps.is_available():
-        torch.set_default_device("mps")
-
-    config = MoondreamConfig()
-    model = MoondreamModel(config)
-    load_weights_into_model(args.model, model)
+def eval_countbenchqa(model, debug=False):
     dataset = datasets.load_dataset("vikhyatk/CountBenchQA", split="test")
 
     correct = 0
     total = 0
 
-    for row in tqdm(dataset, disable=args.debug):
+    for row in tqdm(dataset, disable=debug):
         image = row["image"]
         encoded_image = model.encode_image(image)
 
@@ -41,14 +28,38 @@
         total += 1
         if model_answer.strip().lower() == answer.strip().lower():
             correct += 1
-        elif args.debug:
+        elif debug:
             print(f"Question: {row['question']}")
             print(f"Answer: {answer}")
             print(f"Model Answer: {model_answer}")
-        if args.debug:
+        if debug:
             print(f"Correct: {correct}, Total: {total}")
             print(f"Accuracy: {correct * 100 / total:.2f}")
             print("---------")
 
-    print(f"Correct: {correct}, Total: {total}")
-    print(f"Accuracy: {correct * 100 / total:.2f}")
+    return {
+        "acc": correct * 100 / total,
+        "correct_count": correct,
+        "total_count": total,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+
+    if torch.cuda.is_available():
+        torch.set_default_device("cuda")
+    elif torch.backends.mps.is_available():
+        torch.set_default_device("mps")
+
+    config = MoondreamConfig()
+    model = MoondreamModel(config)
+    load_weights_into_model(args.model, model)
+
+    result = eval_countbenchqa(model, args.debug)
+
+    print(f"Accuracy: {result['acc']:.2f}")
+    print(f"Correct: {result['correct_count']}, Total: {result['total_count']}")
diff --git a/moondream/eval/docvqa.py b/moondream/eval/docvqa.py
index f464321d..ad245d2e 100644
--- a/moondream/eval/docvqa.py
+++ b/moondream/eval/docvqa.py
@@ -19,22 +19,7 @@ def get_anls(s1, s2):
     return anls
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, required=True)
-    parser.add_argument("--debug", action="store_true")
-    args = parser.parse_args()
-
-    if torch.cuda.is_available():
-        torch.set_default_device("cuda")
-    elif torch.backends.mps.is_available():
-        torch.set_default_device("mps")
-
-    config = MoondreamConfig()
-    model = MoondreamModel(config)
-    load_weights_into_model(args.model, model)
-    model.compile()
-
+def eval_docvqa(model, debug=False):
     docvqa_val = load_dataset("vikhyatk/docvqa-val", split="validation")
 
     scores = []
 
@@ -58,4 +43,27 @@ def get_anls(s1, s2):
         print(f"Current Average ANLS: {sum(scores) / len(scores):.4f}")
         print("---------")
 
-    print("ANLS:", sum(scores) / len(scores))
+    return {
+        "anls": sum(scores) / len(scores),
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+
+    if torch.cuda.is_available():
+        torch.set_default_device("cuda")
+    elif torch.backends.mps.is_available():
+        torch.set_default_device("mps")
+
+    config = MoondreamConfig()
+    model = MoondreamModel(config)
+    load_weights_into_model(args.model, model)
+    model.compile()
+
+    result = eval_docvqa(model, args.debug)
+
+    print(f"ANLS: {result['anls']:.4f}")
diff --git a/moondream/eval/gazefollow.py b/moondream/eval/gazefollow.py
index f4221af4..56e94e99 100644
--- a/moondream/eval/gazefollow.py
+++ b/moondream/eval/gazefollow.py
@@ -8,87 +8,104 @@
 from ..torch.moondream import MoondreamModel
 from ..torch.weights import load_weights_into_model
 
-dataset = datasets.load_dataset("vikhyatk/gazefollow", split="test")
-torch.set_default_device("cuda")
-model = MoondreamModel(MoondreamConfig())
-load_weights_into_model("model.pt", model)
+def eval_gazefollow(model, debug=False):
+    dataset = datasets.load_dataset("vikhyatk/gazefollow", split="test")
+
+    mean_l2_error = []
+    min_l2_error = []
+    total = 0
+
+    for i, row in tqdm(enumerate(dataset), total=len(dataset)):
+        heads = []
+
+        for gaze in row["gazes"]:
+            head_bbox = gaze["head_bbox"]  # xmin, ymin, xmax, ymax
+            eye_coord = (gaze["eye"]["x"], gaze["eye"]["y"])
+            mean_target_gaze = (gaze["gaze"]["x"], gaze["gaze"]["y"])
+
+            # Check if a head already exists with the same approximate bbox.
+            # If so, use that head instead of creating a new one.
+            for head in heads:
+                if (
+                    abs(head["head_bbox"]["xmin"] - head_bbox["xmin"]) < 0.001
+                    and abs(head["head_bbox"]["xmax"] - head_bbox["xmax"]) < 0.001
+                    and abs(head["head_bbox"]["ymin"] - head_bbox["ymin"]) < 0.001
+                    and abs(head["head_bbox"]["ymax"] - head_bbox["ymax"]) < 0.001
+                ):
+                    head["gazes"].append(mean_target_gaze)
+                    break
+            else:
+                heads.append(
+                    {
+                        "head_bbox": head_bbox,
+                        "eye_coord": eye_coord,
+                        "gazes": [mean_target_gaze],
+                    }
+                )
 
-mean_l2_error = []
-min_l2_error = []
-total = 0
+        for head in heads:
+            pred_gaze = model.detect_gaze(
+                row["image"],
+                eye=head["eye_coord"],
+                face={
+                    "x_min": head["head_bbox"]["xmin"],
+                    "y_min": head["head_bbox"]["ymin"],
+                    "x_max": head["head_bbox"]["xmax"],
+                    "y_max": head["head_bbox"]["ymax"],
+                },
+                unstable_settings={"force_detect": True},
+            )["gaze"]
+
+            mean_target_gaze = (
+                sum(gaze[0] for gaze in head["gazes"]) / len(head["gazes"]),
+                sum(gaze[1] for gaze in head["gazes"]) / len(head["gazes"]),
+            )
+            mean_l2 = math.sqrt(
+                (mean_target_gaze[0] - pred_gaze["x"]) ** 2
+                + (mean_target_gaze[1] - pred_gaze["y"]) ** 2
+            )
+            min_l2 = min(
+                math.sqrt(
+                    (target_gaze[0] - pred_gaze["x"]) ** 2
+                    + (target_gaze[1] - pred_gaze["y"]) ** 2
+                )
+                for target_gaze in head["gazes"]
+            )
+            mean_l2_error.append(mean_l2)
+            min_l2_error.append(min_l2)
+            total += 1
 
-for i, row in tqdm(enumerate(dataset), total=len(dataset)):
-    encoded_image = model.encode_image(row["image"])
+        if i % 100 == 0 and debug:
+            print("Mean L2 error:", sum(mean_l2_error) / total)
+            print("Min L2 error:", sum(min_l2_error) / total)
 
-    heads = []
+    return {
+        "mean_l2": sum(mean_l2_error) / total,
+        "min_l2": sum(min_l2_error) / total,
+    }
 
-    for gaze in row["gazes"]:
-        head_bbox = gaze["head_bbox"]  # xmin, ymin, xmax, ymax
-        eye_coord = (gaze["eye"]["x"], gaze["eye"]["y"])
-        mean_target_gaze = (gaze["gaze"]["x"], gaze["gaze"]["y"])
-
-        # Check if a head already exists with the same approximate bbox.
-        # If so, use that head instead of creating a new one.
- for head in heads: - if ( - abs(head["head_bbox"]["xmin"] - head_bbox["xmin"]) < 0.001 - and abs(head["head_bbox"]["xmax"] - head_bbox["xmax"]) < 0.001 - and abs(head["head_bbox"]["ymin"] - head_bbox["ymin"]) < 0.001 - and abs(head["head_bbox"]["ymax"] - head_bbox["ymax"]) < 0.001 - ): - head["gazes"].append(mean_target_gaze) - break - else: - heads.append( - { - "head_bbox": head_bbox, - "eye_coord": eye_coord, - "gazes": [mean_target_gaze], - } - ) +if __name__ == "__main__": + import argparse - for head in heads: - pred_gaze = model.detect_gaze( - row["image"], - eye=head["eye_coord"], - face={ - "x_min": head["head_bbox"]["xmin"], - "y_min": head["head_bbox"]["ymin"], - "x_max": head["head_bbox"]["xmax"], - "y_max": head["head_bbox"]["ymax"], - }, - unstable_settings={"force_detect": True}, - )["gaze"] - - mean_target_gaze = ( - sum(gaze[0] for gaze in head["gazes"]) / len(head["gazes"]), - sum(gaze[1] for gaze in head["gazes"]) / len(head["gazes"]), - ) - mean_l2 = math.sqrt( - (mean_target_gaze[0] - pred_gaze["x"]) ** 2 - + (mean_target_gaze[1] - pred_gaze["y"]) ** 2 - ) - min_l2 = min( - math.sqrt( - (target_gaze[0] - pred_gaze["x"]) ** 2 - + (target_gaze[1] - pred_gaze["y"]) ** 2 - ) - for target_gaze in head["gazes"] - ) + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True) + + parser.add_argument("--debug", action="store_true") + args = parser.parse_args() - mean_l2_error.append(mean_l2) - min_l2_error.append(min_l2) - total += 1 + if torch.cuda.is_available(): + torch.set_default_device("cuda") + elif torch.backends.mps.is_available(): + torch.set_default_device("mps") - if i % 100 == 0: - print("Mean L2 error:", sum(mean_l2_error) / total) - print("Min L2 error:", sum(min_l2_error) / total) + config = MoondreamConfig() + model = MoondreamModel(config) + load_weights_into_model(args.model, model) + results = eval_gazefollow(model, debug=args.debug) -print() -print("Single prediction mode") -print("Final score:") -print("Mean L2 error:", sum(mean_l2_error) / total) -print("Min L2 error:", sum(min_l2_error) / total) + print(f"Mean L2 error: {results['mean_l2']:.4f}") + print(f"Min L2 error: {results['min_l2']:.4f}") diff --git a/moondream/eval/gqa.py b/moondream/eval/gqa.py deleted file mode 100644 index 8770e2b5..00000000 --- a/moondream/eval/gqa.py +++ /dev/null @@ -1,100 +0,0 @@ -import argparse -import datasets -import torch - -from tqdm import tqdm - -from ..torch.config import MoondreamConfig -from ..torch.moondream import MoondreamModel -from ..torch.weights import load_weights_into_model - - -def evaluate_gqa_answer(prediction: str, ground_truth: str) -> bool: - """ - Evaluates if a predicted answer matches the ground truth using GQA evaluation rules. 
- - Args: - prediction: Model's predicted answer string - ground_truth: Ground truth answer string - - Returns: - bool: True if answers match after preprocessing, False otherwise - """ - # Preprocess prediction - pred = prediction.strip().lower() - pred = pred.split(".")[0] - pred = pred.split(",")[0] - pred = pred.split("!")[0] - - # Remove common prefixes from prediction - for prefix in ["is ", "are ", "a ", "an ", "the "]: - if pred.startswith(prefix): - pred = pred[len(prefix) :] - - # Remove " of" suffix and anything after from prediction - if " of" in pred: - pred = pred.split(" of")[0] - pred = pred.strip() - - # Preprocess ground truth the same way - truth = ground_truth.strip().lower() - truth = truth.split(".")[0] - truth = truth.split(",")[0] - truth = truth.split("!")[0] - - for prefix in ["is ", "are ", "a ", "an ", "the "]: - if truth.startswith(prefix): - truth = truth[len(prefix) :] - - if " of" in truth: - truth = truth.split(" of")[0] - truth = truth.strip() - - return pred == truth - - -PREFIX = "Consider both visual features and relationships, and think carefully before providing the final answer. " - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str, required=True) - parser.add_argument("--debug", action="store_true") - args = parser.parse_args() - - if torch.cuda.is_available(): - torch.set_default_device("cuda") - elif torch.backends.mps.is_available(): - torch.set_default_device("mps") - - config = MoondreamConfig() - model = MoondreamModel(config) - load_weights_into_model(args.model, model) - model.compile() - - dataset = datasets.load_dataset("vikhyatk/gqa-val", split="test") - - total = 0 - correct = 0 - - for row in tqdm(dataset, disable=args.debug): - image = row["image"] - encoded_image = model.encode_image(image) - - for qa in row["qa"]: - question = PREFIX + qa["question"] - answer = qa["answer"] - - model_answer = model.query(encoded_image, question)["answer"] - - total += 1 - if evaluate_gqa_answer(model_answer, answer): - correct += 1 - elif args.debug: - print(f"Question: {qa['question']}") - print(f"Answer: {answer}") - print(f"Model Answer: {model_answer}") - print(f"Correct: {correct}, Total: {total}") - print(f"Accuracy: {correct * 100 / total:.2f}") - print("---------") - - print(f"Total: {total}, Correct: {correct}, Accuracy: {correct * 100 / total:.2f}")
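Note on usage: with the CLI setup moved under if __name__ == "__main__": guards, each benchmark is now importable as a plain function that takes a loaded model and returns a dict of metrics, so all four evals can share one model instance. A minimal driver along the following lines becomes possible (a sketch, not part of this diff: the run_all_evals.py filename, the repo-root invocation, and the absolute moondream.* import paths are assumptions):

# run_all_evals.py -- hypothetical driver built on the refactored eval functions.
# Assumes it is invoked from the repository root so the moondream package is importable.
import argparse

import torch

from moondream.eval.chartqa import eval_chartqa
from moondream.eval.countbenchqa import eval_countbenchqa
from moondream.eval.docvqa import eval_docvqa
from moondream.eval.gazefollow import eval_gazefollow
from moondream.torch.config import MoondreamConfig
from moondream.torch.moondream import MoondreamModel
from moondream.torch.weights import load_weights_into_model

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True)
    args = parser.parse_args()

    if torch.cuda.is_available():
        torch.set_default_device("cuda")
    elif torch.backends.mps.is_available():
        torch.set_default_device("mps")

    # Load the weights once and reuse the same model for every benchmark,
    # instead of paying the setup cost in each per-benchmark __main__ block.
    model = MoondreamModel(MoondreamConfig())
    load_weights_into_model(args.model, model)
    model.compile()

    results = {
        "chartqa": eval_chartqa(model),        # {"human_acc": ..., "total_acc": ...}
        "countbenchqa": eval_countbenchqa(model),  # {"acc": ..., "correct_count": ..., "total_count": ...}
        "docvqa": eval_docvqa(model),          # {"anls": ...}
        "gazefollow": eval_gazefollow(model),  # {"mean_l2": ..., "min_l2": ...}
    }
    for name, metrics in results.items():
        print(name, metrics)

Since every eval_* function now returns its scores as a dict rather than only printing them, results can be aggregated or serialized directly instead of being scraped from stdout.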