diff --git a/tests/run_llm_eval_test.py b/tests/run_llm_eval_test.py
new file mode 100644
index 0000000..43266ee
--- /dev/null
+++ b/tests/run_llm_eval_test.py
@@ -0,0 +1,76 @@
+import json
+import os
+import subprocess
+
+from rjsonnet import evaluate_file
+
+from llm_eval.common.testing import LLMEvalTestCase
+
+
+class TestRunLLMEvalScript(LLMEvalTestCase):
+    def test_script_with_config(self):
+        config_file = self.PROJECT_ROOT / "configs" / "run_lm_eval_example.jsonnet"
+
+        _ = subprocess.run(
+            ["python", "-m", "llm_eval.run_lm_eval", "--config_file", config_file],
+            capture_output=True,
+            text=True,
+        )
+
+        config = json.loads(evaluate_file(str(config_file)))
+
+        assert os.path.exists(config["metrics_file"])
+        assert os.path.exists(config["full_output_file"])
+
+        with open(config["metrics_file"]) as f:
+            metrics = json.load(f)
+
+        assert "metrics" in metrics
+        assert len(metrics["metrics"]) == len(config["task"])
+
+        os.remove(config["metrics_file"])
+        os.remove(config["full_output_file"])
+
+    def test_script_with_args(self):
+        predictions_file = self.TEST_DIR / "predictions.jsonl"
+        metrics_file = self.TEST_DIR / "metrics.jsonl"
+
+        _ = subprocess.run(
+            [
+                "python",
+                "-m",
+                "llm_eval.run_lm_eval",
+                "--model",
+                "lm::pretrained=EleutherAI/pythia-160m,revision=step140000",
+                "--task",
+                "arc_challenge",
+                "arc_easy",
+                "--split",
+                "validation",
+                "--full_output_file",
+                predictions_file,
+                "--metrics_file",
+                metrics_file,
+                "--model_max_length",
+                "2048",
+                "--max_batch_tokens",
+                "4096",
+                "--num_recorded_inputs",
+                "3",
+                "--num_shots",
+                "0",
+                "--limit",
+                "10",
+            ],
+            capture_output=True,
+            text=True,
+        )
+
+        assert os.path.exists(str(metrics_file))
+        assert os.path.exists(str(predictions_file))
+
+        with open(metrics_file) as f:
+            metrics = json.load(f)
+
+        assert "metrics" in metrics
+        assert len(metrics["metrics"]) == 2
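
For reference (not part of the diff): `test_script_with_config` evaluates `configs/run_lm_eval_example.jsonnet` with `rjsonnet` and only relies on the keys `metrics_file`, `full_output_file`, and `task` (a list). A minimal sketch of what such a config could look like is below; everything beyond those three keys mirrors the CLI flags used in `test_script_with_args` and is an assumption about the example config, not its actual contents.

```jsonnet
// Hypothetical sketch of configs/run_lm_eval_example.jsonnet.
// Only metrics_file, full_output_file, and task (a list) are implied by the
// test; the remaining fields are assumed to mirror the CLI flags above.
{
  model: "lm::pretrained=EleutherAI/pythia-160m,revision=step140000",
  task: ["arc_challenge", "arc_easy"],
  split: "validation",
  num_shots: 0,
  limit: 10,
  metrics_file: "/tmp/run_lm_eval_example/metrics.json",
  full_output_file: "/tmp/run_lm_eval_example/predictions.jsonl",
}
```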