Commit

better benchmarks readme
cdump committed Feb 15, 2025
1 parent a6455bc commit 493b783
Showing 5 changed files with 90 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@ test-results/
dist/

target/
.aider*
55 changes: 34 additions & 21 deletions benchmark/README.md
@@ -1,54 +1,67 @@
# Benchmarks

Test accuracy and speed of different function-signature and arguments extractors
Test accuracy and speed of different EVM bytecode analysis tools

For results, refer to the [main README.md](../README.md#Benchmark).

## Methodology
1. Get N Etherscan-verified contracts, save the bytecode and ABI to `datasets/NAME/ADDR.json`.
2. Extract function signatures/arguments/state mutability from the bytecode. Each tool runs inside a Docker container and is limited to 1 CPU (see `providers/NAME` and `Makefile`).
2. Extract information from the bytecode using different tools. Each tool runs inside a Docker container and is limited to 1 CPU (see `providers/NAME` and `Makefile`).
3. Assume Etherscan's ABI as ground truth.
4. Compare the results with it and count [False Positives and False Negatives](https://en.wikipedia.org/wiki/False_positives_and_false_negatives) for signatures and count correct results (strings equal) for arguments and state mutability.
4. Compare the results:
- For selectors: Count [False Positives and False Negatives](https://en.wikipedia.org/wiki/False_positives_and_false_negatives)
- For arguments/mutability: Count exact matches
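
The selector comparison described in step 4 can be sketched with Python set operations (a minimal illustration with made-up selector values, not the actual `compare.py` code):

```python
# False-positive / false-negative counting for function selectors,
# as described in step 4 (illustrative selectors, not real data).

ground_truth = {"0xa9059cbb", "0x095ea7b3", "0x70a08231"}  # from the Etherscan ABI
extracted = {"0xa9059cbb", "0x70a08231", "0xdeadbeef"}     # reported by a tool

false_positives = extracted - ground_truth  # reported, but not in the ABI
false_negatives = ground_truth - extracted  # in the ABI, but missed

print(sorted(false_positives))  # ['0xdeadbeef']
print(sorted(false_negatives))  # ['0x095ea7b3']
```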

## Reproduce
Set the performance mode using `sudo cpupower frequency-set -g performance` and run `make benchmark-selectors` or `make benchmark-arguments` ([GNU Make](https://www.gnu.org/software/make/)) inside the `benchmark/` directory.
Set the performance mode using `sudo cpupower frequency-set -g performance` and run benchmarks ([GNU Make](https://www.gnu.org/software/make/)) inside the `benchmark/` directory:

To use [Podman](https://podman.io/) instead of Docker: `DOCKER=podman make benchmark-selectors`
```sh
make benchmark-selectors # Run function selector tests
make benchmark-arguments # Run argument extraction tests
make benchmark-mutability # Run state mutability tests
```

To use [Podman](https://podman.io/) instead of Docker:
```sh
DOCKER=podman make benchmark-selectors
```

You can run only specific step; for example:
You can run specific steps; for example:
```sh
# Only build docker-images
$ make build

# Only run tests for selectors (assume that docker-images are already built)
# Only run tests for selectors (assume docker-images are built)
$ make run-selectors

# Build `etherscan` docker image
# Build specific provider
$ make etherscan.build

# Run `etherscan` on dataset `largest1k` to extract function selectors
# Run specific provider/mode/dataset
$ make etherscan.selectors/largest1k

# Run `etherscan` on dataset `largest1k` to extract function arguments
$ make etherscan.arguments/largest1k
```

To process results run `compare.py`:
## Process Results
Use `compare.py` to analyze results:

```sh
# default mode: compare 'selectors' results
# Default mode (selectors)
$ python3 compare.py

# compare 'arguments' results
# Compare specific mode
$ python3 compare.py --mode=arguments
$ python3 compare.py --mode=mutability

# compare 'arguments' results for specified providers and datasets, show errors
$ python3 compare.py --mode=arguments --datasets largest1k --providers etherscan evmole-py --show-errors
# Filter by dataset/provider and show errors
$ python3 compare.py --mode=arguments --datasets largest1k --providers etherscan evmole-py --show-errors

# compare in web-browser
$ ../.venv/bin/python3 compare.py --web-listen 127.0.0.1:8080
```
# Normalize argument comparisons
$ python3 compare.py --mode=arguments --normalize-args fixed-size-array tuples string-bytes

# Output markdown tables
$ python3 compare.py --mode=selectors --markdown
```

## How datasets was constructed
See [datasets/README.md](datasets README)
## Datasets
See [datasets/README.md](datasets/README.md) for information about how the test datasets were constructed.
91 changes: 53 additions & 38 deletions benchmark/compare.py
@@ -3,6 +3,7 @@
import math
import pathlib
import re
from collections import defaultdict


def load_data(btype: str, dname: str, providers: list[str], results_dir: str) -> tuple[list, list]:
@@ -15,25 +16,27 @@ def load_data(btype: str, dname: str, providers: list[str], results_dir: str) ->
times.append(float(fh.read()))
return data, times


def process_selectors(dname: str, providers: list[str], results_dir: str):
pdata, ptimes = load_data('selectors', dname, providers, results_dir)
ret = []
for fname, (_meta, gt) in pdata[0].items():
gt_set = set(gt)
data = []
for i in range(1, len(providers)): # skip ground_truth provider
d = set(pdata[i][fname][1])
fp = list(d - gt_set)
fn = list(gt_set - d)
data.append([fp, fn])
ret.append({
results = []
ground_truth_provider = pdata[0]
for fname, (_, ground_truth) in ground_truth_provider.items():
ground_truth_set = set(ground_truth)
provider_comparisons = []

for provider_data in pdata[1:]:
provider_set = set(provider_data[fname][1])
false_positives = list(provider_set - ground_truth_set)
false_negatives = list(ground_truth_set - provider_set)
provider_comparisons.append([false_positives, false_negatives])

results.append({
'addr': fname[2:-5], # '0xFF.json' => 'FF'
'ground_truth': gt,
'data': data,
'ground_truth': ground_truth,
'data': provider_comparisons,
})
return {'dataset': dname, 'results': ret, 'timings': ptimes[1:]}

return { 'dataset': dname, 'results': results, 'timings': ptimes[1:] }
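
The `fname[2:-5]` slice used to build `addr` above strips the leading `0x` (2 characters) and the trailing `.json` (5 characters) from a dataset filename; for example:

```python
# Dataset files are named '0x<ADDR>.json'; recover the bare address.
fname = "0x1234abcd.json"
addr = fname[2:-5]  # drop '0x' prefix and '.json' suffix
print(addr)  # 1234abcd
```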

def format_time(val: float) -> str:
return f'{val:.1f}s' if val < 10 else f'{val:.0f}s'
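
The `format_time` helper switches precision at the 10-second boundary; reproducing it here with a couple of worked calls:

```python
# Same helper as above: one decimal place under 10s, whole seconds otherwise.
def format_time(val: float) -> str:
    return f'{val:.1f}s' if val < 10 else f'{val:.0f}s'

print(format_time(3.14159))  # 3.1s
print(format_time(42.6))     # 43s
```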
@@ -232,38 +235,35 @@ def process_arguments(dname: str, providers: list[str], results_dir: str, normal
def process_storage(dname: str, providers: list[str], results_dir: str):
pdata, ptimes = load_data('storage', dname, providers, results_dir)
ret = []
for fname, (_meta, gt) in pdata[0].items():

for fname, (_, ground_truth) in pdata[0].items():
func = []
for gt_slot, gt_type in gt.items():
for gt_slot, gt_type in ground_truth.items():
data = []
for i in range(1, len(providers)): # skip ground_truth provider
vtype = pdata[i][fname][1].get(gt_slot)
if vtype == gt_type:
data.append([1])
else:
data.append([0, vtype])
data.append([1] if vtype == gt_type else [0, vtype])
func.append({'s': gt_slot, 'gt': gt_type, 'data': data})

qwe = set()
for i in range(1, len(providers)):
qwe |= set(pdata[i][fname][1].keys())
all_provider_slots = {
slot for i in range(1, len(providers))
for slot in pdata[i][fname][1].keys()
}
false_positive_slots = sorted(all_provider_slots - set(ground_truth.keys()))

false_positive_slots = sorted(list(qwe - set(pdata[0][fname][1].keys())))
for slot in false_positive_slots:
data = []
for i in range(1, len(providers)): # skip ground_truth provider
vtype = pdata[i][fname][1].get(slot)
if vtype is None:
data.append([1])
else:
data.append([0, vtype])
data.append([1] if vtype is None else [0, vtype])
func.append({'s': slot, 'gt': None, 'data': data})

ret.append({
'addr': fname[2:-5], # '0xFF.json' => 'FF'
'func': func,
})
return {'dataset': dname, 'results': ret, 'timings': ptimes[1:]}

return { 'dataset': dname, 'results': ret, 'timings': ptimes[1:] }
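
In isolation, the slot matching in `process_storage` reduces to a dict comparison: exact type match per ground-truth slot, plus any extra provider slots counted as false positives. A sketch with hypothetical slot layouts (not real dataset entries):

```python
# Hypothetical storage layouts: slot -> declared type.
ground_truth = {"0": "uint256", "1": "address"}
provider = {"0": "uint256", "1": "mapping", "2": "bool"}

# Exact-match check for each ground-truth slot.
matches = {slot: provider.get(slot) == vtype for slot, vtype in ground_truth.items()}

# Slots the provider reports that the ground truth lacks.
false_positive_slots = sorted(set(provider) - set(ground_truth))

print(matches)               # {'0': True, '1': False}
print(false_positive_slots)  # ['2']
```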

def show_arguments_or_mutability(providers: list[str], all_results: list, show_errors: bool):
for dataset_result in all_results:
@@ -303,17 +303,32 @@ def show_arguments_or_mutability(providers: list[str], all_results: list, show_e
parser.add_argument('--show-errors', nargs='?', default=False, const=True, help='show errors')
parser.add_argument('--normalize-args', nargs='+', required=False, choices=['fixed-size-array', 'tuples', 'string-bytes'], help='normalize arguments rules')
cfg = parser.parse_args()

MODE_DEFAULTS = {
'storage': {
'datasets': ['storage3k'],
'providers': ['etherscan', 'evmole-rs', 'smlxl']
},
'selectors': {
'datasets': ['largest1k', 'random50k', 'vyper'],
'providers': ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'whatsabi', 'sevm', 'evm-hound-rs', 'heimdall-rs', 'simple']
},
'arguments': {
'datasets': ['largest1k', 'random50k', 'vyper'],
'providers': ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'heimdall-rs', 'simple']
},
'mutability': {
'datasets': ['largest1k', 'random50k', 'vyper'],
'providers': ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'whatsabi', 'sevm', 'heimdall-rs', 'simple']
},
}

if cfg.datasets is None:
cfg.datasets = ['storage3k'] if cfg.mode == 'storage' else ['largest1k', 'random50k', 'vyper']
cfg.datasets = MODE_DEFAULTS[cfg.mode]['datasets']

if cfg.providers is None:
if cfg.mode == 'selectors':
cfg.providers = ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'whatsabi', 'sevm', 'evm-hound-rs', 'heimdall-rs', 'simple']
elif cfg.mode == 'arguments':
cfg.providers = ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'heimdall-rs', 'simple']
elif cfg.mode == 'mutability':
cfg.providers = ['etherscan', 'evmole-rs', 'evmole-js', 'evmole-py', 'whatsabi', 'sevm', 'heimdall-rs', 'simple']
elif cfg.mode == 'storage':
cfg.providers = ['etherscan', 'evmole-rs', 'smlxl']
cfg.providers = MODE_DEFAULTS[cfg.mode]['providers']

print('Config:')
print('\n'.join(f' {field} = {getattr(cfg, field)}' for field in vars(cfg)), '\n')

2 changes: 1 addition & 1 deletion benchmark/datasets
Submodule datasets updated 1 file
+28 −11 README.md
2 changes: 1 addition & 1 deletion src/interface_js.rs
@@ -107,7 +107,7 @@ const DOC_CONTRACT_INFO: &'static str = r#"
* @param args - Configuration options for the analysis
* @param args.selectors - When true, includes function selectors in the output
* @param args.arguments - When true, includes function arguments information
* @param args.state_mutability - When true, includes state mutability information for functions
* @param args.stateMutability - When true, includes state mutability information for functions
* @param args.storage - When true, includes contract storage layout information
* @param args.disassemble - When true, includes disassembled bytecode
* @returns Analyzed contract information