Merge pull request #98 from IoT-Inspector/refact-processing
Refactor processing, get rid of strategies.py
kissgyorgy authored Dec 7, 2021
2 parents b312846 + 39004cd commit 90057a4
Showing 12 changed files with 248 additions and 169 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/tests.yml
@@ -37,5 +37,4 @@ jobs:
run: git lfs pull

- name: Run pytest
-  # TODO: Dummy coverage (branch: 0%) is needed for the pass rate. Increase this
-  run: poetry run pytest --cov=. --cov-branch --cov-fail-under=0
+  run: poetry run pytest
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -32,6 +32,9 @@ profile = "black"
[tool.poetry.scripts]
unblob = "unblob.cli:main"

+[tool.pytest.ini_options]
+addopts = "--cov=. --cov-branch --cov-fail-under=90"
+
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
73 changes: 73 additions & 0 deletions tests/test_finder.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+
+from unblob.finder import make_handler_map, make_yara_rules, search_yara_patterns
+from unblob.models import Handler
+
+
+class _BaseTestHandler(Handler):
+    def calculate_chunk(self, *args, **kwargs):
+        pass
+
+    @staticmethod
+    def make_extract_command(*args, **kwargs):
+        return []
+
+
+class TestHandler1(_BaseTestHandler):
+    NAME = "handler1"
+    YARA_RULE = r"""
+        strings:
+            $magic = { 21 3C }
+        condition:
+            $magic
+    """
+
+
+class TestHandler2(_BaseTestHandler):
+    NAME = "handler2"
+    YARA_RULE = r"""
+        strings:
+            $tar_magic = { 75 73 74 61 72 }
+        condition:
+            $tar_magic
+    """
+
+
+def test_make_yara_rules():
+    rules = make_yara_rules(tuple([TestHandler1, TestHandler2]))
+    matches = rules.match(data=b"!<        ustar")
+    assert len(matches) == 2
+    assert matches[0].strings == [(0, "$magic", b"!<")]
+    assert matches[1].strings == [(10, "$tar_magic", b"ustar")]
+
+
+def test_search_yara_patterns(tmp_path: Path):
+    handler1 = TestHandler1()
+    handler2 = TestHandler2()
+    rules = make_yara_rules(tuple([TestHandler1, TestHandler2]))
+    handler_map = {"handler1": handler1, "handler2": handler2}
+    test_file = tmp_path / "test_file"
+    test_file.write_bytes(b"!<        ustar")
+    results = search_yara_patterns(rules, handler_map, test_file)
+
+    assert len(results) == 2
+    result1, result2 = results
+
+    assert result1.handler is handler1
+    assert result1.match.strings == [(0, "$magic", b"!<")]
+
+    assert result2.handler is handler2
+    assert result2.match.strings == [(10, "$tar_magic", b"ustar")]
+
+
+def test_make_handler_map():
+    handler_map = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    assert isinstance(handler_map["handler1"], TestHandler1)
+    assert isinstance(handler_map["handler2"], TestHandler2)
+
+
+def test_make_handler_map_instances_are_cached():
+    handler_map1 = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    handler_map2 = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    assert handler_map1["handler1"] is handler_map2["handler1"]
+    assert handler_map1["handler2"] is handler_map2["handler2"]
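
A note on the tuple([...]) wrapping in these tests: make_yara_rules and make_handler_map are decorated with @lru_cache (see unblob/finder.py below), and functools.lru_cache keys its cache on the call arguments, so the arguments must be hashable; a list is not, but a tuple of classes is. A minimal, self-contained sketch of the same caching pattern (toy names, not part of this commit):

from functools import lru_cache


@lru_cache
def make_map(classes: tuple) -> dict:
    # Same tuple argument -> cache hit -> the very same dict object,
    # which is what test_make_handler_map_instances_are_cached relies on
    return {cls.__name__: cls() for cls in classes}


class A:
    pass


assert make_map((A,)) is make_map((A,))
# make_map([A]) would raise TypeError: unhashable type: 'list'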
10 changes: 6 additions & 4 deletions tests/test_handlers.py
@@ -11,10 +11,12 @@
import shlex
import subprocess
from pathlib import Path
+from typing import Type

import pytest

from unblob import handlers
+from unblob.models import Handler
from unblob.processing import DEFAULT_DEPTH, process_file

TEST_DATA_PATH = Path(__file__).parent / "integration"
@@ -68,12 +70,12 @@ def test_all_handlers(input_dir: Path, output_dir: Path, tmp_path: Path):
    "handler",
    (
        pytest.param(handler, id=handler.NAME)
-        for handler_map in handlers._ALL_MODULES_BY_PRIORITY
-        for handler in handler_map.values()
+        for handlers_in_priority in handlers.ALL_HANDLERS_BY_PRIORITY
+        for handler in handlers_in_priority
    ),
)
-def test_missing_handlers_integrations_tests(handler):
-    handler_module_path = Path(inspect.getfile(handler.__class__))
+def test_missing_handlers_integrations_tests(handler: Type[Handler]):
+    handler_module_path = Path(inspect.getfile(handler))
    handler_test_path = handler_module_path.relative_to(
        HANDLERS_PACKAGE_PATH
    ).with_suffix("")
2 changes: 1 addition & 1 deletion tests/test_strategies.py → tests/test_processing.py
@@ -3,7 +3,7 @@
import pytest

from unblob.models import UnknownChunk, ValidChunk
-from unblob.strategies import calculate_unknown_chunks, remove_inner_chunks
+from unblob.processing import calculate_unknown_chunks, remove_inner_chunks


@pytest.mark.parametrize(
4 changes: 2 additions & 2 deletions unblob/cli.py
@@ -6,7 +6,7 @@
import click
from structlog import get_logger

-from .logging import configure_logger
+from .logging import configure_logger, noformat
from .processing import DEFAULT_DEPTH, process_file
from .state import exit_code_var

@@ -37,7 +37,7 @@
@click.option("-v", "--verbose", is_flag=True, help="Verbose mode, enable debug logs.")
def cli(files: Tuple[Path], extract_root: Path, depth: int, verbose: bool):
    configure_logger(verbose, extract_root)
-    logger.info("Start processing files", count=len(files))
+    logger.info("Start processing files", count=noformat(len(files)))
    for path in files:
        root = path if path.is_dir() else path.parent
        process_file(root, path, extract_root, max_depth=depth)
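
The newly imported noformat wraps values passed to the structured logger, here and in unblob/finder.py below. Its implementation is not part of this diff; a hypothetical stand-in (names assumed, not the real unblob.logging code) to convey the likely idea, namely marking a value so the log renderer skips its default formatting:

class NoFormat:
    # Hypothetical marker type; the real unblob.logging.noformat may
    # differ, it is not shown in this commit's diff
    def __init__(self, value):
        self.value = value

    def __str__(self) -> str:
        return str(self.value)


def noformat(value) -> NoFormat:
    return NoFormat(value)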
3 changes: 3 additions & 0 deletions unblob/extractor.py
@@ -1,3 +1,6 @@
+"""
+File extraction related functions.
+"""
import io
import shlex
import subprocess
96 changes: 88 additions & 8 deletions unblob/finder.py
@@ -1,12 +1,21 @@
+"""
+Searching Chunk related functions.
+The main "entry point" is search_chunks_by_priority.
+"""
import io
from functools import lru_cache
+from operator import itemgetter
from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Type

import yara
from structlog import get_logger

-from .handlers import Handler
-from .models import YaraMatchResult
+from .file_utils import LimitedStartReader
+from .handlers import ALL_HANDLERS_BY_PRIORITY
+from .logging import noformat
+from .models import Handler, ValidChunk, YaraMatchResult
+from .state import exit_code_var

logger = get_logger()

@@ -19,8 +28,71 @@
"""


+def search_chunks_by_priority(  # noqa: C901
+    path: Path, file: io.BufferedReader, file_size: int
+) -> List[ValidChunk]:
+    """Search all ValidChunks within the file.
+    Collect all the registered handlers by priority, search for YARA patterns and run
+    Handler.calculate_chunk() on the found matches.
+    We don't deal with offsets within already found ValidChunks and invalid chunks are thrown away.
+    """
+    all_chunks = []
+
+    for priority_level, handler_classes in enumerate(ALL_HANDLERS_BY_PRIORITY, start=1):
+        logger.info("Starting priority level", priority_level=noformat(priority_level))
+        yara_rules = make_yara_rules(handler_classes)
+        handler_map = make_handler_map(handler_classes)
+        yara_results = search_yara_patterns(yara_rules, handler_map, path)
+
+        for result in yara_results:
+            handler, match = result.handler, result.match
+
+            by_offset = itemgetter(0)
+            sorted_match_strings = sorted(match.strings, key=by_offset)
+            for offset, identifier, string_data in sorted_match_strings:
+                real_offset = offset + handler.YARA_MATCH_OFFSET
+
+                # Skip chunk calculation if this would start inside another one,
+                # similar to remove_inner_chunks, but before we even begin calculating.
+                if any(chunk.contains_offset(real_offset) for chunk in all_chunks):
+                    continue
+
+                logger.info(
+                    "Calculating chunk for YARA match",
+                    start_offset=offset,
+                    real_offset=real_offset,
+                    identifier=identifier,
+                )
+
+                limited_reader = LimitedStartReader(file, real_offset)
+                try:
+                    chunk = handler.calculate_chunk(limited_reader, real_offset)
+                except Exception as exc:
+                    exit_code_var.set(1)
+                    logger.error(
+                        "Unhandled Exception during chunk calculation", exc_info=exc
+                    )
+                    continue
+
+                # We found some random bytes this handler couldn't parse
+                if chunk is None:
+                    continue
+
+                if chunk.end_offset > file_size or chunk.start_offset < 0:
+                    exit_code_var.set(1)
+                    logger.error("Chunk overflows file", chunk=chunk)
+                    continue
+
+                chunk.handler = handler
+                logger.info("Found valid chunk", chunk=chunk, handler=handler.NAME)
+                all_chunks.append(chunk)
+
+    return all_chunks


@lru_cache
-def _make_yara_rules(handlers: Tuple[Handler, ...]):
+def make_yara_rules(handlers: Tuple[Type[Handler], ...]):
+    """Make yara.Rule by concatenating all handlers yara rules and compiling them."""
    all_yara_rules = "\n".join(
        _YARA_RULE_TEMPLATE.format(NAME=h.NAME, YARA_RULE=h.YARA_RULE.strip())
        for h in handlers
@@ -30,17 +102,25 @@ def _make_yara_rules(handlers: Tuple[Handler, ...]):
    return compiled_rules


-def search_chunks(
-    handlers: Dict[str, Handler], full_path: Path
+@lru_cache
+def make_handler_map(handler_classes: Tuple[Type[Handler], ...]) -> Dict[str, Handler]:
+    return {h.NAME: h() for h in handler_classes}
+
+
+def search_yara_patterns(
+    yara_rules: yara.Rule, handler_map: Dict[str, Handler], full_path: Path
) -> List[YaraMatchResult]:
-    yara_rules = _make_yara_rules(tuple(handlers.values()))
+    """Search with the compiled YARA rules and identify the handler which defined the rule."""
    # YARA uses a memory mapped file internally when given a path
    yara_matches: List[yara.Match] = yara_rules.match(str(full_path), timeout=60)

    yara_results = []
    for match in yara_matches:
-        handler = handlers[match.rule]
+        handler = handler_map[match.rule]
        yara_res = YaraMatchResult(handler=handler, match=match)
        yara_results.append(yara_res)

+    if yara_results:
+        logger.info("Found YARA results", count=noformat(len(yara_results)))
+
    return yara_results
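
Taken together, the new finder API composes as below. A hypothetical driver based only on the signatures visible in this diff (the real caller is presumably unblob.processing.process_file, which is not shown here):

from pathlib import Path

from unblob.finder import search_chunks_by_priority

path = Path("firmware.bin")  # hypothetical input file
with path.open("rb") as file:
    file_size = path.stat().st_size
    chunks = search_chunks_by_priority(path, file, file_size)
    for chunk in chunks:
        # handler, start_offset and end_offset are set by
        # search_chunks_by_priority, as seen in the function above
        print(chunk.handler.NAME, chunk.start_offset, chunk.end_offset)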
17 changes: 5 additions & 12 deletions unblob/handlers/__init__.py
@@ -1,25 +1,20 @@
-from typing import Dict, List, Type
+from typing import List, Tuple, Type

from ..models import Handler
from .archive import ar, arc, arj, cab, cpio, dmg, rar, sevenzip, tar, zip
from .compression import lzo
from .filesystem import cramfs, fat, iso9660, squashfs, ubi


-def _make_handler_map(*handlers: Type[Handler]) -> Dict[str, Handler]:
-    return {h.NAME: h() for h in handlers}
-
-
-_ALL_MODULES_BY_PRIORITY: List[Dict[str, Handler]] = [
-    _make_handler_map(
+ALL_HANDLERS_BY_PRIORITY: List[Tuple[Type[Handler], ...]] = [
+    (
        cramfs.CramFSHandler,
        fat.FATHandler,
        squashfs.SquashFSv3Handler,
        squashfs.SquashFSv4Handler,
        ubi.UBIHandler,
        ubi.UBIFSHandler,
    ),
-    _make_handler_map(
+    (
        ar.ARHandler,
        arc.ARCHandler,
        arj.ARJHandler,
@@ -35,7 +30,5 @@ def _make_handler_map(*handlers: Type[Handler]) -> Dict[str, Handler]:
        dmg.DMGHandler,
        iso9660.ISO9660FSHandler,
    ),
-    _make_handler_map(
-        lzo.LZOHandler,
-    ),
+    (lzo.LZOHandler,),
]
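
This file is the heart of the refactor: handler instances are no longer built eagerly at import time into name-to-instance dicts; the module now exports plain tuples of handler classes, and instantiation is deferred to the cached make_handler_map() in unblob.finder. A toy before/after illustration (hypothetical Handler1/Handler2, not from the commit):

class Handler1:
    NAME = "h1"


class Handler2:
    NAME = "h2"


# Before: one dict of ready-made instances per priority level
OLD_STYLE = [{"h1": Handler1()}, {"h2": Handler2()}]

# After: plain class tuples; instances are created lazily, and cached,
# by make_handler_map() at search time
NEW_STYLE = [(Handler1,), (Handler2,)]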
10 changes: 3 additions & 7 deletions unblob/handlers/archive/arj.py
@@ -13,17 +13,13 @@ class ARJError(Exception):


class ARJNullFile(ARJError):
-    """Raised for zero-sized ARJ files."""
+    """Zero-sized ARJ."""


class ARJExtendedHeader(ARJError):
    """Main ARJ header contains extended_header, which we don't handle."""


-class ARJFileLengthError(ARJError):
-    """This is a zero-sized ARJ."""
-
-
class ARJHandler(StructHandler):
    NAME = "arj"

@@ -150,11 +146,11 @@ def calculate_chunk(
            self._read_arj_main_header(file, start_offset)
            end_of_arj = self._read_arj_files(file)
        except ARJError as exc:
-            logger.warning("Invalid ARJ file", message=exc.__doc__)
+            logger.warning("Invalid ARJ file", reason=exc.__doc__)
            return
        except EOFError:
            logger.warning(
-                "Invalid ARJ file", message="File ends before ARJ file resolves."
+                "Invalid ARJ file", reason="File ends before ARJ file resolves."
            )
            return

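
The message= to reason= renames above also lean on a small pattern: the ARJ exception classes' docstrings double as the structured log field, which is why the docstrings were tightened in this commit. A minimal sketch of the pattern, assuming only structlog (already used elsewhere in the repo):

import structlog

logger = structlog.get_logger()


class ARJError(Exception):
    pass


class ARJNullFile(ARJError):
    """Zero-sized ARJ."""


def parse(data: bytes) -> None:
    if not data:
        raise ARJNullFile


try:
    parse(b"")
except ARJError as exc:
    # exc.__doc__ is "Zero-sized ARJ."; the class docstring becomes the
    # machine-readable reason field
    logger.warning("Invalid ARJ file", reason=exc.__doc__)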