Merge pull request #98 from IoT-Inspector/refact-processing
Refactor processing, get rid of strategies.py
kissgyorgy authored Dec 7, 2021
2 parents b312846 + 39004cd commit 90057a4
Showing 12 changed files with 248 additions and 169 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/tests.yml
@@ -37,5 +37,4 @@ jobs:
run: git lfs pull

- name: Run pytest
-  # TODO: Dummy coverage (branch: 0%) is needed for the pass rate. Increase this
-  run: poetry run pytest --cov=. --cov-branch --cov-fail-under=0
+  run: poetry run pytest
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -32,6 +32,9 @@ profile = "black"
[tool.poetry.scripts]
unblob = "unblob.cli:main"

+[tool.pytest.ini_options]
+addopts = "--cov=. --cov-branch --cov-fail-under=90"
+
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
73 changes: 73 additions & 0 deletions tests/test_finder.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+
+from unblob.finder import make_handler_map, make_yara_rules, search_yara_patterns
+from unblob.models import Handler
+
+
+class _BaseTestHandler(Handler):
+    def calculate_chunk(self, *args, **kwargs):
+        pass
+
+    @staticmethod
+    def make_extract_command(*args, **kwargs):
+        return []
+
+
+class TestHandler1(_BaseTestHandler):
+    NAME = "handler1"
+    YARA_RULE = r"""
+        strings:
+            $magic = { 21 3C }
+        condition:
+            $magic
+    """
+
+
+class TestHandler2(_BaseTestHandler):
+    NAME = "handler2"
+    YARA_RULE = r"""
+        strings:
+            $tar_magic = { 75 73 74 61 72 }
+        condition:
+            $tar_magic
+    """
+
+
+def test_make_yara_rules():
+    rules = make_yara_rules(tuple([TestHandler1, TestHandler2]))
+    matches = rules.match(data=b"!<        ustar")
+    assert len(matches) == 2
+    assert matches[0].strings == [(0, "$magic", b"!<")]
+    assert matches[1].strings == [(10, "$tar_magic", b"ustar")]
+
+
+def test_search_yara_patterns(tmp_path: Path):
+    handler1 = TestHandler1()
+    handler2 = TestHandler2()
+    rules = make_yara_rules(tuple([TestHandler1, TestHandler2]))
+    handler_map = {"handler1": handler1, "handler2": handler2}
+    test_file = tmp_path / "test_file"
+    test_file.write_bytes(b"!<        ustar")
+    results = search_yara_patterns(rules, handler_map, test_file)
+
+    assert len(results) == 2
+    result1, result2 = results
+
+    assert result1.handler is handler1
+    assert result1.match.strings == [(0, "$magic", b"!<")]
+
+    assert result2.handler is handler2
+    assert result2.match.strings == [(10, "$tar_magic", b"ustar")]
+
+
+def test_make_handler_map():
+    handler_map = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    assert isinstance(handler_map["handler1"], TestHandler1)
+    assert isinstance(handler_map["handler2"], TestHandler2)
+
+
+def test_make_handler_map_instances_are_cached():
+    handler_map1 = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    handler_map2 = make_handler_map(tuple([TestHandler1, TestHandler2]))
+    assert handler_map1["handler1"] is handler_map2["handler1"]
+    assert handler_map1["handler2"] is handler_map2["handler2"]
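
A note on the tuple([...]) wrapping in these tests: make_yara_rules and make_handler_map are decorated with @lru_cache (see unblob/finder.py below), and functools.lru_cache keys its cache on the call arguments, so the arguments must be hashable; a list is not, but a tuple of classes is. A minimal, self-contained sketch of the same caching pattern (toy names, not part of this commit):

from functools import lru_cache


@lru_cache
def make_map(classes: tuple) -> dict:
    # Same tuple argument -> cache hit -> the very same dict object,
    # which is what test_make_handler_map_instances_are_cached relies on
    return {cls.__name__: cls() for cls in classes}


class A:
    pass


assert make_map((A,)) is make_map((A,))
# make_map([A]) would raise TypeError: unhashable type: 'list'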
10 changes: 6 additions & 4 deletions tests/test_handlers.py
@@ -11,10 +11,12 @@
import shlex
import subprocess
from pathlib import Path
+from typing import Type

import pytest

from unblob import handlers
+from unblob.models import Handler
from unblob.processing import DEFAULT_DEPTH, process_file

TEST_DATA_PATH = Path(__file__).parent / "integration"
@@ -68,12 +70,12 @@ def test_all_handlers(input_dir: Path, output_dir: Path, tmp_path: Path):
    "handler",
    (
        pytest.param(handler, id=handler.NAME)
-        for handler_map in handlers._ALL_MODULES_BY_PRIORITY
-        for handler in handler_map.values()
+        for handlers_in_priority in handlers.ALL_HANDLERS_BY_PRIORITY
+        for handler in handlers_in_priority
    ),
)
-def test_missing_handlers_integrations_tests(handler):
-    handler_module_path = Path(inspect.getfile(handler.__class__))
+def test_missing_handlers_integrations_tests(handler: Type[Handler]):
+    handler_module_path = Path(inspect.getfile(handler))
    handler_test_path = handler_module_path.relative_to(
        HANDLERS_PACKAGE_PATH
    ).with_suffix("")
2 changes: 1 addition & 1 deletion tests/test_strategies.py → tests/test_processing.py
@@ -3,7 +3,7 @@
import pytest

from unblob.models import UnknownChunk, ValidChunk
-from unblob.strategies import calculate_unknown_chunks, remove_inner_chunks
+from unblob.processing import calculate_unknown_chunks, remove_inner_chunks


@pytest.mark.parametrize(
4 changes: 2 additions & 2 deletions unblob/cli.py
@@ -6,7 +6,7 @@
import click
from structlog import get_logger

-from .logging import configure_logger
+from .logging import configure_logger, noformat
from .processing import DEFAULT_DEPTH, process_file
from .state import exit_code_var

@@ -37,7 +37,7 @@
@click.option("-v", "--verbose", is_flag=True, help="Verbose mode, enable debug logs.")
def cli(files: Tuple[Path], extract_root: Path, depth: int, verbose: bool):
    configure_logger(verbose, extract_root)
-    logger.info("Start processing files", count=len(files))
+    logger.info("Start processing files", count=noformat(len(files)))
    for path in files:
        root = path if path.is_dir() else path.parent
        process_file(root, path, extract_root, max_depth=depth)
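
The newly imported noformat wraps values passed to the structured logger, here and in unblob/finder.py below. Its implementation is not part of this diff; a hypothetical stand-in (names assumed, not the real unblob.logging code) to convey the likely idea, namely marking a value so the log renderer skips its default formatting:

class NoFormat:
    # Hypothetical marker type; the real unblob.logging.noformat may
    # differ, it is not shown in this commit's diff
    def __init__(self, value):
        self.value = value

    def __str__(self) -> str:
        return str(self.value)


def noformat(value) -> NoFormat:
    return NoFormat(value)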
3 changes: 3 additions & 0 deletions unblob/extractor.py
@@ -1,3 +1,6 @@
+"""
+File extraction related functions.
+"""
import io
import shlex
import subprocess
96 changes: 88 additions & 8 deletions unblob/finder.py
@@ -1,12 +1,21 @@
+"""
+Searching Chunk related functions.
+The main "entry point" is search_chunks_by_priority.
+"""
import io
from functools import lru_cache
+from operator import itemgetter
from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Type

import yara
from structlog import get_logger

-from .handlers import Handler
-from .models import YaraMatchResult
+from .file_utils import LimitedStartReader
+from .handlers import ALL_HANDLERS_BY_PRIORITY
+from .logging import noformat
+from .models import Handler, ValidChunk, YaraMatchResult
+from .state import exit_code_var

logger = get_logger()

@@ -19,8 +28,71 @@
"""


+def search_chunks_by_priority(  # noqa: C901
+    path: Path, file: io.BufferedReader, file_size: int
+) -> List[ValidChunk]:
+    """Search all ValidChunks within the file.
+    Collect all the registered handlers by priority, search for YARA patterns and run
+    Handler.calculate_chunk() on the found matches.
+    We don't deal with offsets within already found ValidChunks and invalid chunks are thrown away.
+    """
+    all_chunks = []
+
+    for priority_level, handler_classes in enumerate(ALL_HANDLERS_BY_PRIORITY, start=1):
+        logger.info("Starting priority level", priority_level=noformat(priority_level))
+        yara_rules = make_yara_rules(handler_classes)
+        handler_map = make_handler_map(handler_classes)
+        yara_results = search_yara_patterns(yara_rules, handler_map, path)
+
+        for result in yara_results:
+            handler, match = result.handler, result.match
+
+            by_offset = itemgetter(0)
+            sorted_match_strings = sorted(match.strings, key=by_offset)
+            for offset, identifier, string_data in sorted_match_strings:
+                real_offset = offset + handler.YARA_MATCH_OFFSET
+
+                # Skip chunk calculation if this would start inside another one,
+                # similar to remove_inner_chunks, but before we even begin calculating.
+                if any(chunk.contains_offset(real_offset) for chunk in all_chunks):
+                    continue
+
+                logger.info(
+                    "Calculating chunk for YARA match",
+                    start_offset=offset,
+                    real_offset=real_offset,
+                    identifier=identifier,
+                )
+
+                limited_reader = LimitedStartReader(file, real_offset)
+                try:
+                    chunk = handler.calculate_chunk(limited_reader, real_offset)
+                except Exception as exc:
+                    exit_code_var.set(1)
+                    logger.error(
+                        "Unhandled Exception during chunk calculation", exc_info=exc
+                    )
+                    continue
+
+                # We found some random bytes this handler couldn't parse
+                if chunk is None:
+                    continue
+
+                if chunk.end_offset > file_size or chunk.start_offset < 0:
+                    exit_code_var.set(1)
+                    logger.error("Chunk overflows file", chunk=chunk)
+                    continue
+
+                chunk.handler = handler
+                logger.info("Found valid chunk", chunk=chunk, handler=handler.NAME)
+                all_chunks.append(chunk)
+
+    return all_chunks


@lru_cache
-def _make_yara_rules(handlers: Tuple[Handler, ...]):
+def make_yara_rules(handlers: Tuple[Type[Handler], ...]):
+    """Make yara.Rule by concatenating all handlers yara rules and compiling them."""
    all_yara_rules = "\n".join(
        _YARA_RULE_TEMPLATE.format(NAME=h.NAME, YARA_RULE=h.YARA_RULE.strip())
        for h in handlers
@@ -30,17 +102,25 @@ def _make_yara_rules(handlers: Tuple[Handler, ...]):
    return compiled_rules


-def search_chunks(
-    handlers: Dict[str, Handler], full_path: Path
+@lru_cache
+def make_handler_map(handler_classes: Tuple[Type[Handler], ...]) -> Dict[str, Handler]:
+    return {h.NAME: h() for h in handler_classes}
+
+
+def search_yara_patterns(
+    yara_rules: yara.Rule, handler_map: Dict[str, Handler], full_path: Path
) -> List[YaraMatchResult]:
-    yara_rules = _make_yara_rules(tuple(handlers.values()))
+    """Search with the compiled YARA rules and identify the handler which defined the rule."""
    # YARA uses a memory mapped file internally when given a path
    yara_matches: List[yara.Match] = yara_rules.match(str(full_path), timeout=60)

    yara_results = []
    for match in yara_matches:
-        handler = handlers[match.rule]
+        handler = handler_map[match.rule]
        yara_res = YaraMatchResult(handler=handler, match=match)
        yara_results.append(yara_res)

+    if yara_results:
+        logger.info("Found YARA results", count=noformat(len(yara_results)))
+
    return yara_results
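
Taken together, the new finder API composes as below. A hypothetical driver based only on the signatures visible in this diff (the real caller is presumably unblob.processing.process_file, which is not shown here):

from pathlib import Path

from unblob.finder import search_chunks_by_priority

path = Path("firmware.bin")  # hypothetical input file
with path.open("rb") as file:
    file_size = path.stat().st_size
    chunks = search_chunks_by_priority(path, file, file_size)
    for chunk in chunks:
        # handler, start_offset and end_offset are set by
        # search_chunks_by_priority, as seen in the function above
        print(chunk.handler.NAME, chunk.start_offset, chunk.end_offset)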
17 changes: 5 additions & 12 deletions unblob/handlers/__init__.py
@@ -1,25 +1,20 @@
-from typing import Dict, List, Type
+from typing import List, Tuple, Type

from ..models import Handler
from .archive import ar, arc, arj, cab, cpio, dmg, rar, sevenzip, tar, zip
from .compression import lzo
from .filesystem import cramfs, fat, iso9660, squashfs, ubi


-def _make_handler_map(*handlers: Type[Handler]) -> Dict[str, Handler]:
-    return {h.NAME: h() for h in handlers}
-
-
-_ALL_MODULES_BY_PRIORITY: List[Dict[str, Handler]] = [
-    _make_handler_map(
+ALL_HANDLERS_BY_PRIORITY: List[Tuple[Type[Handler], ...]] = [
+    (
        cramfs.CramFSHandler,
        fat.FATHandler,
        squashfs.SquashFSv3Handler,
        squashfs.SquashFSv4Handler,
        ubi.UBIHandler,
        ubi.UBIFSHandler,
    ),
-    _make_handler_map(
+    (
        ar.ARHandler,
        arc.ARCHandler,
        arj.ARJHandler,
@@ -35,7 +30,5 @@ def _make_handler_map(*handlers: Type[Handler]) -> Dict[str, Handler]:
        dmg.DMGHandler,
        iso9660.ISO9660FSHandler,
    ),
-    _make_handler_map(
-        lzo.LZOHandler,
-    ),
+    (lzo.LZOHandler,),
]
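
This file is the heart of the refactor: handler instances are no longer built eagerly at import time into name-to-instance dicts; the module now exports plain tuples of handler classes, and instantiation is deferred to the cached make_handler_map() in unblob.finder. A toy before/after illustration (hypothetical Handler1/Handler2, not from the commit):

class Handler1:
    NAME = "h1"


class Handler2:
    NAME = "h2"


# Before: one dict of ready-made instances per priority level
OLD_STYLE = [{"h1": Handler1()}, {"h2": Handler2()}]

# After: plain class tuples; instances are created lazily, and cached,
# by make_handler_map() at search time
NEW_STYLE = [(Handler1,), (Handler2,)]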
10 changes: 3 additions & 7 deletions unblob/handlers/archive/arj.py
@@ -13,17 +13,13 @@ class ARJError(Exception):


class ARJNullFile(ARJError):
-    """Raised for zero-sized ARJ files."""
+    """Zero-sized ARJ."""


class ARJExtendedHeader(ARJError):
    """Main ARJ header contains extended_header, which we don't handle."""


-class ARJFileLengthError(ARJError):
-    """This is a zero-sized ARJ."""
-
-
class ARJHandler(StructHandler):
    NAME = "arj"

@@ -150,11 +146,11 @@ def calculate_chunk(
            self._read_arj_main_header(file, start_offset)
            end_of_arj = self._read_arj_files(file)
        except ARJError as exc:
-            logger.warning("Invalid ARJ file", message=exc.__doc__)
+            logger.warning("Invalid ARJ file", reason=exc.__doc__)
            return
        except EOFError:
            logger.warning(
-                "Invalid ARJ file", message="File ends before ARJ file resolves."
+                "Invalid ARJ file", reason="File ends before ARJ file resolves."
            )
            return

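
The message= to reason= renames above also lean on a small pattern: the ARJ exception classes' docstrings double as the structured log field, which is why the docstrings were tightened in this commit. A minimal sketch of the pattern, assuming only structlog (already used elsewhere in the repo):

import structlog

logger = structlog.get_logger()


class ARJError(Exception):
    pass


class ARJNullFile(ARJError):
    """Zero-sized ARJ."""


def parse(data: bytes) -> None:
    if not data:
        raise ARJNullFile


try:
    parse(b"")
except ARJError as exc:
    # exc.__doc__ is "Zero-sized ARJ."; the class docstring becomes the
    # machine-readable reason field
    logger.warning("Invalid ARJ file", reason=exc.__doc__)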