Skip to content

Commit

Permalink
raw sync lambda minor improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
philerooski committed Sep 27, 2024
1 parent 614c478 commit ab02435
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 27 deletions.
12 changes: 10 additions & 2 deletions src/lambda_function/raw_sync/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def append_s3_key(key: str, key_format: str, result: dict) -> None:
Returns:
None
"""
result = result.copy() # shallow copy safe for append
if not key.endswith("/"): # Ignore keys that represent "folders"
key_components = key.split("/")
if key_format == "raw":
Expand All @@ -87,10 +88,11 @@ def append_s3_key(key: str, key_format: str, result: dict) -> None:
result[data_type][cohort].append(key)
except StopIteration:
# Skip keys that don't match the expected pattern
return
return result
elif key_format == "input" and len(key_components) == 3:
cohort = key_components[1]
result[cohort].append(key)
return result


def list_s3_objects(
Expand Down Expand Up @@ -160,7 +162,7 @@ def list_s3_objects(
for response in response_iterator:
for obj in response.get("Contents", []):
key = obj["Key"]
append_s3_key(
result = append_s3_key(
key=key,
key_format=key_format,
result=result,
Expand Down Expand Up @@ -422,6 +424,12 @@ def list_files_in_archive(
"file_size": zip_info.file_size,
}
file_list.append(file_object)
if len(file_list) == 0:
logger.warning(
f"Did not find any files in s3://{bucket}/{key} which "
"satisfy the conditions needed to be processed by the "
"raw Lambda."
)
return file_list


Expand Down
131 changes: 106 additions & 25 deletions tests/test_lambda_raw_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def setup_list_files_in_archive_s3(
# Object without EOCD signature at all
s3_client.put_object(
Bucket=list_files_in_archive_bucket_name,
Key="no_eocd.zip",
Key="no_eocd.notazip",
Body=b"Random data with no EOCD signature",
)
# Object with central directory not fully contained
Expand All @@ -160,6 +160,18 @@ def setup_list_files_in_archive_s3(
Key="valid.zip",
Body=zip_file_data.read(),
)
# Object with an empty ZIP archive
empty_zip_data = io.BytesIO()
with zipfile.ZipFile(empty_zip_data, "w") as empty_zip:
pass # Create an empty ZIP archive (no files)
empty_zip_data.seek(0)
s3_client.put_object(
Bucket=list_files_in_archive_bucket_name,
Key="empty.zip",
Body=empty_zip_data.read(),
)

return list_files_in_archive_bucket_name


@pytest.fixture
Expand Down Expand Up @@ -287,13 +299,15 @@ def test_list_files_in_archive_missing_eocd_recursion(


def test_list_files_in_archive_no_eocd_returns_empty_list(
    s3_client, setup_list_files_in_archive_s3, caplog
):
    """Test if the function returns an empty list when EOCD is not found at all.

    The object under test is uploaded by the ``setup_list_files_in_archive_s3``
    fixture under the key ``no_eocd.notazip`` with a body that contains no
    EOCD signature. Besides the empty result, the test requires that
    something was logged while the function ran.
    """
    bucket_name = "list-files-in-archive-bucket"
    key = "no_eocd.notazip"
    # Capture log records at ERROR level while the function runs.
    with caplog.at_level("ERROR"):
        result = app.list_files_in_archive(s3_client, bucket_name, key, range_size=16)
    assert result == []
    # A missing EOCD must not pass silently: some log output is required.
    assert caplog.text


@patch("src.lambda_function.raw_sync.app.list_files_in_archive")
Expand Down Expand Up @@ -344,6 +358,23 @@ def test_list_files_in_archive_returns_filenames(
assert "file2.txt" in filenames, "Expected 'file2.txt' to be in the result."


def test_list_files_in_archive_empty_zip(
    s3_client, setup_list_files_in_archive_s3, caplog
):
    """
    Check that a ZIP archive with no members yields an empty file list
    and that log output is produced while listing it.
    """
    # The fixture's return value is used as the bucket name
    archive_bucket = setup_list_files_in_archive_s3
    archive_key = "empty.zip"

    with caplog.at_level("WARNING"):
        listed_files = app.list_files_in_archive(
            s3_client, bucket=archive_bucket, key=archive_key
        )

    assert listed_files == []
    assert caplog.text


def test_append_s3_key_raw():
"""Test append_s3_key with 'raw' format."""
key = "namespace/json/dataset=example_data/cohort=example_cohort/file1.json"
Expand All @@ -354,7 +385,7 @@ def test_append_s3_key_raw():
app.append_s3_key(key, key_format, result)

# Expected result structure after processing the key
expected_result = {"example_data": {"example_cohort": [key]}}
result = expected_result = {"example_data": {"example_cohort": [key]}}

# Assert that the key was correctly added to the result dictionary
assert result == expected_result
Expand All @@ -370,12 +401,35 @@ def test_append_s3_key_input():
app.append_s3_key(key, key_format, result)

# Expected result structure after processing the key
expected_result = {"example_cohort": [key]}
result = expected_result = {"example_cohort": [key]}

# Assert that the key was correctly added to the result dictionary
assert result == expected_result


def test_append_s3_key_stop_iteration():
    """Test that result is unmodified when StopIteration is encountered.

    The key below has no ``dataset=`` / ``cohort=`` components, so it does not
    follow the layout used by the other "raw" keys in these tests.
    """
    import copy  # local import: only this test needs a deep snapshot

    key = "namespace/json/invalid_key_structure"
    key_format = "raw"

    # Initial result dictionary (should remain unchanged)
    result = {
        "data_type_one": {
            "cohort_one": [
                "namespace/json/dataset=data_type_one/cohort=cohort_one/file1.json"
            ]
        }
    }
    # Take a *deep* snapshot. A shallow ``dict.copy()`` would share the nested
    # dict and list with ``result``, so an in-place append to the inner list
    # would change both sides of the comparison and go undetected.
    original_result = copy.deepcopy(result)

    result = app.append_s3_key(key, key_format, result)

    assert (
        result == original_result
    ), "Expected result to remain unmodified on StopIteration."


def test_list_s3_objects_raw_format(s3_client, setup_s3):
"""Test the list_s3_objects function with the "raw" key format."""
bucket_name = "test-raw-bucket"
Expand Down Expand Up @@ -420,19 +474,16 @@ def test_list_s3_objects_input_format(s3_client, setup_s3):
assert result == expected_output, f"Expected {expected_output}, but got {result}"


def test_match_corresponding_raw_object(mocked_raw_keys):
"""Test the match_corresponding_raw_object function."""

# Test parameters for a match
def test_match_corresponding_raw_object_found(mocked_raw_keys):
"""Test when a matching key is found."""
namespace = "namespace"
data_type = "data_type_one"
cohort = "cohort_one"
file_identifier = "object_one"

# Expected matching key
expected_key = (
"namespace/json/dataset=data_type_one/cohort=cohort_one/object_one.ndjson.gz"
)
expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/{file_identifier}.ndjson.gz"

result = app.match_corresponding_raw_object(
data_type=data_type,
cohort=cohort,
Expand All @@ -441,32 +492,62 @@ def test_match_corresponding_raw_object(mocked_raw_keys):
)
assert result == expected_key

# Test for a non-matching scenario
unexpected_key = (
"namespace/json/dataset=data_type_one/cohort=cohort_one/nonexistent_file"
)

def test_match_corresponding_raw_object_non_matching_data_type(mocked_raw_keys):
    """Test when there is no match due to a non-matching data type."""
    namespace = "namespace"
    data_type = "fake_data_type"
    cohort = "cohort_one"
    file_identifier = "object_one"

    # Key to look up. Its data type is presumably absent from the
    # ``mocked_raw_keys`` fixture (fixture body not visible here), so the
    # lookup is expected to find nothing.
    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/{file_identifier}.ndjson.gz"

    result = app.match_corresponding_raw_object(
        data_type=data_type,
        cohort=cohort,
        expected_key=expected_key,
        raw_keys=mocked_raw_keys,
    )
    assert result is None

# Test with different data_type and cohort
data_type = "data_type_two"
cohort = "cohort_one"

def test_match_corresponding_raw_object_non_matching_cohort(mocked_raw_keys):
    """Test when there is no match due to a non-matching cohort."""
    namespace = "namespace"
    data_type = "data_type_one"
    cohort = "fake_cohort"
    file_identifier = "object_four"

    # Key to look up. Its cohort is presumably absent from the
    # ``mocked_raw_keys`` fixture (fixture body not visible here), so the
    # lookup is expected to find nothing.
    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/{file_identifier}.ndjson.gz"

    result = app.match_corresponding_raw_object(
        data_type=data_type,
        cohort=cohort,
        expected_key=expected_key,
        raw_keys=mocked_raw_keys,
    )
    assert result is None


def test_match_corresponding_raw_object_non_matching_file_identifier(mocked_raw_keys):
    """Test when there is no match due to a non-matching file identifier."""
    namespace = "namespace"
    data_type = "data_type_one"
    cohort = "cohort_one"
    file_identifier = "nonexistent_file"

    # Key to look up. Its file identifier is presumably absent from the
    # ``mocked_raw_keys`` fixture (fixture body not visible here), so the
    # lookup is expected to find nothing.
    expected_key = f"{namespace}/json/dataset={data_type}/cohort={cohort}/{file_identifier}.ndjson.gz"

    result = app.match_corresponding_raw_object(
        data_type=data_type,
        cohort=cohort,
        expected_key=expected_key,
        raw_keys=mocked_raw_keys,
    )
    # Only the "no match" outcome is valid here; asserting equality with
    # ``expected_key`` as well would contradict it.
    assert result is None


@mock_sns
Expand Down

0 comments on commit ab02435

Please sign in to comment.