Skip to content

Commit

Permalink
Add more test cases for cautious-robot (#20)
Browse files Browse the repository at this point in the history
* Add more test cases for cautious-robot

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_download_images.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update test_buddycheck.py: 1. Add another BuddyCheck instance with buddy_id initialized. 2. Remove the leading whitespace from the contents written to self.img_source_file and self.checksum_source_file in setUp. 3. Update missing_imgs to retain its original index.

* update test_logging

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* remove test_check_alignment_case_insensitive_columns

* Replace the retry template with a loop over retry codes to handle both scenarios: success after retry and failure after retry

* Update test_downsampled_image_creation

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_buddycheck.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* 1. Update test_success_after_retries and test_failure_after_retries 2. Add testcases test_successful_download_with_subfolder and test_downsampled_image_creation_with_subfolder

* fix downsize read bug

* Update test_downsampled_image_creation(_with_subfolder)

* Updated test_download_images.py to make it run faster

* Update tests/test_download_images.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* Update tests/test_download_images.py

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>

* "Add sample images from Rare-Species dataset (8511cb36-ea18-419a-b938-e6316e1855d4 &  94c53a1f-6bd9-469f-85b9-b2ce93e90c21)
https://huggingface.co/datasets/imageomics/rare-species, Original source Smithsonian Institution, NMNH,
image1 from Invertebrate Zoology, CC-0 by Patricia Perez,
image2 from Entomology, CC-0 by Pixel Acuity LLC"

* Custom error when validating empty input df

Switch test from the merge method to the custom error in the validate method

---------

Co-authored-by: Elizabeth Campolongo <38985481+egrace479@users.noreply.github.com>
Co-authored-by: egrace479 <e.campolongo479@gmail.com>
Co-authored-by: Matthew Thompson <thompson.4509@osu.edu>
  • Loading branch information
4 people authored Jul 30, 2024
1 parent ee4e892 commit 2c986a2
Show file tree
Hide file tree
Showing 7 changed files with 479 additions and 5 deletions.
15 changes: 11 additions & 4 deletions src/cautiousrobot/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,17 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename =
if os.path.exists(downsample_dir_path) != True:
os.makedirs(downsample_dir_path, exist_ok=False)
# Downsample & save image
byte_data = io.BytesIO(response.content)
img = Image.open(byte_data)
#img.save(dest_path)
img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name)
try:
img = Image.open(f"{image_dir_path}/{image_name}")
img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name)
except Exception as e:
print(e)
log_errors = log_response(log_errors,
index = i,
image = "downsized_" + image_name,
url = url,
response_code = str(e))
update_log(log = log_errors, index = i, filepath = error_log_filepath)

# check for too many requests
elif response.status_code in REDO_CODE_LIST:
Expand Down
8 changes: 7 additions & 1 deletion src/cautiousrobot/buddy_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
from cautiousrobot.exceptions import EmptyDataFrameError


class BuddyCheck:
Expand Down Expand Up @@ -74,7 +75,12 @@ def validate_download(self, source_df, checksum_df, source_id_col = "filename",
Returns:
missing_imgs - DataFrame. Subset of source_df that didn't match checksum_df, None if all match.
'''


if source_df.empty:
raise EmptyDataFrameError("source_df")
if checksum_df.empty:
raise EmptyDataFrameError("checksum_df")

if self.buddy_id is None:
check_type = "checksums"
merged_df = self.merge_on_checksum(source_df, checksum_df, source_validation_col)
Expand Down
4 changes: 4 additions & 0 deletions src/cautiousrobot/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class EmptyDataFrameError(Exception):
    """Error raised when an input DataFrame has no rows.

    The exception message has the form
    ``"Input DataFrame <df_name> is empty."``.

    Args:
        df_name: Name of the empty DataFrame; interpolated into the message.

    Attributes:
        df_name: The name passed at construction, kept so handlers can tell
            which input was empty without parsing the message text.
    """

    def __init__(self, df_name):
        self.df_name = df_name
        super().__init__(f"Input DataFrame {df_name} is empty.")
148 changes: 148 additions & 0 deletions tests/test_buddycheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import unittest
from unittest.mock import patch
import pandas as pd
import tempfile
import os
from cautiousrobot import BuddyCheck
from cautiousrobot.utils import process_csv
from cautiousrobot.exceptions import EmptyDataFrameError

class TestBuddyCheck(unittest.TestCase):
    """Tests for BuddyCheck merging, alignment checking, and download validation."""

    def setUp(self):
        """Build the BuddyCheck instances and write the two temporary CSV fixtures."""
        self.buddy_check = BuddyCheck()
        self.buddy_check_filename = BuddyCheck(buddy_id='filename')
        self.buddy_check_id_col = BuddyCheck(buddy_id = "filename", buddy_col = "sha256")

        # delete=False keeps each file on disk after it is closed so the tests
        # can re-open it by name; tearDown removes both files afterwards.
        # The CSV text is deliberately flush-left: indentation inside the
        # literal would become part of the file contents.
        with tempfile.NamedTemporaryFile(delete=False, mode='w') as img_file:
            self.img_source_file = img_file
            img_file.write("""filename,checksum
image1.jpg,abc123
image2.jpg,def456
image3.jpg,ghi789
""")

        with tempfile.NamedTemporaryFile(delete=False, mode='w') as checksum_file:
            self.checksum_source_file = checksum_file
            checksum_file.write("""filename,md5
image1.jpg,abc123
image2.jpg,def456
image3.jpg,ghi789
""")

    def tearDown(self):
        """Delete the temporary CSV fixtures created in setUp."""
        os.remove(self.img_source_file.name)
        os.remove(self.checksum_source_file.name)

    def test_initialization(self):
        """Constructor arguments (or their defaults) land on buddy_id / buddy_col."""
        self.assertIsNone(self.buddy_check.buddy_id)
        self.assertEqual(self.buddy_check.buddy_col, 'md5')
        self.assertEqual(self.buddy_check_id_col.buddy_id, 'filename')
        self.assertEqual(self.buddy_check_id_col.buddy_col, 'sha256')

    def test_merge_on_checksum(self):
        """merge_on_checksum yields suffixed filename columns plus both checksum columns."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.read_csv(self.checksum_source_file.name)

        merged_df = self.buddy_check_filename.merge_on_checksum(source_df, checksum_df, 'checksum')
        expected_df = pd.DataFrame({
            'filename_x': ['image1.jpg', 'image2.jpg', 'image3.jpg'],
            'checksum': ['abc123', 'def456', 'ghi789'],
            'filename_y': ['image1.jpg', 'image2.jpg', 'image3.jpg'],
            'md5': ['abc123', 'def456', 'ghi789'],
        })
        pd.testing.assert_frame_equal(merged_df, expected_df)

    def test_merge_on_filename_checksum(self):
        """merge_on_filename_checksum yields one filename column plus both checksum columns."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        merged_df = self.buddy_check_filename.merge_on_filename_checksum(
            source_df, checksum_df, 'filename', 'checksum'
        )
        expected_df = pd.DataFrame({
            'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg'],
            'checksum': ['abc123', 'def456', 'ghi789'],
            'md5': ['abc123', 'def456', 'ghi789'],
        })
        pd.testing.assert_frame_equal(merged_df, expected_df)

    def test_check_alignment_all_matching(self):
        """check_alignment returns None when every source row is in the merge."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        merged_df = self.buddy_check_filename.merge_on_filename_checksum(
            source_df, checksum_df, 'filename', 'checksum'
        )
        missing_imgs = self.buddy_check_filename.check_alignment(source_df, merged_df)
        self.assertIsNone(missing_imgs)

    def test_check_alignment_some_missing(self):
        """check_alignment returns the one source row absent from the checksum data."""
        source_df = pd.DataFrame({
            'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg', 'image4.jpg'],
            'checksum': ['abc123', 'def456', 'ghi789', 'jkl012'],
        })
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        merged_df = self.buddy_check_filename.merge_on_filename_checksum(
            source_df, checksum_df, 'filename', 'checksum'
        )
        missing_imgs = self.buddy_check_filename.check_alignment(source_df, merged_df)
        expected_missing_imgs = pd.DataFrame({
            'filename': ['image4.jpg'],
            'checksum': ['jkl012'],
        })
        # The result's index is reset before comparing so that only the row
        # contents are checked against the freshly built expected frame.
        pd.testing.assert_frame_equal(missing_imgs.reset_index(drop=True), expected_missing_imgs)

    def test_validate_download_success(self):
        """validate_download returns None when all checksums match (no buddy_id set)."""
        missing_imgs = self.buddy_check.validate_download(
            source_df=pd.read_csv(self.img_source_file.name),
            checksum_df=pd.read_csv(self.checksum_source_file.name),
            source_id_col="filename",
            source_validation_col="checksum",
        )
        self.assertIsNone(missing_imgs)

    def test_validate_download_missing_images(self):
        """validate_download returns the source row that has no checksum match."""
        source_df = pd.DataFrame({
            'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg', 'image4.jpg'],
            'checksum': ['abc123', 'def456', 'ghi789', 'jkl012'],
        })
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        missing_imgs = self.buddy_check_filename.validate_download(
            source_df=source_df,
            checksum_df=checksum_df,
            source_id_col="filename",
            source_validation_col="checksum",
        )
        expected_missing_imgs = pd.DataFrame({
            'filename': ['image4.jpg'],
            'checksum': ['jkl012'],
        })
        pd.testing.assert_frame_equal(missing_imgs.reset_index(drop=True), expected_missing_imgs)

    def test_check_alignment_no_matching(self):
        """check_alignment reports all three source rows when nothing matches."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.DataFrame({
            'filename': ['image4.jpg', 'image5.jpg', 'image6.jpg'],
            'md5': ['xyz123', 'uvw456', 'rst789'],
        })
        merged_df = self.buddy_check_filename.merge_on_filename_checksum(
            source_df, checksum_df, 'filename', 'checksum'
        )
        missing_imgs = self.buddy_check_filename.check_alignment(source_df, merged_df)
        self.assertIsNotNone(missing_imgs)
        self.assertEqual(missing_imgs.shape[0], 3)

    def test_check_alignment_checksums_only(self):
        """check_alignment returns None for a checksum-only merge of matching data."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        merged_df = self.buddy_check.merge_on_checksum(source_df, checksum_df, 'checksum')
        missing_imgs = self.buddy_check.check_alignment(source_df, merged_df)
        self.assertIsNone(missing_imgs)

    def test_validate_download_empty_img_df(self):
        """validate_download raises EmptyDataFrameError for an empty source DataFrame."""
        source_df = pd.DataFrame(columns=['filename', 'checksum'])
        checksum_df = pd.read_csv(self.checksum_source_file.name)
        with self.assertRaises(EmptyDataFrameError):
            self.buddy_check.validate_download(source_df, checksum_df, 'filename', 'checksum')

    def test_validate_download_empty_checksum_df(self):
        """validate_download raises EmptyDataFrameError for an empty checksum DataFrame."""
        source_df = pd.read_csv(self.img_source_file.name)
        checksum_df = pd.DataFrame(columns=['filename', 'md5'])
        with self.assertRaises(EmptyDataFrameError):
            self.buddy_check_filename.validate_download(source_df, checksum_df, 'filename', 'checksum')


# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Loading

0 comments on commit 2c986a2

Please sign in to comment.