diff --git a/src/cautiousrobot/__main__.py b/src/cautiousrobot/__main__.py index c07cfc7..7402dcc 100644 --- a/src/cautiousrobot/__main__.py +++ b/src/cautiousrobot/__main__.py @@ -145,10 +145,17 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename = if os.path.exists(downsample_dir_path) != True: os.makedirs(downsample_dir_path, exist_ok=False) # Downsample & save image - byte_data = io.BytesIO(response.content) - img = Image.open(byte_data) - #img.save(dest_path) - img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name) + try: + img = Image.open(f"{image_dir_path}/{image_name}") + img.resize((downsample, downsample)).save(downsample_dir_path + "/" + image_name) + except Exception as e: + print(e) + log_errors = log_response(log_errors, + index = i, + image = "downsized_" + image_name, + url = url, + response_code = str(e)) + update_log(log = log_errors, index = i, filepath = error_log_filepath) # check for too many requests elif response.status_code in REDO_CODE_LIST: diff --git a/src/cautiousrobot/buddy_check.py b/src/cautiousrobot/buddy_check.py index a4dca22..cb522e6 100644 --- a/src/cautiousrobot/buddy_check.py +++ b/src/cautiousrobot/buddy_check.py @@ -1,4 +1,5 @@ import pandas as pd +from cautiousrobot.exceptions import EmptyDataFrameError class BuddyCheck: @@ -74,7 +75,12 @@ def validate_download(self, source_df, checksum_df, source_id_col = "filename", Returns: missing_imgs - DataFrame. Subset of source_df that didn't match checksum_df, None if all match. 
''' - + + if source_df.empty: + raise EmptyDataFrameError("source_df") + if checksum_df.empty: + raise EmptyDataFrameError("checksum_df") + if self.buddy_id is None: check_type = "checksums" merged_df = self.merge_on_checksum(source_df, checksum_df, source_validation_col) diff --git a/src/cautiousrobot/exceptions.py b/src/cautiousrobot/exceptions.py new file mode 100644 index 0000000..aef3983 --- /dev/null +++ b/src/cautiousrobot/exceptions.py @@ -0,0 +1,4 @@ +class EmptyDataFrameError(Exception): + def __init__(self, df_name): + message = f"Input DataFrame {df_name} is empty." + super().__init__(message) diff --git a/tests/test_buddycheck.py b/tests/test_buddycheck.py new file mode 100644 index 0000000..8cbb436 --- /dev/null +++ b/tests/test_buddycheck.py @@ -0,0 +1,148 @@ +import unittest +from unittest.mock import patch +import pandas as pd +import tempfile +import os +from cautiousrobot import BuddyCheck +from cautiousrobot.utils import process_csv +from cautiousrobot.exceptions import EmptyDataFrameError + +class TestBuddyCheck(unittest.TestCase): + def setUp(self): + self.buddy_check = BuddyCheck() + self.buddy_check_filename = BuddyCheck(buddy_id='filename') + self.buddy_check_id_col = BuddyCheck(buddy_id = "filename", buddy_col = "sha256") + + self.img_source_file = tempfile.NamedTemporaryFile(delete=False, mode='w') + self.checksum_source_file = tempfile.NamedTemporaryFile(delete=False, mode='w') + + self.img_source_file.write("""filename,checksum +image1.jpg,abc123 +image2.jpg,def456 +image3.jpg,ghi789 +""") + self.img_source_file.close() + + self.checksum_source_file.write("""filename,md5 +image1.jpg,abc123 +image2.jpg,def456 +image3.jpg,ghi789 +""") + self.checksum_source_file.close() + + + def tearDown(self): + os.remove(self.img_source_file.name) + os.remove(self.checksum_source_file.name) + + def test_initialization(self): + self.assertEqual(self.buddy_check.buddy_id, None) + self.assertEqual(self.buddy_check.buddy_col, 'md5') + 
self.assertEqual(self.buddy_check_id_col.buddy_id, 'filename') + self.assertEqual(self.buddy_check_id_col.buddy_col, 'sha256') + + def test_merge_on_checksum(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.read_csv(self.checksum_source_file.name) + + merged_df = self.buddy_check_filename.merge_on_checksum(source_df, checksum_df, 'checksum') + expected_df = pd.DataFrame({ + 'filename_x': ['image1.jpg', 'image2.jpg', 'image3.jpg'], + 'checksum': ['abc123', 'def456', 'ghi789'], + 'filename_y': ['image1.jpg', 'image2.jpg', 'image3.jpg'], + 'md5': ['abc123', 'def456', 'ghi789'] + }) + pd.testing.assert_frame_equal(merged_df, expected_df) + + def test_merge_on_filename_checksum(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.read_csv(self.checksum_source_file.name) + merged_df = self.buddy_check_filename.merge_on_filename_checksum(source_df, checksum_df, 'filename', 'checksum') + expected_df = pd.DataFrame({ + 'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg'], + 'checksum': ['abc123', 'def456', 'ghi789'], + 'md5': ['abc123', 'def456', 'ghi789'] + }) + pd.testing.assert_frame_equal(merged_df, expected_df) + + def test_check_alignment_all_matching(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.read_csv(self.checksum_source_file.name) + merged_df = self.buddy_check_filename.merge_on_filename_checksum(source_df, checksum_df, 'filename', 'checksum') + missing_imgs = self.buddy_check_filename.check_alignment(source_df, merged_df) + self.assertIsNone(missing_imgs) + + def test_check_alignment_some_missing(self): + source_df = pd.DataFrame({ + 'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg', 'image4.jpg'], + 'checksum': ['abc123', 'def456', 'ghi789', 'jkl012'] + }) + checksum_df = pd.read_csv(self.checksum_source_file.name) + merged_df = self.buddy_check_filename.merge_on_filename_checksum(source_df, checksum_df, 'filename', 'checksum') + missing_imgs = 
self.buddy_check_filename.check_alignment(source_df, merged_df) + expected_missing_imgs = pd.DataFrame({ + 'filename': ['image4.jpg'], + 'checksum': ['jkl012'] + }) + pd.testing.assert_frame_equal(missing_imgs.reset_index(drop=True), expected_missing_imgs) + + def test_validate_download_success(self): + missing_imgs = self.buddy_check.validate_download( + source_df=pd.read_csv(self.img_source_file.name), + checksum_df=pd.read_csv(self.checksum_source_file.name), + source_id_col="filename", + source_validation_col="checksum" + ) + self.assertIsNone(missing_imgs) + + def test_validate_download_missing_images(self): + source_df = pd.DataFrame({ + 'filename': ['image1.jpg', 'image2.jpg', 'image3.jpg', 'image4.jpg'], + 'checksum': ['abc123', 'def456', 'ghi789', 'jkl012'] + }) + checksum_df = pd.read_csv(self.checksum_source_file.name) + missing_imgs = self.buddy_check_filename.validate_download( + source_df=source_df, + checksum_df=checksum_df, + source_id_col="filename", + source_validation_col="checksum" + ) + expected_missing_imgs = pd.DataFrame({ + 'filename': ['image4.jpg'], + 'checksum': ['jkl012'] + }) + pd.testing.assert_frame_equal(missing_imgs.reset_index(drop=True), expected_missing_imgs) + + def test_check_alignment_no_matching(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.DataFrame({ + 'filename': ['image4.jpg', 'image5.jpg', 'image6.jpg'], + 'md5': ['xyz123', 'uvw456', 'rst789'] + }) + merged_df = self.buddy_check_filename.merge_on_filename_checksum(source_df, checksum_df, 'filename', 'checksum') + missing_imgs = self.buddy_check_filename.check_alignment(source_df, merged_df) + self.assertIsNotNone(missing_imgs) + self.assertEqual(missing_imgs.shape[0], 3) + + def test_check_alignment_checksums_only(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.read_csv(self.checksum_source_file.name) + merged_df = self.buddy_check.merge_on_checksum(source_df, checksum_df, 'checksum') + missing_imgs = 
self.buddy_check.check_alignment(source_df, merged_df) + self.assertIsNone(missing_imgs) + + def test_validate_download_empty_img_df(self): + source_df = pd.DataFrame(columns=['filename', 'checksum']) + checksum_df = pd.read_csv(self.checksum_source_file.name) + with self.assertRaises(EmptyDataFrameError): + missing_imgs = self.buddy_check.validate_download(source_df, checksum_df, 'filename', 'checksum') + + def test_validate_download_empty_checksum_df(self): + source_df = pd.read_csv(self.img_source_file.name) + checksum_df = pd.DataFrame(columns=['filename', 'md5']) + with self.assertRaises(EmptyDataFrameError): + missing_imgs = self.buddy_check_filename.validate_download(source_df, checksum_df, 'filename', 'checksum') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_download_images.py b/tests/test_download_images.py new file mode 100644 index 0000000..fd6c9a0 --- /dev/null +++ b/tests/test_download_images.py @@ -0,0 +1,309 @@ +import unittest +from unittest.mock import patch, MagicMock, mock_open, call +import pandas as pd +import os +import io +import shutil +from io import BytesIO +from PIL import Image +import requests +import base64 +from cautiousrobot.__main__ import download_images, main +from http.server import HTTPServer, SimpleHTTPRequestHandler +import threading + +TESTDATA_DIR = os.path.join(os.path.dirname(__file__), 'testdata') + +class CustomHTTPRequestHandler(SimpleHTTPRequestHandler): + def translate_path(self, path): + return os.path.join(TESTDATA_DIR, os.path.relpath(path, '/')) + +class TestDownload(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.httpd = HTTPServer(('localhost', 9201), CustomHTTPRequestHandler) + cls.server_thread = threading.Thread(target=cls.httpd.serve_forever) + cls.server_thread.start() + print(f"Serving {TESTDATA_DIR} on http://localhost:9201") + + @classmethod + def tearDownClass(cls): + cls.httpd.shutdown() + cls.server_thread.join() + + + def setUp(self): + self.DUMMY_DATA = 
pd.DataFrame(data={ + "filename": ["test_file1.jpg", "test_file2.jpg"], + "file_url": ["http://localhost:9201/images/image1.jpg", "http://localhost:9201/images/image2.png"], + "subfolder": ["test_subfolder1", "test_subfolder2"] + }) + self.IMG_DIR = "test_dir" + self.LOG_FILEPATH = "test_log_path.jsonl" + self.ERROR_LOG_FILEPATH = "test_error_log_path.jsonl" + self.DOWNSAMPLE_DIR = self.IMG_DIR + "_downsized" + self.DOWNSAMPLE_SIZE = 100 + + os.makedirs(self.IMG_DIR, exist_ok=True) + os.makedirs(self.DOWNSAMPLE_DIR, exist_ok=True) + for subfolder in self.DUMMY_DATA["subfolder"]: + os.makedirs(os.path.join(self.DOWNSAMPLE_DIR, subfolder), exist_ok=True) + + def tearDown(self): + shutil.rmtree(self.IMG_DIR, ignore_errors=True) + shutil.rmtree(self.DOWNSAMPLE_DIR, ignore_errors=True) + if os.path.exists(self.LOG_FILEPATH): + os.remove(self.LOG_FILEPATH) + if os.path.exists(self.ERROR_LOG_FILEPATH): + os.remove(self.ERROR_LOG_FILEPATH) + + @patch('requests.get') + def test_response_exception(self, get_mock): + get_mock.side_effect = requests.exceptions.RequestException + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH) + for filename in self.DUMMY_DATA['filename']: + self.assertFalse(os.path.isfile(f"{self.IMG_DIR}/{filename}")) + + @patch('requests.get') + def test_successful_download(self, get_mock): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raw = BytesIO(b"fake_image_data") + get_mock.return_value = mock_response + + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH) + + for filename in self.DUMMY_DATA['filename']: + self.assertTrue(os.path.isfile(f"{self.IMG_DIR}/{filename}")) + + @patch('requests.get') + def test_successful_download_with_subfolder(self, get_mock): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raw = BytesIO(b"fake_image_data") + get_mock.return_value = mock_response + + 
download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH, subfolders="subfolder") + + for i, filename in enumerate(self.DUMMY_DATA['filename']): + subfolder = self.DUMMY_DATA['subfolder'][i] + self.assertTrue(os.path.isfile(f"{self.IMG_DIR}/{subfolder}/{filename}")) + + + @patch('requests.get') + @patch('time.sleep', return_value=None) + def test_success_after_retries(self,sleep_mock, get_mock): + retry_status_codes = [429, 500, 502, 503, 504] + for status_code in retry_status_codes: + with self.subTest(status_code=status_code): + mock_response_retry = MagicMock() + mock_response_retry.status_code = status_code + mock_response_success = MagicMock() + mock_response_success.status_code = 200 + mock_response_success.raw = BytesIO(b"fake_image_data") + get_mock.side_effect = [ + mock_response_retry, mock_response_retry, mock_response_success, # For test_file1.jpg + mock_response_retry, mock_response_retry, mock_response_success # For test_file2.jpg + ] + + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH, retry=3) + + for filename in self.DUMMY_DATA['filename']: + self.assertTrue(os.path.isfile(f"{self.IMG_DIR}/{filename}")) + + @patch('requests.get') + @patch('time.sleep', return_value=None) + def test_failure_after_retries(self, sleep_mock,get_mock): + retry_status_codes = [429, 500, 502, 503, 504] + for status_code in retry_status_codes: + with self.subTest(status_code=status_code): + mock_response_retry = MagicMock() + mock_response_retry.status_code = status_code + get_mock.side_effect = [ + mock_response_retry, mock_response_retry, mock_response_retry, mock_response_retry, mock_response_retry, # For test_file1.jpg + mock_response_retry, mock_response_retry, mock_response_retry, mock_response_retry, mock_response_retry # For test_file2.jpg + ] + + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH, retry=5) + + for filename in 
self.DUMMY_DATA['filename']: + self.assertFalse(os.path.isfile(f"{self.IMG_DIR}/{filename}")) + + def test_downsampled_image_creation(self): + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH, + downsample_path=self.DOWNSAMPLE_DIR, downsample=self.DOWNSAMPLE_SIZE) + + for filename in self.DUMMY_DATA['filename']: + downsampled_path = os.path.join(self.DOWNSAMPLE_DIR, filename) + print(f"Checking existence of downsampled image: {downsampled_path}") + self.assertTrue(os.path.isfile(f"{self.DOWNSAMPLE_DIR}/{filename}")) + + def test_downsampled_image_creation_with_subfolder(self): + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH, + downsample_path=self.DOWNSAMPLE_DIR, downsample=self.DOWNSAMPLE_SIZE, subfolders="subfolder") + + for i, filename in enumerate(self.DUMMY_DATA['filename']): + subfolder = self.DUMMY_DATA['subfolder'][i] + self.assertTrue(os.path.isfile(f"{self.DOWNSAMPLE_DIR}/{subfolder}/{filename}")) + + @patch('requests.get') + def test_logging(self, get_mock): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.raw = BytesIO(b"fake_image_data") + get_mock.return_value = mock_response + + download_images(self.DUMMY_DATA, self.IMG_DIR, self.LOG_FILEPATH, self.ERROR_LOG_FILEPATH) + + self.assertTrue(os.path.isfile(self.LOG_FILEPATH)) + self.assertFalse(os.path.isfile(self.ERROR_LOG_FILEPATH)) + +class TestMainFunction(unittest.TestCase): + @patch('cautiousrobot.__main__.parse_args') + @patch('cautiousrobot.__main__.process_csv') + @patch('cautiousrobot.__main__.download_images') + @patch('cautiousrobot.__main__.get_checksums') + @patch('cautiousrobot.__main__.BuddyCheck') + @patch('os.path.exists') + @patch('builtins.input', return_value='y') + def test_main_successful_execution(self, mock_input, mock_exists, mock_BuddyCheck, mock_get_checksums, mock_download_images, mock_process_csv, mock_parse_args): + mock_args = MagicMock() + 
mock_args.input_file = 'test.csv' + mock_args.img_name_col = 'filename_col' + mock_args.url_col = 'url_col' + mock_args.subdir_col = None + mock_args.output_dir = 'output_dir' + mock_args.side_length = None + mock_args.wait_time = 0 + mock_args.max_retries = 3 + mock_args.starting_idx = 0 + mock_args.checksum_algorithm = 'md5' + mock_args.verifier_col = None + + mock_parse_args.return_value = mock_args + mock_exists.return_value = False + + mock_data = pd.DataFrame({ + 'filename_col': ['file1', 'file2', 'file3', 'file4'], + 'url_col': ['url1', 'url2', 'url3', 'url4'] + }) + + mock_process_csv.return_value = mock_data + + try: + main() + except SystemExit as e: + self.fail(f"main() raised SystemExit unexpectedly: {e}") + + @patch('cautiousrobot.__main__.parse_args') + def test_main_csv_extension_error(self, mock_parse_args): + mock_args = MagicMock() + mock_args.input_file = 'test.txt' + mock_parse_args.return_value = mock_args + + with self.assertRaises(SystemExit) as cm: + main() + + self.assertEqual(cm.exception.code, "Expected CSV for input file; extension should be '.csv'") + + @patch('cautiousrobot.__main__.parse_args') + @patch('cautiousrobot.__main__.process_csv') + def test_main_missing_columns_error(self, mock_process_csv, mock_parse_args): + mock_args = MagicMock() + mock_args.input_file = 'test.csv' + mock_args.img_name_col = 'filename_col' + mock_args.url_col = 'url_col' + mock_args.subdir_col = None + mock_parse_args.return_value = mock_args + + mock_process_csv.side_effect = Exception("Missing required columns") + + with self.assertRaises(SystemExit) as cm: + main() + + self.assertEqual(cm.exception.code, "Missing required columns Please adjust inputs and try again.") + + @patch('cautiousrobot.__main__.parse_args') + @patch('cautiousrobot.__main__.process_csv') + def test_main_non_unique_filenames(self, mock_process_csv, mock_parse_args): + mock_args = MagicMock() + mock_args.input_file = 'test.csv' + mock_args.img_name_col = 'filename_col' + 
mock_args.url_col = 'url_col' + mock_args.subdir_col = None + mock_parse_args.return_value = mock_args + + mock_data = pd.DataFrame({ + 'filename_col': ['file1', 'file2', 'file1', 'file4'], + 'url_col': ['url1', 'url2', 'url3', 'url4'] + }) + + mock_process_csv.return_value = mock_data + + with self.assertRaises(SystemExit) as cm: + main() + + self.assertEqual( + str(cm.exception), + "filename_col is not a unique identifier for this dataset, please choose a column with unique values for filenames." + ) + + @patch('cautiousrobot.__main__.parse_args') + @patch('cautiousrobot.__main__.process_csv') + @patch('builtins.input', return_value='n') + def test_main_missing_filenames(self, mock_input, mock_process_csv, mock_parse_args): + mock_args = MagicMock() + mock_args.input_file = 'test.csv' + mock_args.img_name_col = 'filename_col' + mock_args.url_col = 'url_col' + mock_args.subdir_col = None + mock_parse_args.return_value = mock_args + + mock_data = pd.DataFrame({ + 'filename_col': [None, None, 'file2', 'file3'], + 'url_col': ['url1', 'url2', 'url3', 'url4'] + }) + + mock_process_csv.return_value = mock_data + + with self.assertRaises(SystemExit) as cm: + main() + + self.assertEqual(cm.exception.code, "Exited without executing.") + + @patch('cautiousrobot.__main__.parse_args') + @patch('cautiousrobot.__main__.process_csv') + @patch('builtins.input', return_value='n') + @patch('os.path.exists', return_value=True) + def test_main_directory_exists(self, mock_exists, mock_input, mock_process_csv, mock_parse_args): + mock_args = MagicMock() + mock_args.input_file = 'test.csv' + mock_args.img_name_col = 'filename_col' + mock_args.url_col = 'url_col' + mock_args.subdir_col = None + mock_args.output_dir = 'output_dir' + mock_args.side_length = None + mock_args.wait_time = 0 + mock_args.max_retries = 3 + mock_args.starting_idx = 0 + mock_args.checksum_algorithm = 'md5' + mock_args.verifier_col = None + + mock_parse_args.return_value = mock_args + + mock_data = pd.DataFrame({ + 
'filename_col': ['file1', 'file2', 'file3', 'file4'], + 'url_col': ['url1', 'url2', 'url3', 'url4'] + }) + + mock_process_csv.return_value = mock_data + + with self.assertRaises(SystemExit) as cm: + main() + + self.assertEqual(cm.exception.code, "Exited without executing.") + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/testdata/images/image1.jpg b/tests/testdata/images/image1.jpg new file mode 100644 index 0000000..8c15955 Binary files /dev/null and b/tests/testdata/images/image1.jpg differ diff --git a/tests/testdata/images/image2.png b/tests/testdata/images/image2.png new file mode 100644 index 0000000..c13c2a3 Binary files /dev/null and b/tests/testdata/images/image2.png differ