Merge branch 'develop' into dl/sync-with-upstream

cvat-ai · Mar 11, 2025 · 3263886 · 3263886
2 parents b63c636 + f9cdbfb
commit 3263886
Show file tree

Hide file tree

Showing 8 changed files with 139 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/678>)
 - Detection for Cityscapes format
   (<https://github.com/openvinotoolkit/datumaro/pull/680>)
-- Maximum recursion `--depth` parameter for `detect-dataset` CLI command
+- Maximum recursion `--depth` parameter for `detect` CLI command
   (<https://github.com/openvinotoolkit/datumaro/pull/680>)
 - An option to save a single subset in the `download` command
   (<https://github.com/openvinotoolkit/datumaro/pull/697>)
@@ -52,6 +52,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/cvat-ai/datumaro/pull/71>)
 - Support for tracks in Ultralytics YOLO formats
   (<https://github.com/cvat-ai/datumaro/pull/70>)
+- \[API\] `ImageFromBytes.save()` now preserves image extension if no output extension is specified
+  (<https://github.com/cvat-ai/datumaro/pull/91>)
+- \[API\] `ImageFromBytes.save()` now guarantees there will be no extra image encoding/decoding
+  when possible (e.g. if the input and output extensions are the same)
+  (<https://github.com/cvat-ai/datumaro/pull/91>)
 
 ### Changed
 - `env.detect_dataset()` now returns a list of detected formats at all recursion levels
@@ -113,6 +118,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/cvat-ai/datumaro/pull/45>)
 - Failing `resize` transform for RLE masks
   (<https://github.com/cvat-ai/datumaro/pull/46>)
+- Invalid handling of macOS special directories in format detection
+  (<https://github.com/cvat-ai/datumaro/pull/88>)
 
 ### Security
 - TBD

diff --git a/requirements-core.txt b/requirements-core.txt
@@ -51,8 +51,8 @@ tabulate
 # prune
 scikit-learn
 
-# Stream JSON parser
-json-stream
+# Stream JSON dumper
+python-rapidjson==1.20
 
 # TabularValidator
 nltk

diff --git a/src/datumaro/components/environment.py b/src/datumaro/components/environment.py
@@ -13,7 +13,7 @@
 from datumaro.components.cli_plugin import plugin_types
 from datumaro.components.format_detection import RejectionReason, detect_dataset_format
 from datumaro.components.registry import PluginRegistry
-from datumaro.util.os_util import import_foreign_module, split_path
+from datumaro.util.os_util import SPECIAL_MACOS_FOLDERS, import_foreign_module, split_path
 
 
 class Environment:
@@ -208,7 +208,8 @@ def detect_dataset(
         depth: int = 1,
         rejection_callback: Optional[Callable[[str, RejectionReason, str], None]] = None,
     ) -> List[str]:
-        ignore_dirs = {"__MSOSX", "__MACOSX"}
+        ignore_dirs = SPECIAL_MACOS_FOLDERS
+
         matched_formats = set()
         for _ in range(depth + 1):
             detected_formats = detect_dataset_format(
@@ -225,9 +226,12 @@ def detect_dataset(
             elif detected_formats:
                 matched_formats |= set(detected_formats)
 
-            paths = glob.glob(osp.join(path, "*"))
-            path = "" if len(paths) != 1 else paths[0]
-            if not osp.isdir(path) or osp.basename(path) in ignore_dirs:
+            # If there is only a single nested dir, recurse into it up to the allowed level
+            nested_paths = [
+                p for p in glob.glob(osp.join(path, "*")) if osp.basename(p) not in ignore_dirs
+            ]
+            path = "" if len(nested_paths) != 1 else nested_paths[0]
+            if not path or not osp.isdir(path):
                 break
 
         return [format.name for format in matched_formats]

diff --git a/src/datumaro/components/media.py b/src/datumaro/components/media.py
@@ -291,7 +291,7 @@ def _get_ext_to_save(self, fp: Union[str, io.IOBase], ext: Optional[str] = None)
             assert ext is None, "'ext' must be empty if string is given."
             ext = osp.splitext(osp.basename(fp))[1].lower()
         else:
-            ext = ext if ext else self._DEFAULT_EXT
+            ext = ext or self.ext or self._DEFAULT_EXT
         return ext
 
     def __eq__(self, other):
@@ -455,6 +455,21 @@ def data(self) -> Optional[np.ndarray]:
             self._size = tuple(map(int, data.shape[:2]))
         return data
 
+    def save(
+        self,
+        fp: Union[str, io.IOBase],
+        ext: Optional[str] = None,
+        crypter: Crypter = NULL_CRYPTER,
+    ):
+        """Save the image, reusing the stored encoded bytes when the extension is unchanged.
+
+        Args:
+            fp: Destination file path or file-like object.
+            ext: Target extension for file-like destinations. For a path, the
+                extension is taken from the path itself.
+            crypter: Passed on as the destination crypter for the written data.
+        """
+        new_ext = self._get_ext_to_save(fp, ext)
+
+        if self.ext != new_ext:
+            # The target extension differs from the stored one (or the stored one
+            # is unknown), so defer to the parent class implementation.
+            super().save(fp=fp, ext=ext, crypter=crypter)
+            return
+
+        if isinstance(fp, str):
+            # A bare file name has no directory part, and os.makedirs("") raises
+            # FileNotFoundError, so only create the directory when there is one.
+            dst_dir = osp.dirname(fp)
+            if dst_dir:
+                os.makedirs(dst_dir, exist_ok=True)
+
+        # Same extension: hand the already encoded bytes to copyto_image as-is.
+        copyto_image(io.BytesIO(self.bytes), fp, src_crypter=self._crypter, dst_crypter=crypter)
+
     def get_data_as_dtype(self, dtype: Optional[np.dtype] = np.uint8) -> Optional[np.ndarray]:
         """Get image data with a specific data type"""
 

diff --git a/src/datumaro/plugins/data_formats/coco/exporter.py b/src/datumaro/plugins/data_formats/coco/exporter.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-import json
 import logging as log
 import os
 import os.path as osp
@@ -13,7 +12,7 @@
 from typing import Dict, List, Optional, Type, Union
 
 import pycocotools.mask as mask_utils
-from json_stream.writer import streamable_dict, streamable_list
+import rapidjson
 
 import datumaro.util.annotation_util as anno_tools
 import datumaro.util.mask_tools as mask_tools
@@ -106,13 +105,11 @@ def add_anns(self, data: Dict) -> None:
     def merge(self, path: str, header: Dict, min_ann_id: Optional[int]) -> None:
         self.close()
 
-        @streamable_list
         def _gen_images():
             with open(self.imgs.fp.name, "rb") as fp:
                 for line in fp:
                     yield parse_json(line)
 
-        @streamable_list
         def _gen_anns():
             with open(self.anns.fp.name, "rb") as fp:
                 next_id = min_ann_id
@@ -123,24 +120,14 @@ def _gen_anns():
                         next_id += 1
                     yield ann
 
-        @streamable_dict
-        def _gen():
-            yield "licenses", header["licenses"]
-            yield "info", header["info"]
-            yield "categories", header["categories"]
-
-            if not self.imgs.is_empty:
-                yield "images", _gen_images()
-            else:
-                yield "images", []
-
-            if not self.anns.is_empty:
-                yield "annotations", _gen_anns()
-            else:
-                yield "annotations", []
+        data = dict(
+            header,
+            images=[] if self.imgs.is_empty else _gen_images(),
+            annotations=[] if self.anns.is_empty else _gen_anns(),
+        )
 
         with open(path, "w", encoding="utf-8") as fp:
-            json.dump(_gen(), fp)
+            rapidjson.dump(data, fp, indent=None)
 
         self.remove()
 

diff --git a/src/datumaro/util/__init__.py b/src/datumaro/util/__init__.py
@@ -6,11 +6,10 @@
 from functools import wraps
 from inspect import isclass
 from itertools import islice
-from typing import Any, Callable, Dict, Iterable, Tuple, TypeVar, Union
+from typing import Any, Callable, Iterable, Tuple, TypeVar, Union
 
 import attrs
 import orjson
-from json_stream.base import StreamingJSONList, StreamingJSONObject
 
 NOTSET = object()
 
@@ -204,11 +203,3 @@ def dump_json_file(
 
 def current_function_name(depth=1):
     return inspect.getouterframes(inspect.currentframe())[depth].function
-
-
-def to_dict_from_streaming_json(obj: Any) -> Dict[str, Any]:
-    if isinstance(obj, StreamingJSONObject):
-        return {k: to_dict_from_streaming_json(v) for k, v in obj.items()}
-    if isinstance(obj, StreamingJSONList):
-        return [to_dict_from_streaming_json(v) for v in obj]
-    return obj
diff --git a/tests/integration/cli/test_detect_format.py b/tests/integration/cli/test_detect_format.py
@@ -4,14 +4,15 @@
 import os
 import os.path as osp
 import shutil
+from pathlib import Path
 from typing import List
 from unittest.case import TestCase
 
 from datumaro.plugins.data_formats.ade20k2017 import Ade20k2017Importer
 from datumaro.plugins.data_formats.ade20k2020 import Ade20k2020Importer
 from datumaro.plugins.data_formats.camvid import CamvidImporter
 from datumaro.plugins.data_formats.lfw import LfwImporter
-from datumaro.util.os_util import suppress_output
+from datumaro.util.os_util import SPECIAL_MACOS_FOLDERS, is_subpath, suppress_output
 
 from tests.requirements import Requirements, mark_requirement
 from tests.utils.assets import get_test_asset_path
@@ -77,6 +78,29 @@ def test_nested_folders(self):
 
             self.assertEqual([Ade20k2020Importer.NAME], output)
 
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_can_ignore_special_dirs_in_nested_folders(self):
+        """Check that `detect` reports only LFW when special macOS dirs sit beside nested dirs."""
+        with TestDir() as test_dir:
+            output_file = io.StringIO()
+
+            # Place the LFW annotation file under the nested a/b/c directory chain
+            annotation_dir = osp.join(test_dir, "a", "b", "c", "annotations")
+            os.makedirs(annotation_dir)
+            shutil.copy(osp.join(LFW_DIR, "test", "annotations", "pairs.txt"), annotation_dir)
+
+            # Walk up from the annotation dir and add the special dirs at each level.
+            # NOTE(review): is_subpath presumably filters out ancestors that lie
+            # outside of test_dir - confirm against its implementation.
+            for subdir_path in Path(annotation_dir).parents:
+                if not is_subpath(str(subdir_path), test_dir):
+                    continue
+
+                for special_dir_name in SPECIAL_MACOS_FOLDERS:
+                    (subdir_path / special_dir_name).mkdir(exist_ok=True)
+
+            # Capture the CLI output so the detected format names can be parsed from it
+            with contextlib.redirect_stdout(output_file):
+                run(self, "detect", test_dir, "--depth", "3")
+
+            output = self._extract_detect_format_name(output_file)
+
+            self.assertEqual([LfwImporter.NAME], output)
+
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_ambiguous(self):
         with TestDir() as test_dir:

diff --git a/tests/unit/test_images.py b/tests/unit/test_images.py
@@ -1,13 +1,20 @@
+import itertools
 import os.path as osp
+from io import BytesIO
 from unittest import TestCase
+from unittest.mock import Mock, patch
 
 import numpy as np
 
+from datumaro.components.crypter import NULL_CRYPTER, Crypter
 from datumaro.components.media import Image, ImageFromBytes
 from datumaro.util.image import (
+    ImageBackend,
+    ImageColorChannel,
+    decode_image,
+    decode_image_context,
     encode_image,
     lazy_image,
-    load_image,
     load_image_meta_file,
     save_image,
     save_image_meta_file,
@@ -127,7 +134,6 @@ def test_lazy_image_shape(self):
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     def test_ctors(self):
         with TestDir() as test_dir:
-            path = osp.join(test_dir, "path.png")
             image = np.ones([2, 4, 3])
             image_bytes = encode_image(image, "png")
 
@@ -164,6 +170,69 @@ def test_ext_detection_failure(self):
         image = ImageFromBytes(data=image_bytes)
         self.assertEqual(image.ext, None)
 
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_no_excess_decode_on_image_save(self):
+        """Check how many times `decode_image` is called when saving byte-backed images.
+
+        No decoding is expected when the current extension is known and matches
+        the target one; otherwise exactly one decode call is expected.
+        """
+
+        def check_decode_call_count(image: Image, expected_call_count: int, **kwargs):
+            # Wrap the real decode_image in a Mock: calls are counted and still executed
+            with patch(
+                "datumaro.components.media.decode_image", Mock(wraps=decode_image)
+            ) as mock_decode:
+                # Only OpenCV backend implements crypter support, so force it.
+                # https://github.com/cvat-ai/datumaro/issues/92
+                with decode_image_context(
+                    image_backend=ImageBackend.cv2, image_color_channel=ImageColorChannel.UNCHANGED
+                ):
+                    image.save(**kwargs)
+                    assert mock_decode.call_count == expected_call_count
+
+        with TestDir() as test_dir:
+            image_np = np.ones([2, 4, 3])
+
+            # Extensions listed in ImageFromBytes._FORMAT_MAGICS
+            implicit_extensions = set(ext for _, ext in ImageFromBytes._FORMAT_MAGICS)
+            extensions = {".png", ".bmp", ".jpg", ".tif", ".pic", ".ras"}
+            # Sanity checks: the tested set must contain extensions both inside
+            # and outside of the _FORMAT_MAGICS list
+            assert extensions & implicit_extensions
+            assert extensions - implicit_extensions
+
+            # Cover every combination of source/target extension, crypter and
+            # explicit/implicit source extension
+            for source_ext, save_ext, save_crypter, explicit_ext in itertools.product(
+                extensions, extensions, [NULL_CRYPTER, Crypter(Crypter.gen_key())], [True, False]
+            ):
+                with self.subTest(
+                    source_ext=source_ext,
+                    save_ext=save_ext,
+                    save_crypter=save_crypter,
+                    explicit_ext=explicit_ext,
+                ):
+                    image_bytes = encode_image(image_np, source_ext)
+                    img = Image.from_bytes(
+                        data=image_bytes, ext=source_ext if explicit_ext else None
+                    )
+
+                    # The current extension counts as known if it is listed in
+                    # _FORMAT_MAGICS or was passed explicitly on construction
+                    knows_current_extension = source_ext in implicit_extensions or explicit_ext
+
+                    # test determine target extension from path
+                    check_decode_call_count(
+                        img,
+                        (0 if knows_current_extension and source_ext == save_ext else 1),
+                        fp=osp.join(test_dir, f"name{save_ext}"),
+                        crypter=save_crypter,
+                    )
+
+                    # test explicit target extension and fp
+                    check_decode_call_count(
+                        img,
+                        (0 if knows_current_extension and source_ext == save_ext else 1),
+                        fp=BytesIO(),
+                        ext=save_ext,
+                        crypter=save_crypter,
+                    )
+
+                    # test extension not passed
+                    check_decode_call_count(
+                        img,
+                        (0 if knows_current_extension else 1),
+                        fp=BytesIO(),
+                        crypter=save_crypter,
+                    )
+
 
 class ImageMetaTest(TestCase):
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)