Skip to content

Commit

Permalink
Merge branch 'develop' into dl/sync-with-upstream
Browse files Browse the repository at this point in the history
  • Loading branch information
Eldies committed Mar 11, 2025
2 parents b63c636 + f9cdbfb commit 3263886
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 42 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/678>)
- Detection for Cityscapes format
(<https://github.com/openvinotoolkit/datumaro/pull/680>)
- Maximum recursion `--depth` parameter for `detect-dataset` CLI command
- Maximum recursion `--depth` parameter for `detect` CLI command
(<https://github.com/openvinotoolkit/datumaro/pull/680>)
- An option to save a single subset in the `download` command
(<https://github.com/openvinotoolkit/datumaro/pull/697>)
Expand Down Expand Up @@ -52,6 +52,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/cvat-ai/datumaro/pull/71>)
- Support for tracks in Ultralytics YOLO formats
(<https://github.com/cvat-ai/datumaro/pull/70>)
- \[API\] `ImageFromBytes.save()` now preserves image extension if no output extension is specified
(<https://github.com/cvat-ai/datumaro/pull/91>)
- \[API\] `ImageFromBytes.save()` now guarantees there will be no extra image encoding/decoding
when possible (e.g. if the input and output extensions are the same)
(<https://github.com/cvat-ai/datumaro/pull/91>)

### Changed
- `env.detect_dataset()` now returns a list of detected formats at all recursion levels
Expand Down Expand Up @@ -113,6 +118,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/cvat-ai/datumaro/pull/45>)
- Failing `resize` transform for RLE masks
(<https://github.com/cvat-ai/datumaro/pull/46>)
- Invalid handling of macOS special directories in format detection
(<https://github.com/cvat-ai/datumaro/pull/88>)

### Security
- TBD
Expand Down
4 changes: 2 additions & 2 deletions requirements-core.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ tabulate
# prune
scikit-learn

# Stream JSON parser
json-stream
# Stream JSON dumper
python-rapidjson==1.20

# TabularValidator
nltk
Expand Down
14 changes: 9 additions & 5 deletions src/datumaro/components/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from datumaro.components.cli_plugin import plugin_types
from datumaro.components.format_detection import RejectionReason, detect_dataset_format
from datumaro.components.registry import PluginRegistry
from datumaro.util.os_util import import_foreign_module, split_path
from datumaro.util.os_util import SPECIAL_MACOS_FOLDERS, import_foreign_module, split_path


class Environment:
Expand Down Expand Up @@ -208,7 +208,8 @@ def detect_dataset(
depth: int = 1,
rejection_callback: Optional[Callable[[str, RejectionReason, str], None]] = None,
) -> List[str]:
ignore_dirs = {"__MSOSX", "__MACOSX"}
ignore_dirs = SPECIAL_MACOS_FOLDERS

matched_formats = set()
for _ in range(depth + 1):
detected_formats = detect_dataset_format(
Expand All @@ -225,9 +226,12 @@ def detect_dataset(
elif detected_formats:
matched_formats |= set(detected_formats)

paths = glob.glob(osp.join(path, "*"))
path = "" if len(paths) != 1 else paths[0]
if not osp.isdir(path) or osp.basename(path) in ignore_dirs:
# If there is only a single nested dir, recurse into it up to the allowed level
nested_paths = [
p for p in glob.glob(osp.join(path, "*")) if osp.basename(p) not in ignore_dirs
]
path = "" if len(nested_paths) != 1 else nested_paths[0]
if not path or not osp.isdir(path):
break

return [format.name for format in matched_formats]
Expand Down
17 changes: 16 additions & 1 deletion src/datumaro/components/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def _get_ext_to_save(self, fp: Union[str, io.IOBase], ext: Optional[str] = None)
assert ext is None, "'ext' must be empty if string is given."
ext = osp.splitext(osp.basename(fp))[1].lower()
else:
ext = ext if ext else self._DEFAULT_EXT
ext = ext or self.ext or self._DEFAULT_EXT
return ext

def __eq__(self, other):
Expand Down Expand Up @@ -455,6 +455,21 @@ def data(self) -> Optional[np.ndarray]:
self._size = tuple(map(int, data.shape[:2]))
return data

def save(
    self,
    fp: Union[str, io.IOBase],
    ext: Optional[str] = None,
    crypter: Crypter = NULL_CRYPTER,
):
    """Save the image to a path or a file-like object.

    If the extension resolved by ``_get_ext_to_save()`` equals ``self.ext``,
    the stored bytes are handed to ``copyto_image()`` directly instead of
    going through the parent ``save()``. Otherwise the call is delegated to
    the parent class unchanged.

    Args:
        fp: Destination file path or file-like object.
        ext: Requested output extension, forwarded to ``_get_ext_to_save()``.
        crypter: Crypter used for the destination (``dst_crypter``).
    """
    new_ext = self._get_ext_to_save(fp, ext)

    if self.ext != new_ext:
        super().save(fp=fp, ext=ext, crypter=crypter)
        return

    if isinstance(fp, str):
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        dst_dir = osp.dirname(fp)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

    copyto_image(io.BytesIO(self.bytes), fp, src_crypter=self._crypter, dst_crypter=crypter)

def get_data_as_dtype(self, dtype: Optional[np.dtype] = np.uint8) -> Optional[np.ndarray]:
"""Get image data with a specific data type"""

Expand Down
27 changes: 7 additions & 20 deletions src/datumaro/plugins/data_formats/coco/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#
# SPDX-License-Identifier: MIT

import json
import logging as log
import os
import os.path as osp
Expand All @@ -13,7 +12,7 @@
from typing import Dict, List, Optional, Type, Union

import pycocotools.mask as mask_utils
from json_stream.writer import streamable_dict, streamable_list
import rapidjson

import datumaro.util.annotation_util as anno_tools
import datumaro.util.mask_tools as mask_tools
Expand Down Expand Up @@ -106,13 +105,11 @@ def add_anns(self, data: Dict) -> None:
def merge(self, path: str, header: Dict, min_ann_id: Optional[int]) -> None:
self.close()

@streamable_list
def _gen_images():
with open(self.imgs.fp.name, "rb") as fp:
for line in fp:
yield parse_json(line)

@streamable_list
def _gen_anns():
with open(self.anns.fp.name, "rb") as fp:
next_id = min_ann_id
Expand All @@ -123,24 +120,14 @@ def _gen_anns():
next_id += 1
yield ann

@streamable_dict
def _gen():
yield "licenses", header["licenses"]
yield "info", header["info"]
yield "categories", header["categories"]

if not self.imgs.is_empty:
yield "images", _gen_images()
else:
yield "images", []

if not self.anns.is_empty:
yield "annotations", _gen_anns()
else:
yield "annotations", []
data = dict(
header,
images=[] if self.imgs.is_empty else _gen_images(),
annotations=[] if self.anns.is_empty else _gen_anns(),
)

with open(path, "w", encoding="utf-8") as fp:
json.dump(_gen(), fp)
rapidjson.dump(data, fp, indent=None)

self.remove()

Expand Down
11 changes: 1 addition & 10 deletions src/datumaro/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
from functools import wraps
from inspect import isclass
from itertools import islice
from typing import Any, Callable, Dict, Iterable, Tuple, TypeVar, Union
from typing import Any, Callable, Iterable, Tuple, TypeVar, Union

import attrs
import orjson
from json_stream.base import StreamingJSONList, StreamingJSONObject

NOTSET = object()

Expand Down Expand Up @@ -204,11 +203,3 @@ def dump_json_file(

def current_function_name(depth=1):
return inspect.getouterframes(inspect.currentframe())[depth].function


def to_dict_from_streaming_json(obj: Any) -> Dict[str, Any]:
    """Recursively convert streaming JSON containers into plain containers.

    ``StreamingJSONObject`` instances become dicts and ``StreamingJSONList``
    instances become lists, with every nested value converted the same way.
    Any other value is returned as is.
    """
    if isinstance(obj, StreamingJSONObject):
        converted = {}
        for key, value in obj.items():
            converted[key] = to_dict_from_streaming_json(value)
        return converted

    if isinstance(obj, StreamingJSONList):
        return list(map(to_dict_from_streaming_json, obj))

    return obj
26 changes: 25 additions & 1 deletion tests/integration/cli/test_detect_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
import os
import os.path as osp
import shutil
from pathlib import Path
from typing import List
from unittest.case import TestCase

from datumaro.plugins.data_formats.ade20k2017 import Ade20k2017Importer
from datumaro.plugins.data_formats.ade20k2020 import Ade20k2020Importer
from datumaro.plugins.data_formats.camvid import CamvidImporter
from datumaro.plugins.data_formats.lfw import LfwImporter
from datumaro.util.os_util import suppress_output
from datumaro.util.os_util import SPECIAL_MACOS_FOLDERS, is_subpath, suppress_output

from tests.requirements import Requirements, mark_requirement
from tests.utils.assets import get_test_asset_path
Expand Down Expand Up @@ -77,6 +78,29 @@ def test_nested_folders(self):

self.assertEqual([Ade20k2020Importer.NAME], output)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_can_ignore_special_dirs_in_nested_folders(self):
    """Check that `detect` skips the special macOS dirs while descending.

    An LFW ``pairs.txt`` is placed under ``a/b/c/annotations``. Every entry of
    ``SPECIAL_MACOS_FOLDERS`` is then created in each parent directory that
    passes ``is_subpath(..., test_dir)``. Running ``detect --depth 3`` is
    expected to report only the LFW format.
    """
    with TestDir() as test_dir:
        captured_stdout = io.StringIO()

        annotation_dir = osp.join(test_dir, "a", "b", "c", "annotations")
        os.makedirs(annotation_dir)
        shutil.copy(osp.join(LFW_DIR, "test", "annotations", "pairs.txt"), annotation_dir)

        # Only touch the directories that the test itself created.
        nested_dirs = [
            parent_dir
            for parent_dir in Path(annotation_dir).parents
            if is_subpath(str(parent_dir), test_dir)
        ]
        for parent_dir in nested_dirs:
            for special_name in SPECIAL_MACOS_FOLDERS:
                (parent_dir / special_name).mkdir(exist_ok=True)

        with contextlib.redirect_stdout(captured_stdout):
            run(self, "detect", test_dir, "--depth", "3")

        detected = self._extract_detect_format_name(captured_stdout)

        self.assertEqual([LfwImporter.NAME], detected)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_ambiguous(self):
with TestDir() as test_dir:
Expand Down
73 changes: 71 additions & 2 deletions tests/unit/test_images.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import itertools
import os.path as osp
from io import BytesIO
from unittest import TestCase
from unittest.mock import Mock, patch

import numpy as np

from datumaro.components.crypter import NULL_CRYPTER, Crypter
from datumaro.components.media import Image, ImageFromBytes
from datumaro.util.image import (
ImageBackend,
ImageColorChannel,
decode_image,
decode_image_context,
encode_image,
lazy_image,
load_image,
load_image_meta_file,
save_image,
save_image_meta_file,
Expand Down Expand Up @@ -127,7 +134,6 @@ def test_lazy_image_shape(self):
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_ctors(self):
with TestDir() as test_dir:
path = osp.join(test_dir, "path.png")
image = np.ones([2, 4, 3])
image_bytes = encode_image(image, "png")

Expand Down Expand Up @@ -164,6 +170,69 @@ def test_ext_detection_failure(self):
image = ImageFromBytes(data=image_bytes)
self.assertEqual(image.ext, None)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_no_excess_decode_on_image_save(self):
    """Check how many times ``decode_image`` is called by ``Image.save()``.

    No decode is expected when the image's extension is known (explicitly
    given, or listed in ``ImageFromBytes._FORMAT_MAGICS``) and matches the
    target extension. Exactly one decode is expected otherwise.
    """

    def assert_decode_calls(image: Image, expected_call_count: int, **kwargs):
        # Wrap the real decoder so that saving still works while calls are counted.
        decode_spy = Mock(wraps=decode_image)
        with patch("datumaro.components.media.decode_image", decode_spy):
            # Only OpenCV backend implements crypter support, so force it.
            # https://github.com/cvat-ai/datumaro/issues/92
            with decode_image_context(
                image_backend=ImageBackend.cv2, image_color_channel=ImageColorChannel.UNCHANGED
            ):
                image.save(**kwargs)
            assert decode_spy.call_count == expected_call_count

    with TestDir() as test_dir:
        image_np = np.ones([2, 4, 3])

        implicit_extensions = {magic_ext for _, magic_ext in ImageFromBytes._FORMAT_MAGICS}
        extensions = {".png", ".bmp", ".jpg", ".tif", ".pic", ".ras"}
        # The chosen extensions must cover both auto-detectable and
        # non-detectable formats, otherwise part of the test is vacuous.
        assert extensions & implicit_extensions
        assert extensions - implicit_extensions

        crypters = [NULL_CRYPTER, Crypter(Crypter.gen_key())]
        cases = itertools.product(extensions, extensions, crypters, [True, False])

        for source_ext, save_ext, save_crypter, explicit_ext in cases:
            with self.subTest(
                source_ext=source_ext,
                save_ext=save_ext,
                save_crypter=save_crypter,
                explicit_ext=explicit_ext,
            ):
                image_bytes = encode_image(image_np, source_ext)
                if explicit_ext:
                    img = Image.from_bytes(data=image_bytes, ext=source_ext)
                else:
                    img = Image.from_bytes(data=image_bytes, ext=None)

                knows_current_extension = source_ext in implicit_extensions or explicit_ext

                if knows_current_extension and source_ext == save_ext:
                    expected_with_target_ext = 0
                else:
                    expected_with_target_ext = 1
                expected_without_target_ext = 0 if knows_current_extension else 1

                # test determine target extension from path
                assert_decode_calls(
                    img,
                    expected_with_target_ext,
                    fp=osp.join(test_dir, f"name{save_ext}"),
                    crypter=save_crypter,
                )

                # test explicit target extension and fp
                assert_decode_calls(
                    img,
                    expected_with_target_ext,
                    fp=BytesIO(),
                    ext=save_ext,
                    crypter=save_crypter,
                )

                # test extension not passed
                assert_decode_calls(
                    img,
                    expected_without_target_ext,
                    fp=BytesIO(),
                    crypter=save_crypter,
                )


class ImageMetaTest(TestCase):
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
Expand Down

0 comments on commit 3263886

Please sign in to comment.