Skip to content

Commit d79e056

Browse files
authored
Update Datumaro format (#7125)
<!-- Raise an issue to propose your change (https://github.com/opencv/cvat/issues). It helps to avoid duplication of efforts from multiple independent contributors. Discuss your ideas with maintainers to be sure that changes will be approved and merged. Read the [Contribution guide](https://opencv.github.io/cvat/docs/contributing/). --> <!-- Provide a general summary of your changes in the Title above --> ### Motivation and context <!-- Why is this change required? What problem does it solve? If it fixes an open issue, please link to the issue here. Describe your changes in detail, add screenshots. --> Fixes #5424 Fixes #7375 Fixes #8700 Depends on cvat-ai/datumaro#34 This PR improves the quality of life when using the Datumaro format. - Added support for uploading annotations directly as a .json file, as with the COCO and CVAT formats - Added image metadata when exporting in the Datumaro format without images - For related images in 3D tasks, Datumaro export without images now includes only the basenames (previously, absolute server paths were exported) - <s>Refactored `conv_mask_to_poly` uses to avoid code and logic duplication</s> (will be in another PR) ### How has this been tested? <!-- Please describe in detail how you tested your changes. Include details of your testing environment, and the tests you ran to see how your change affects other areas of the code, etc. --> Unit tests ### Checklist <!-- Go over all the following points, and put an `x` in all the boxes that apply. If an item isn't applicable for some reason, then ~~explicitly strikethrough~~ the whole line. If you don't do that, GitHub will show incorrect progress for the pull request. If you're unsure about any of these, don't hesitate to ask. We're here to help! 
--> - [ ] I submit my changes into the `develop` branch - [ ] I have created a changelog fragment <!-- see top comment in CHANGELOG.md --> - [ ] I have updated the documentation accordingly - [ ] I have added tests to cover my changes - [ ] I have linked related issues (see [GitHub docs]( https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword)) - [ ] I have increased versions of npm packages if it is necessary ([cvat-canvas](https://github.com/opencv/cvat/tree/develop/cvat-canvas#versioning), [cvat-core](https://github.com/opencv/cvat/tree/develop/cvat-core#versioning), [cvat-data](https://github.com/opencv/cvat/tree/develop/cvat-data#versioning) and [cvat-ui](https://github.com/opencv/cvat/tree/develop/cvat-ui#versioning)) ### License - [ ] I submit _my code changes_ under the same [MIT License]( https://github.com/opencv/cvat/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced support for direct .json file import in Datumaro format. - **Bug Fixes** - Fixed an issue where exporting without images in Datumaro format did not include image information. - **Refactor** - Renamed classes and methods in dataset management to support a broader range of media types, enhancing the system's flexibility. - **Tests** - Added new tests to verify the behavior of importing and exporting in Datumaro format, ensuring robustness in dataset handling. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 3265e1f commit d79e056

File tree

8 files changed

+332
-121
lines changed

8 files changed

+332
-121
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
### Added
2+
3+
- Support for direct .json file import in Datumaro format
4+
(<https://github.com/opencv/cvat/pull/7125>)
5+
6+
### Fixed
7+
8+
- Export without images in Datumaro format now includes image info
9+
(<https://github.com/opencv/cvat/pull/7125>)

cvat/apps/dataset_manager/bindings.py

+115-80
Large diffs are not rendered by default.

cvat/apps/dataset_manager/formats/coco.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
from datumaro.components.annotation import AnnotationType
1010
from datumaro.plugins.coco_format.importer import CocoImporter
1111

12-
from cvat.apps.dataset_manager.bindings import GetCVATDataExtractor, detect_dataset, \
13-
import_dm_annotations
12+
from cvat.apps.dataset_manager.bindings import (
13+
GetCVATDataExtractor, NoMediaInAnnotationFileError, import_dm_annotations, detect_dataset
14+
)
1415
from cvat.apps.dataset_manager.util import make_zip_archive
1516

1617
from .registry import dm_env, exporter, importer
@@ -35,6 +36,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
3536
load_data_callback(dataset, instance_data)
3637
import_dm_annotations(dataset, instance_data)
3738
else:
39+
if load_data_callback:
40+
raise NoMediaInAnnotationFileError()
41+
3842
dataset = Dataset.import_from(src_file.name,
3943
'coco_instances', env=dm_env)
4044
import_dm_annotations(dataset, instance_data)
@@ -52,6 +56,8 @@ def _export(dst_file, temp_dir, instance_data, save_images=False):
5256
def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
5357
def remove_extra_annotations(dataset):
5458
for item in dataset:
59+
# Boxes would have invalid (skeleton) labels, so remove them
60+
# TODO: find a way to import boxes
5561
annotations = [ann for ann in item.annotations
5662
if ann.type != AnnotationType.bbox]
5763
item.annotations = annotations
@@ -66,7 +72,9 @@ def remove_extra_annotations(dataset):
6672
load_data_callback(dataset, instance_data)
6773
import_dm_annotations(dataset, instance_data)
6874
else:
69-
dataset = Dataset.import_from(src_file.name,
70-
'coco_person_keypoints', env=dm_env)
75+
if load_data_callback:
76+
raise NoMediaInAnnotationFileError()
77+
78+
dataset = Dataset.import_from(src_file.name, 'coco_person_keypoints', env=dm_env)
7179
remove_extra_annotations(dataset)
7280
import_dm_annotations(dataset, instance_data)

cvat/apps/dataset_manager/formats/cvat.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,16 @@
2222
from datumaro.util.image import Image
2323
from defusedxml import ElementTree
2424

25-
from cvat.apps.dataset_manager.bindings import (ProjectData, TaskData, JobData, detect_dataset,
26-
get_defaulted_subset,
27-
import_dm_annotations,
28-
match_dm_item)
25+
from cvat.apps.dataset_manager.bindings import (
26+
NoMediaInAnnotationFileError,
27+
ProjectData,
28+
TaskData,
29+
JobData,
30+
detect_dataset,
31+
get_defaulted_subset,
32+
import_dm_annotations,
33+
match_dm_item
34+
)
2935
from cvat.apps.dataset_manager.util import make_zip_archive
3036
from cvat.apps.engine.frame_provider import FrameQuality, FrameOutputType, make_frame_provider
3137

@@ -1456,4 +1462,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
14561462
for p in anno_paths:
14571463
load_anno(p, instance_data)
14581464
else:
1465+
if load_data_callback:
1466+
raise NoMediaInAnnotationFileError()
1467+
14591468
load_anno(src_file, instance_data)

cvat/apps/dataset_manager/formats/datumaro.py

+27-27
Original file line numberDiff line numberDiff line change
@@ -3,43 +3,40 @@
33
#
44
# SPDX-License-Identifier: MIT
55

6+
import zipfile
67
from datumaro.components.dataset import Dataset
7-
from datumaro.components.extractor import ItemTransform
8-
from datumaro.util.image import Image
98

10-
from pyunpack import Archive
11-
12-
from cvat.apps.dataset_manager.bindings import (GetCVATDataExtractor, detect_dataset,
13-
import_dm_annotations)
9+
from cvat.apps.dataset_manager.bindings import (
10+
GetCVATDataExtractor, import_dm_annotations, NoMediaInAnnotationFileError, detect_dataset
11+
)
1412
from cvat.apps.dataset_manager.util import make_zip_archive
1513
from cvat.apps.engine.models import DimensionType
1614

1715
from .registry import dm_env, exporter, importer
1816

19-
class DeleteImagePath(ItemTransform):
20-
def transform_item(self, item):
21-
image = None
22-
if item.has_image and item.image.has_data:
23-
image = Image(data=item.image.data, size=item.image.size)
24-
return item.wrap(image=image, point_cloud='', related_images=[])
25-
2617

2718
@exporter(name="Datumaro", ext="ZIP", version="1.0")
2819
def _export(dst_file, temp_dir, instance_data, save_images=False):
29-
with GetCVATDataExtractor(instance_data=instance_data, include_images=save_images) as extractor:
20+
with GetCVATDataExtractor(
21+
instance_data=instance_data, include_images=save_images
22+
) as extractor:
3023
dataset = Dataset.from_extractors(extractor, env=dm_env)
31-
if not save_images:
32-
dataset.transform(DeleteImagePath)
3324
dataset.export(temp_dir, 'datumaro', save_images=save_images)
3425

3526
make_zip_archive(temp_dir, dst_file)
3627

37-
@importer(name="Datumaro", ext="ZIP", version="1.0")
28+
@importer(name="Datumaro", ext="JSON, ZIP", version="1.0")
3829
def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
39-
Archive(src_file.name).extractall(temp_dir)
30+
if zipfile.is_zipfile(src_file):
31+
zipfile.ZipFile(src_file).extractall(temp_dir)
4032

41-
detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
42-
dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
33+
detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
34+
dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
35+
else:
36+
if load_data_callback:
37+
raise NoMediaInAnnotationFileError()
38+
39+
dataset = Dataset.import_from(src_file.name, 'datumaro', env=dm_env)
4340

4441
if load_data_callback is not None:
4542
load_data_callback(dataset, instance_data)
@@ -52,19 +49,22 @@ def _export(dst_file, temp_dir, instance_data, save_images=False):
5249
dimension=DimensionType.DIM_3D,
5350
) as extractor:
5451
dataset = Dataset.from_extractors(extractor, env=dm_env)
55-
56-
if not save_images:
57-
dataset.transform(DeleteImagePath)
5852
dataset.export(temp_dir, 'datumaro', save_images=save_images)
5953

6054
make_zip_archive(temp_dir, dst_file)
6155

62-
@importer(name="Datumaro 3D", ext="ZIP", version="1.0", dimension=DimensionType.DIM_3D)
56+
@importer(name="Datumaro 3D", ext="JSON, ZIP", version="1.0", dimension=DimensionType.DIM_3D)
6357
def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
64-
Archive(src_file.name).extractall(temp_dir)
58+
if zipfile.is_zipfile(src_file):
59+
zipfile.ZipFile(src_file).extractall(temp_dir)
60+
61+
detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
62+
dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
63+
else:
64+
if load_data_callback:
65+
raise NoMediaInAnnotationFileError()
6566

66-
detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
67-
dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
67+
dataset = Dataset.import_from(src_file.name, 'datumaro', env=dm_env)
6868

6969
if load_data_callback is not None:
7070
load_data_callback(dataset, instance_data)

tests/python/rest_api/test_projects.py

+66-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from typing import Optional, Union
2020

2121
import pytest
22-
from cvat_sdk.api_client import ApiClient, Configuration, models
22+
from cvat_sdk.api_client import ApiClient, Configuration, exceptions, models
2323
from cvat_sdk.api_client.api_client import Endpoint
2424
from cvat_sdk.api_client.exceptions import ForbiddenException
2525
from cvat_sdk.core.helpers import get_paginated_collection
@@ -37,8 +37,10 @@
3737
from shared.utils.helpers import generate_image_files
3838

3939
from .utils import (
40+
DATUMARO_FORMAT_FOR_DIMENSION,
4041
CollectionSimpleFilterTestBase,
4142
create_task,
43+
export_dataset,
4244
export_project_backup,
4345
export_project_dataset,
4446
)
@@ -991,6 +993,68 @@ def test_can_export_and_import_dataset_after_deleting_related_storage(
991993

992994
self._test_import_project(admin_user, project_id, "CVAT 1.1", import_data)
993995

996+
@pytest.mark.parametrize(
997+
"dimension, format_name",
998+
[
999+
*DATUMARO_FORMAT_FOR_DIMENSION.items(),
1000+
("2d", "CVAT 1.1"),
1001+
("3d", "CVAT 1.1"),
1002+
("2d", "COCO 1.0"),
1003+
],
1004+
)
1005+
def test_cant_import_annotations_as_project(self, admin_user, tasks, format_name, dimension):
1006+
task = next(t for t in tasks if t.get("size") if t["dimension"] == dimension)
1007+
1008+
def _export_task(task_id: int, format_name: str) -> io.BytesIO:
1009+
with make_api_client(admin_user) as api_client:
1010+
return io.BytesIO(
1011+
export_dataset(
1012+
api_client.tasks_api,
1013+
api_version=2,
1014+
id=task_id,
1015+
format=format_name,
1016+
save_images=False,
1017+
)
1018+
)
1019+
1020+
if format_name in list(DATUMARO_FORMAT_FOR_DIMENSION.values()):
1021+
with zipfile.ZipFile(_export_task(task["id"], format_name)) as zip_file:
1022+
annotations = zip_file.read("annotations/default.json")
1023+
1024+
dataset_file = io.BytesIO(annotations)
1025+
dataset_file.name = "annotations.json"
1026+
elif format_name == "CVAT 1.1":
1027+
with zipfile.ZipFile(_export_task(task["id"], "CVAT for images 1.1")) as zip_file:
1028+
annotations = zip_file.read("annotations.xml")
1029+
1030+
dataset_file = io.BytesIO(annotations)
1031+
dataset_file.name = "annotations.xml"
1032+
elif format_name == "COCO 1.0":
1033+
with zipfile.ZipFile(_export_task(task["id"], format_name)) as zip_file:
1034+
annotations = zip_file.read("annotations/instances_default.json")
1035+
1036+
dataset_file = io.BytesIO(annotations)
1037+
dataset_file.name = "annotations.json"
1038+
else:
1039+
assert False
1040+
1041+
with make_api_client(admin_user) as api_client:
1042+
project, _ = api_client.projects_api.create(
1043+
project_write_request=models.ProjectWriteRequest(
1044+
name=f"test_annotations_import_as_project {format_name}"
1045+
)
1046+
)
1047+
1048+
import_data = {"dataset_file": dataset_file}
1049+
1050+
with pytest.raises(exceptions.ApiException, match="Dataset file should be zip archive"):
1051+
self._test_import_project(
1052+
admin_user,
1053+
project.id,
1054+
format_name=format_name,
1055+
data=import_data,
1056+
)
1057+
9941058
@pytest.mark.parametrize(
9951059
"export_format, subset_path_template",
9961060
[
@@ -1045,10 +1109,7 @@ def test_creates_subfolders_for_subsets_on_export(
10451109
len([f for f in zip_file.namelist() if f.startswith(folder_prefix)]) > 0
10461110
), f"No {folder_prefix} in {zip_file.namelist()}"
10471111

1048-
def test_export_project_with_honeypots(
1049-
self,
1050-
admin_user: str,
1051-
):
1112+
def test_export_project_with_honeypots(self, admin_user: str):
10521113
project_spec = {
10531114
"name": "Project with honeypots",
10541115
"labels": [{"name": "cat"}],

tests/python/rest_api/test_tasks.py

+83
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,11 @@
6464
)
6565

6666
from .utils import (
67+
DATUMARO_FORMAT_FOR_DIMENSION,
6768
CollectionSimpleFilterTestBase,
6869
compare_annotations,
6970
create_task,
71+
export_dataset,
7072
export_task_backup,
7173
export_task_dataset,
7274
parse_frame_step,
@@ -969,6 +971,46 @@ def test_uses_subset_name(
969971
subset_path in path for path in zip_file.namelist()
970972
), f"No {subset_path} in {zip_file.namelist()}"
971973

974+
@pytest.mark.parametrize(
975+
"dimension, mode", [("2d", "annotation"), ("2d", "interpolation"), ("3d", "annotation")]
976+
)
977+
def test_datumaro_export_without_annotations_includes_image_info(
978+
self, admin_user, tasks, mode, dimension
979+
):
980+
task = next(
981+
t for t in tasks if t.get("size") if t["mode"] == mode if t["dimension"] == dimension
982+
)
983+
984+
with make_api_client(admin_user) as api_client:
985+
dataset_file = io.BytesIO(
986+
export_dataset(
987+
api_client.tasks_api,
988+
api_version=2,
989+
id=task["id"],
990+
format=DATUMARO_FORMAT_FOR_DIMENSION[dimension],
991+
save_images=False,
992+
)
993+
)
994+
995+
with zipfile.ZipFile(dataset_file) as zip_file:
996+
annotations = json.loads(zip_file.read("annotations/default.json"))
997+
998+
assert annotations["items"]
999+
for item in annotations["items"]:
1000+
assert "media" not in item
1001+
1002+
if dimension == "2d":
1003+
assert osp.splitext(item["image"]["path"])[0] == item["id"]
1004+
assert not Path(item["image"]["path"]).is_absolute()
1005+
assert tuple(item["image"]["size"]) > (0, 0)
1006+
elif dimension == "3d":
1007+
assert osp.splitext(osp.basename(item["point_cloud"]["path"]))[0] == item["id"]
1008+
assert not Path(item["point_cloud"]["path"]).is_absolute()
1009+
for related_image in item["related_images"]:
1010+
assert not Path(related_image["path"]).is_absolute()
1011+
if "size" in related_image:
1012+
assert tuple(related_image["size"]) > (0, 0)
1013+
9721014

9731015
@pytest.mark.usefixtures("restore_db_per_function")
9741016
@pytest.mark.usefixtures("restore_cvat_data_per_function")
@@ -5181,6 +5223,47 @@ def test_import_annotations_after_deleting_related_cloud_storage(
51815223
task.import_annotations(self.import_format, file_path)
51825224
self._check_annotations(task_id)
51835225

5226+
@pytest.mark.parametrize("dimension", ["2d", "3d"])
5227+
def test_can_import_datumaro_json(self, admin_user, tasks, dimension):
5228+
task = next(
5229+
t
5230+
for t in tasks
5231+
if t.get("size")
5232+
if t["dimension"] == dimension and t.get("validation_mode") != "gt_pool"
5233+
)
5234+
5235+
with make_api_client(admin_user) as api_client:
5236+
original_annotations = json.loads(
5237+
api_client.tasks_api.retrieve_annotations(task["id"])[1].data
5238+
)
5239+
5240+
dataset_archive = io.BytesIO(
5241+
export_dataset(
5242+
api_client.tasks_api,
5243+
api_version=2,
5244+
id=task["id"],
5245+
format=DATUMARO_FORMAT_FOR_DIMENSION[dimension],
5246+
save_images=False,
5247+
)
5248+
)
5249+
5250+
with zipfile.ZipFile(dataset_archive) as zip_file:
5251+
annotations = zip_file.read("annotations/default.json")
5252+
5253+
with TemporaryDirectory() as tempdir:
5254+
annotations_path = Path(tempdir) / "annotations.json"
5255+
annotations_path.write_bytes(annotations)
5256+
self.client.tasks.retrieve(task["id"]).import_annotations(
5257+
DATUMARO_FORMAT_FOR_DIMENSION[dimension], annotations_path
5258+
)
5259+
5260+
with make_api_client(admin_user) as api_client:
5261+
updated_annotations = json.loads(
5262+
api_client.tasks_api.retrieve_annotations(task["id"])[1].data
5263+
)
5264+
5265+
assert compare_annotations(original_annotations, updated_annotations) == {}
5266+
51845267
@pytest.mark.parametrize(
51855268
"format_name",
51865269
[

0 commit comments

Comments
 (0)