Update Datumaro format (#7125)

zhiltsov-max · web-flow · commit d79e0563f6df · 2024-11-25T16:50:22.000+03:00
### Motivation and context  Fixes #5424 Fixes #7375 Fixes #8700 Depends on cvat-ai/datumaro#34 This PR improves quality of life when using the Datumaro format. - Added support for direct .json uploading of annotations, similarly to the COCO and CVAT formats - Added image metadata when exporting in the Datumaro format without images - For related images in 3D tasks, Datumaro export without images will include only the basenames (before: absolute server paths were exported) - <s>Refactored `conv_mask_to_poly` uses to avoid code and logic duplication</s> (will be in another PR) ### How has this been tested?  Unit tests ### Checklist  - [ ] I submit my changes into the `develop` branch - [ ] I have created a changelog fragment  - [ ] I have updated the documentation accordingly - [ ] I have added tests to cover my changes - [ ] I have linked related issues (see [GitHub docs](https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword)) - [ ] I have increased versions of npm packages if it is necessary ([cvat-canvas](https://github.com/opencv/cvat/tree/develop/cvat-canvas#versioning), [cvat-core](https://github.com/opencv/cvat/tree/develop/cvat-core#versioning), [cvat-data](https://github.com/opencv/cvat/tree/develop/cvat-data#versioning) and [cvat-ui](https://github.com/opencv/cvat/tree/develop/cvat-ui#versioning)) ### License - [ ] I submit _my code changes_ under the same [MIT License](https://github.com/opencv/cvat/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.  ## Summary by CodeRabbit - **New Features** - Introduced support for direct .json file import in Datumaro format. - **Bug Fixes** - Fixed an issue where exporting without images in Datumaro format did not include image information. 
- **Refactor** - Renamed classes and methods in dataset management to support a broader range of media types, enhancing the system's flexibility. - **Tests** - Added new tests to verify the behavior of importing and exporting in Datumaro format, ensuring robustness in dataset handling.
diff --git a/changelog.d/20231110_175126_mzhiltso_update_dm_format.md b/changelog.d/20231110_175126_mzhiltso_update_dm_format.md
@@ -0,0 +1,9 @@
+### Added
+
+- Support for direct .json file import in Datumaro format
+  (<https://github.com/opencv/cvat/pull/7125>)
+
+### Fixed
+
+- Export without images in Datumaro format now includes image info
+  (<https://github.com/opencv/cvat/pull/7125>)
diff --git a/cvat/apps/dataset_manager/bindings.py b/cvat/apps/dataset_manager/bindings.py
diff --git a/cvat/apps/dataset_manager/formats/coco.py b/cvat/apps/dataset_manager/formats/coco.py
@@ -9,8 +9,9 @@
 from datumaro.components.annotation import AnnotationType
 from datumaro.plugins.coco_format.importer import CocoImporter
 
-from cvat.apps.dataset_manager.bindings import GetCVATDataExtractor, detect_dataset, \
-    import_dm_annotations
+from cvat.apps.dataset_manager.bindings import (
+    GetCVATDataExtractor, NoMediaInAnnotationFileError, import_dm_annotations, detect_dataset
+)
 from cvat.apps.dataset_manager.util import make_zip_archive
 
 from .registry import dm_env, exporter, importer
@@ -35,6 +36,9 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
             load_data_callback(dataset, instance_data)
         import_dm_annotations(dataset, instance_data)
     else:
+        if load_data_callback:
+            raise NoMediaInAnnotationFileError()
+
         dataset = Dataset.import_from(src_file.name,
             'coco_instances', env=dm_env)
         import_dm_annotations(dataset, instance_data)
@@ -52,6 +56,8 @@ def _export(dst_file, temp_dir, instance_data, save_images=False):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     def remove_extra_annotations(dataset):
         for item in dataset:
+            # Boxes would have invalid (skeleton) labels, so remove them
+            # TODO: find a way to import boxes
             annotations = [ann for ann in item.annotations
                 if ann.type != AnnotationType.bbox]
             item.annotations = annotations
@@ -66,7 +72,9 @@ def remove_extra_annotations(dataset):
             load_data_callback(dataset, instance_data)
         import_dm_annotations(dataset, instance_data)
     else:
-        dataset = Dataset.import_from(src_file.name,
-            'coco_person_keypoints', env=dm_env)
+        if load_data_callback:
+            raise NoMediaInAnnotationFileError()
+
+        dataset = Dataset.import_from(src_file.name, 'coco_person_keypoints', env=dm_env)
         remove_extra_annotations(dataset)
         import_dm_annotations(dataset, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/cvat.py b/cvat/apps/dataset_manager/formats/cvat.py
@@ -22,10 +22,16 @@
 from datumaro.util.image import Image
 from defusedxml import ElementTree
 
-from cvat.apps.dataset_manager.bindings import (ProjectData, TaskData, JobData, detect_dataset,
-                                                get_defaulted_subset,
-                                                import_dm_annotations,
-                                                match_dm_item)
+from cvat.apps.dataset_manager.bindings import (
+    NoMediaInAnnotationFileError,
+    ProjectData,
+    TaskData,
+    JobData,
+    detect_dataset,
+    get_defaulted_subset,
+    import_dm_annotations,
+    match_dm_item
+)
 from cvat.apps.dataset_manager.util import make_zip_archive
 from cvat.apps.engine.frame_provider import FrameQuality, FrameOutputType, make_frame_provider
 
@@ -1456,4 +1462,7 @@ def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs
             for p in anno_paths:
                 load_anno(p, instance_data)
     else:
+        if load_data_callback:
+            raise NoMediaInAnnotationFileError()
+
         load_anno(src_file, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/datumaro.py b/cvat/apps/dataset_manager/formats/datumaro.py
@@ -3,43 +3,40 @@
 #
 # SPDX-License-Identifier: MIT
 
+import zipfile
 from datumaro.components.dataset import Dataset
-from datumaro.components.extractor import ItemTransform
-from datumaro.util.image import Image
 
-from pyunpack import Archive
-
-from cvat.apps.dataset_manager.bindings import (GetCVATDataExtractor, detect_dataset,
-    import_dm_annotations)
+from cvat.apps.dataset_manager.bindings import (
+    GetCVATDataExtractor, import_dm_annotations, NoMediaInAnnotationFileError, detect_dataset
+)
 from cvat.apps.dataset_manager.util import make_zip_archive
 from cvat.apps.engine.models import DimensionType
 
 from .registry import dm_env, exporter, importer
 
-class DeleteImagePath(ItemTransform):
-    def transform_item(self, item):
-        image = None
-        if item.has_image and item.image.has_data:
-            image = Image(data=item.image.data, size=item.image.size)
-        return item.wrap(image=image, point_cloud='', related_images=[])
-
 
 @exporter(name="Datumaro", ext="ZIP", version="1.0")
 def _export(dst_file, temp_dir, instance_data, save_images=False):
-    with GetCVATDataExtractor(instance_data=instance_data, include_images=save_images) as extractor:
+    with GetCVATDataExtractor(
+        instance_data=instance_data, include_images=save_images
+    ) as extractor:
         dataset = Dataset.from_extractors(extractor, env=dm_env)
-        if not save_images:
-            dataset.transform(DeleteImagePath)
         dataset.export(temp_dir, 'datumaro', save_images=save_images)
 
     make_zip_archive(temp_dir, dst_file)
 
-@importer(name="Datumaro", ext="ZIP", version="1.0")
+@importer(name="Datumaro", ext="JSON, ZIP", version="1.0")
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
-    Archive(src_file.name).extractall(temp_dir)
+    if zipfile.is_zipfile(src_file):
+        zipfile.ZipFile(src_file).extractall(temp_dir)
 
-    detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
-    dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
+        detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
+        dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
+    else:
+        if load_data_callback:
+            raise NoMediaInAnnotationFileError()
+
+        dataset = Dataset.import_from(src_file.name, 'datumaro', env=dm_env)
 
     if load_data_callback is not None:
         load_data_callback(dataset, instance_data)
@@ -52,19 +49,22 @@ def _export(dst_file, temp_dir, instance_data, save_images=False):
         dimension=DimensionType.DIM_3D,
     ) as extractor:
         dataset = Dataset.from_extractors(extractor, env=dm_env)
-
-        if not save_images:
-            dataset.transform(DeleteImagePath)
         dataset.export(temp_dir, 'datumaro', save_images=save_images)
 
     make_zip_archive(temp_dir, dst_file)
 
-@importer(name="Datumaro 3D", ext="ZIP", version="1.0", dimension=DimensionType.DIM_3D)
+@importer(name="Datumaro 3D", ext="JSON, ZIP", version="1.0", dimension=DimensionType.DIM_3D)
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
-    Archive(src_file.name).extractall(temp_dir)
+    if zipfile.is_zipfile(src_file):
+        zipfile.ZipFile(src_file).extractall(temp_dir)
+
+        detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
+        dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
+    else:
+        if load_data_callback:
+            raise NoMediaInAnnotationFileError()
 
-    detect_dataset(temp_dir, format_name='datumaro', importer=dm_env.importers.get('datumaro'))
-    dataset = Dataset.import_from(temp_dir, 'datumaro', env=dm_env)
+        dataset = Dataset.import_from(src_file.name, 'datumaro', env=dm_env)
 
     if load_data_callback is not None:
         load_data_callback(dataset, instance_data)
diff --git a/tests/python/rest_api/test_projects.py b/tests/python/rest_api/test_projects.py
@@ -19,7 +19,7 @@
 from typing import Optional, Union
 
 import pytest
-from cvat_sdk.api_client import ApiClient, Configuration, models
+from cvat_sdk.api_client import ApiClient, Configuration, exceptions, models
 from cvat_sdk.api_client.api_client import Endpoint
 from cvat_sdk.api_client.exceptions import ForbiddenException
 from cvat_sdk.core.helpers import get_paginated_collection
@@ -37,8 +37,10 @@
 from shared.utils.helpers import generate_image_files
 
 from .utils import (
+    DATUMARO_FORMAT_FOR_DIMENSION,
     CollectionSimpleFilterTestBase,
     create_task,
+    export_dataset,
     export_project_backup,
     export_project_dataset,
 )
@@ -991,6 +993,68 @@ def test_can_export_and_import_dataset_after_deleting_related_storage(
 
             self._test_import_project(admin_user, project_id, "CVAT 1.1", import_data)
 
+    @pytest.mark.parametrize(
+        "dimension, format_name",
+        [
+            *DATUMARO_FORMAT_FOR_DIMENSION.items(),
+            ("2d", "CVAT 1.1"),
+            ("3d", "CVAT 1.1"),
+            ("2d", "COCO 1.0"),
+        ],
+    )
+    def test_cant_import_annotations_as_project(self, admin_user, tasks, format_name, dimension):
+        task = next(t for t in tasks if t.get("size") if t["dimension"] == dimension)
+
+        def _export_task(task_id: int, format_name: str) -> io.BytesIO:
+            with make_api_client(admin_user) as api_client:
+                return io.BytesIO(
+                    export_dataset(
+                        api_client.tasks_api,
+                        api_version=2,
+                        id=task_id,
+                        format=format_name,
+                        save_images=False,
+                    )
+                )
+
+        if format_name in list(DATUMARO_FORMAT_FOR_DIMENSION.values()):
+            with zipfile.ZipFile(_export_task(task["id"], format_name)) as zip_file:
+                annotations = zip_file.read("annotations/default.json")
+
+            dataset_file = io.BytesIO(annotations)
+            dataset_file.name = "annotations.json"
+        elif format_name == "CVAT 1.1":
+            with zipfile.ZipFile(_export_task(task["id"], "CVAT for images 1.1")) as zip_file:
+                annotations = zip_file.read("annotations.xml")
+
+            dataset_file = io.BytesIO(annotations)
+            dataset_file.name = "annotations.xml"
+        elif format_name == "COCO 1.0":
+            with zipfile.ZipFile(_export_task(task["id"], format_name)) as zip_file:
+                annotations = zip_file.read("annotations/instances_default.json")
+
+            dataset_file = io.BytesIO(annotations)
+            dataset_file.name = "annotations.json"
+        else:
+            assert False
+
+        with make_api_client(admin_user) as api_client:
+            project, _ = api_client.projects_api.create(
+                project_write_request=models.ProjectWriteRequest(
+                    name=f"test_annotations_import_as_project {format_name}"
+                )
+            )
+
+            import_data = {"dataset_file": dataset_file}
+
+            with pytest.raises(exceptions.ApiException, match="Dataset file should be zip archive"):
+                self._test_import_project(
+                    admin_user,
+                    project.id,
+                    format_name=format_name,
+                    data=import_data,
+                )
+
     @pytest.mark.parametrize(
         "export_format, subset_path_template",
         [
@@ -1045,10 +1109,7 @@ def test_creates_subfolders_for_subsets_on_export(
                     len([f for f in zip_file.namelist() if f.startswith(folder_prefix)]) > 0
                 ), f"No {folder_prefix} in {zip_file.namelist()}"
 
-    def test_export_project_with_honeypots(
-        self,
-        admin_user: str,
-    ):
+    def test_export_project_with_honeypots(self, admin_user: str):
         project_spec = {
             "name": "Project with honeypots",
             "labels": [{"name": "cat"}],
diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py
@@ -64,9 +64,11 @@
 )
 
 from .utils import (
+    DATUMARO_FORMAT_FOR_DIMENSION,
     CollectionSimpleFilterTestBase,
     compare_annotations,
     create_task,
+    export_dataset,
     export_task_backup,
     export_task_dataset,
     parse_frame_step,
@@ -969,6 +971,46 @@ def test_uses_subset_name(
                     subset_path in path for path in zip_file.namelist()
                 ), f"No {subset_path} in {zip_file.namelist()}"
 
+    @pytest.mark.parametrize(
+        "dimension, mode", [("2d", "annotation"), ("2d", "interpolation"), ("3d", "annotation")]
+    )
+    def test_datumaro_export_without_annotations_includes_image_info(
+        self, admin_user, tasks, mode, dimension
+    ):
+        task = next(
+            t for t in tasks if t.get("size") if t["mode"] == mode if t["dimension"] == dimension
+        )
+
+        with make_api_client(admin_user) as api_client:
+            dataset_file = io.BytesIO(
+                export_dataset(
+                    api_client.tasks_api,
+                    api_version=2,
+                    id=task["id"],
+                    format=DATUMARO_FORMAT_FOR_DIMENSION[dimension],
+                    save_images=False,
+                )
+            )
+
+        with zipfile.ZipFile(dataset_file) as zip_file:
+            annotations = json.loads(zip_file.read("annotations/default.json"))
+
+        assert annotations["items"]
+        for item in annotations["items"]:
+            assert "media" not in item
+
+            if dimension == "2d":
+                assert osp.splitext(item["image"]["path"])[0] == item["id"]
+                assert not Path(item["image"]["path"]).is_absolute()
+                assert tuple(item["image"]["size"]) > (0, 0)
+            elif dimension == "3d":
+                assert osp.splitext(osp.basename(item["point_cloud"]["path"]))[0] == item["id"]
+                assert not Path(item["point_cloud"]["path"]).is_absolute()
+                for related_image in item["related_images"]:
+                    assert not Path(related_image["path"]).is_absolute()
+                    if "size" in related_image:
+                        assert tuple(related_image["size"]) > (0, 0)
+
 
 @pytest.mark.usefixtures("restore_db_per_function")
 @pytest.mark.usefixtures("restore_cvat_data_per_function")
@@ -5181,6 +5223,47 @@ def test_import_annotations_after_deleting_related_cloud_storage(
         task.import_annotations(self.import_format, file_path)
         self._check_annotations(task_id)
 
+    @pytest.mark.parametrize("dimension", ["2d", "3d"])
+    def test_can_import_datumaro_json(self, admin_user, tasks, dimension):
+        task = next(
+            t
+            for t in tasks
+            if t.get("size")
+            if t["dimension"] == dimension and t.get("validation_mode") != "gt_pool"
+        )
+
+        with make_api_client(admin_user) as api_client:
+            original_annotations = json.loads(
+                api_client.tasks_api.retrieve_annotations(task["id"])[1].data
+            )
+
+            dataset_archive = io.BytesIO(
+                export_dataset(
+                    api_client.tasks_api,
+                    api_version=2,
+                    id=task["id"],
+                    format=DATUMARO_FORMAT_FOR_DIMENSION[dimension],
+                    save_images=False,
+                )
+            )
+
+        with zipfile.ZipFile(dataset_archive) as zip_file:
+            annotations = zip_file.read("annotations/default.json")
+
+        with TemporaryDirectory() as tempdir:
+            annotations_path = Path(tempdir) / "annotations.json"
+            annotations_path.write_bytes(annotations)
+            self.client.tasks.retrieve(task["id"]).import_annotations(
+                DATUMARO_FORMAT_FOR_DIMENSION[dimension], annotations_path
+            )
+
+        with make_api_client(admin_user) as api_client:
+            updated_annotations = json.loads(
+                api_client.tasks_api.retrieve_annotations(task["id"])[1].data
+            )
+
+        assert compare_annotations(original_annotations, updated_annotations) == {}
+
     @pytest.mark.parametrize(
         "format_name",
         [
diff --git a/tests/python/rest_api/utils.py b/tests/python/rest_api/utils.py