Merge pull request #81 from I-GUIDE/80-mediaobject-additional-attributes

Additional attributes for MediaObject schema model
I-GUIDE · Apr 17, 2024 · 2952217 · 2952217
2 parents b76d4bf + a84ad13
commit 2952217
Show file tree

Hide file tree

Showing 7 changed files with 221 additions and 8 deletions.
diff --git a/Makefile b/Makefile
@@ -18,10 +18,6 @@ down:
 build:
 	docker-compose build
 
-.PHONY: test
-test:
-	docker-compose exec api pytest tests
-
 .PHONY: format
 format:
 	docker-compose run api $(isort)

diff --git a/api/adapters/hydroshare.py b/api/adapters/hydroshare.py
@@ -127,6 +127,7 @@ def to_dataset_media_object(self):
         media_object.encodingFormat = self.content_type
         media_object.contentSize = f"{self.size/1000.00} KB"
         media_object.name = self.file_name
+        media_object.sha256 = self.checksum
         return media_object
 
 

diff --git a/api/models/schema.py b/api/models/schema.py
@@ -160,6 +160,13 @@ class IsPartOf(CreativeWork):
     )
 
 
+class MediaObjectPartOf(CreativeWork):
+    url: Optional[HttpUrl] = Field(title="URL", description="The URL address to the related metadata document.")
+    description: Optional[str] = Field(
+        description="Information about a related metadata document."
+    )
+
+
 class SubjectOf(CreativeWork):
     url: Optional[HttpUrl] = Field(
         title="URL",
@@ -410,6 +417,11 @@ class MediaObject(SchemaBaseModel):
         title="Source organization",
         description="The organization that provided the media object."
     )
+    sha256: Optional[str] = Field(title="SHA-256", description="The SHA-256 hash of the media object.")
+    isPartOf: Optional[List[MediaObjectPartOf]] = Field(
+        title="Is part of",
+        description="Link to or citation for a related metadata document that this media object is a part of",
+    )
 
     @validator('contentSize')
     def validate_content_size(cls, v):
@@ -438,6 +450,15 @@ def validate_content_size(cls, v):
 
         return v
 
+    # TODO: SHA-256 format validation is disabled for now because HydroShare content-file checksums are MD5 hashes
+    # @validator('sha256')
+    # def validate_sha256_string_format(cls, v):
+    #     if v:
+    #         v = v.strip()
+    #         if v and not re.match(r"^[a-fA-F0-9]{64}$", v):
+    #             raise ValueError('invalid SHA-256 format')
+    #     return v
+
 
 class CoreMetadata(SchemaBaseModel):
     context: HttpUrl = Field(

diff --git a/api/models/schemas/schema.json b/api/models/schemas/schema.json
@@ -1655,11 +1655,56 @@
             "required": [
               "name"
             ]
+          },
+          "sha256": {
+            "title": "SHA-256",
+            "description": "The SHA-256 hash of the media object.",
+            "type": "string"
+          },
+          "isPartOf": {
+            "title": "Is part of",
+            "description": "Link to or citation for a related metadata document that this media object is a part of",
+            "type": "array",
+            "items": {
+              "title": "MediaObjectPartOf",
+              "type": "object",
+              "properties": {
+                "@type": {
+                  "title": "@Type",
+                  "description": "Submission type can include various forms of content, such as datasets, software source code, digital documents, etc.",
+                  "default": "CreativeWork",
+                  "type": "string"
+                },
+                "name": {
+                  "title": "Name or title",
+                  "description": "Submission's name or title",
+                  "type": "string"
+                },
+                "url": {
+                  "title": "URL",
+                  "description": "The URL address to the related metadata document.",
+                  "minLength": 1,
+                  "maxLength": 2083,
+                  "type": "string",
+                  "pattern": "^(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?(\\/.*)?$",
+                  "errorMessage": {
+                    "pattern": "must match format \"url\""
+                  }
+                },
+                "description": {
+                  "title": "Description",
+                  "description": "Information about a related metadata document.",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "name"
+              ]
+            }
           }
         },
         "required": [
           "contentUrl",
-          "encodingFormat",
           "contentSize",
           "name"
         ]
@@ -2813,6 +2858,42 @@
         "name"
       ]
     },
+    "MediaObjectPartOf": {
+      "title": "MediaObjectPartOf",
+      "type": "object",
+      "properties": {
+        "@type": {
+          "title": "@Type",
+          "description": "Submission type can include various forms of content, such as datasets, software source code, digital documents, etc.",
+          "default": "CreativeWork",
+          "type": "string"
+        },
+        "name": {
+          "title": "Name or title",
+          "description": "Submission's name or title",
+          "type": "string"
+        },
+        "url": {
+          "title": "URL",
+          "description": "The URL address to the related metadata document.",
+          "minLength": 1,
+          "maxLength": 2083,
+          "type": "string",
+          "pattern": "^(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?(\\/.*)?$",
+          "errorMessage": {
+            "pattern": "must match format \"url\""
+          }
+        },
+        "description": {
+          "title": "Description",
+          "description": "Information about a related metadata document.",
+          "type": "string"
+        }
+      },
+      "required": [
+        "name"
+      ]
+    },
     "MediaObject": {
       "title": "MediaObject",
       "type": "object",
@@ -3490,11 +3571,56 @@
           "required": [
             "name"
           ]
+        },
+        "sha256": {
+          "title": "SHA-256",
+          "description": "The SHA-256 hash of the media object.",
+          "type": "string"
+        },
+        "isPartOf": {
+          "title": "Is part of",
+          "description": "Link to or citation for a related metadata document that this media object is a part of",
+          "type": "array",
+          "items": {
+            "title": "MediaObjectPartOf",
+            "type": "object",
+            "properties": {
+              "@type": {
+                "title": "@Type",
+                "description": "Submission type can include various forms of content, such as datasets, software source code, digital documents, etc.",
+                "default": "CreativeWork",
+                "type": "string"
+              },
+              "name": {
+                "title": "Name or title",
+                "description": "Submission's name or title",
+                "type": "string"
+              },
+              "url": {
+                "title": "URL",
+                "description": "The URL address to the related metadata document.",
+                "minLength": 1,
+                "maxLength": 2083,
+                "type": "string",
+                "pattern": "^(http:\\/\\/www\\.|https:\\/\\/www\\.|http:\\/\\/|https:\\/\\/)?[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}(:[0-9]{1,5})?(\\/.*)?$",
+                "errorMessage": {
+                  "pattern": "must match format \"url\""
+                }
+              },
+              "description": {
+                "title": "Description",
+                "description": "Information about a related metadata document.",
+                "type": "string"
+              }
+            },
+            "required": [
+              "name"
+            ]
+          }
         }
       },
       "required": [
         "contentUrl",
-        "encodingFormat",
         "contentSize",
         "name"
       ]

diff --git a/tests/data/core_metadata.json b/tests/data/core_metadata.json
@@ -80,7 +80,8 @@
 		"geo": {
 			"@type": "GeoShape",
 			"box": "40.1126 -88.2249 40.1126 -88.2249"
-		}
+		},
+		"additionalProperty": []
 	},
 	"hasPart": [
 		{
@@ -104,7 +105,14 @@
 			"contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/USGS_Harvey_gages_TxLaMsAr.csv",
 			"encodingFormat": "text/csv",
 			"contentSize": "0.17 MB",
-			"name": "USGS gage locations within the Harvey-affected areas in Texas"
+			"name": "USGS gage locations within the Harvey-affected areas in Texas",
+			"sha256": "830f4b50e78e8a8fb0f7eee7369171dacbcaa43cc2c4deb59cef8e4fd2f641c5",
+			"additionalProperty": [],
+			"variableMeasured": null,
+			"spatialCoverage": null,
+			"temporalCoverage": null,
+			"sourceOrganization": null,
+			"isPartOf": null
 		}
 	],
 	"citation": ["Citation for the dataset"]

diff --git a/tests/test_core_schema.py b/tests/test_core_schema.py
@@ -286,13 +286,15 @@ async def test_core_schema_associated_media_cardinality(core_data, core_model, m
                 "contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/USGS_Harvey_gages_TxLaMsAr.csv",
                 "encodingFormat": "text/csv",
                 "contentSize": "0.17 GB",
+                "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e19",
                 "name": "USGS gage locations within the Harvey-affected areas in Texas",
             },
             {
                 "@type": "VideoObject",
                 "contentUrl": "https://www.hydroshare.org/resource/81cb3f6c0dde4433ae4f43a26a889864/data/contents/HydroClientMovie.mp4",
                 "encodingFormat": "video/mp4",
                 "contentSize": "79.2 MB",
+                "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e20",
                 "name": "HydroClient Video",
             },
         ]
@@ -306,6 +308,7 @@ async def test_core_schema_associated_media_cardinality(core_data, core_model, m
                 "contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/USGS_Harvey_gages_TxLaMsAr.csv",
                 "encodingFormat": "text/csv",
                 "contentSize": "0.17 MB",
+                "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e19",
                 "name": "USGS gage locations within the Harvey-affected areas in Texas",
             }
         ]
@@ -326,11 +329,17 @@ async def test_core_schema_associated_media_cardinality(core_data, core_model, m
         assert core_model_instance.associatedMedia[1].contentSize == associated_media[1]["contentSize"]
         assert core_model_instance.associatedMedia[0].encodingFormat == associated_media[0]["encodingFormat"]
         assert core_model_instance.associatedMedia[1].encodingFormat == associated_media[1]["encodingFormat"]
+        assert core_model_instance.associatedMedia[0].contentUrl == associated_media[0]["contentUrl"]
+        assert core_model_instance.associatedMedia[1].contentUrl == associated_media[1]["contentUrl"]
+        assert core_model_instance.associatedMedia[0].sha256 == associated_media[0]["sha256"]
+        assert core_model_instance.associatedMedia[1].sha256 == associated_media[1]["sha256"]
     elif multiple_media is not None:
         assert core_model_instance.associatedMedia[0].type == associated_media[0]["@type"]
         assert core_model_instance.associatedMedia[0].name == associated_media[0]["name"]
         assert core_model_instance.associatedMedia[0].contentSize == associated_media[0]["contentSize"]
         assert core_model_instance.associatedMedia[0].encodingFormat == associated_media[0]["encodingFormat"]
+        assert core_model_instance.associatedMedia[0].contentUrl == associated_media[0]["contentUrl"]
+        assert core_model_instance.associatedMedia[0].sha256 == associated_media[0]["sha256"]
 
 
 @pytest.mark.parametrize(
@@ -369,6 +378,7 @@ async def test_core_schema_associated_media_content_size(
             "contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/USGS_Harvey_gages_TxLaMsAr.csv",
             "encodingFormat": "text/csv",
             "contentSize": content_size_format,
+            "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e19",
             "name": "USGS gage locations within the Harvey-affected areas in Texas",
         }
     ]
@@ -397,6 +407,7 @@ async def test_core_schema_associated_media_encoding_format_optional(
             "@type": "MediaObject",
             "contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/USGS_Harvey_gages_TxLaMsAr.csv",
             "contentSize": "100.17 KB",
+            "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e19",
             "name": "USGS gage locations within the Harvey-affected areas in Texas",
         }
     ]
@@ -406,6 +417,53 @@ async def test_core_schema_associated_media_encoding_format_optional(
     assert core_model_instance.associatedMedia[0].encodingFormat is None
 
 
+@pytest.mark.parametrize("set_is_part_of", [True, False])
+@pytest.mark.asyncio
+async def test_core_schema_associated_media_is_part_of_optional(
+    core_data, core_model, set_is_part_of
+):
+    """Test that a core metadata pydantic model can be created from core metadata json.
+    Purpose of the test is to validate core metadata schema as defined by the pydantic model where we are testing
+    that isPartOf attribute of the associatedMedia property is optional.
+    Note: This test does not add a record to the database.
+    """
+
+    core_data = core_data
+    core_model = core_model
+
+    core_data["associatedMedia"] = [
+        {
+            "@type": "MediaObject",
+            "contentUrl": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/logan.nc",
+            "contentSize": "100.17 KB",
+            "encodingFormat": "application/x-netcdf",
+            "sha256": "2fba6f2ebac562dac6a57acf0fdc5fdfabc9654b3c910aa6ef69cf4385997e19",
+            "name": "logan.nc",
+        }
+    ]
+
+    if set_is_part_of:
+        core_data["associatedMedia"][0]["isPartOf"] = [
+            {
+                "@type": "CreativeWork",
+                "name": "logan.nc.json",
+                "url": "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/logan.nc.json",
+            }
+        ]
+    # validate the data model
+    core_model_instance = await utils.validate_data_model(core_data, core_model)
+    if set_is_part_of:
+        assert len(core_model_instance.associatedMedia[0].isPartOf) == 1
+        assert core_model_instance.associatedMedia[0].isPartOf[0].type == "CreativeWork"
+        assert core_model_instance.associatedMedia[0].isPartOf[0].name == "logan.nc.json"
+        assert (
+            core_model_instance.associatedMedia[0].isPartOf[0].url
+            == "https://www.hydroshare.org/resource/51d1539bf6e94b15ac33f7631228118c/data/contents/logan.nc.json"
+        )
+    else:
+        assert core_model_instance.associatedMedia[0].isPartOf is None
+
+
 @pytest.mark.parametrize("set_additional_property", [True, False])
 @pytest.mark.asyncio
 async def test_core_schema_associated_media_additional_property(

diff --git a/tests/test_hydroshare_meta_adapter.py b/tests/test_hydroshare_meta_adapter.py
@@ -86,15 +86,18 @@ async def test_hydroshare_resource_meta_adapter(hydroshare_resource_metadata, co
             assert media.contentUrl == f"{media_base_url}/model-program/V.dat"
             assert media.encodingFormat == "None"
             assert media.contentSize == "124.144 KB"
+            assert media.sha256 == "a0b00d911d09e69bdbee0033e40414f9"
         elif media.name == "Qsi.nc":
             assert media.contentUrl == f"{media_base_url}/model-program/Qsi.nc"
             assert media.encodingFormat == "application/x-netcdf"
             assert media.contentSize == "20.144 KB"
+            assert media.sha256 == "93b546c41fca467496900d0f2415c1de"
         else:
             assert media.name == "README.md"
             assert media.contentUrl == f"{media_base_url}/README.md"
             assert media.encodingFormat == "text/markdown"
             assert media.contentSize == "4.422 KB"
+            assert media.sha256 == "7d460cb12903a965d144cddcb2b62eac"
 
 
 @pytest.mark.asyncio