Skip to content

Commit

Permalink
add ORCID index validation to Datacite and OpenAIRE processors
Browse files Browse the repository at this point in the history
  • Loading branch information
arcangelo7 committed Jan 14, 2025
1 parent 1725ccb commit 26d24f5
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 15 deletions.
28 changes: 23 additions & 5 deletions oc_ds_converter/datacite/datacite_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,19 +718,37 @@ def add_authors_to_agent_list(self, item: dict, ag_list: list) -> list:
return agent_list

#added
def find_datacite_orcid(self, all_author_ids):
def find_datacite_orcid(self, all_author_ids, doi=None):
"""Find and validate ORCID from Datacite data
Args:
all_author_ids (list): List of ORCID identifiers
doi (str, optional): DOI to check in ORCID index. Defaults to None.
"""
orcid = ""
if all_author_ids:
for identifier in all_author_ids:
norm_orcid = self.orcid_m.normalise(identifier, include_prefix = True)
norm_orcid = self.orcid_m.normalise(identifier, include_prefix=True)
## Check orcid presence in memory and storage before validating the id
validity_value_orcid = self.validated_as({"identifier": norm_orcid, "schema": "orcid"})
if validity_value_orcid is True:
orcid = norm_orcid
break
elif validity_value_orcid is None:
norm_id_dict = {"id": norm_orcid, "schema": "orcid"}
if norm_orcid in self.to_validated_id_list(norm_id_dict):
orcid = norm_orcid
# Check in ORCID index using provided DOI before any API validation
if doi:
found_orcids = self.orcid_finder(doi)
if found_orcids and norm_orcid.split(':')[1] in found_orcids:
self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
orcid = norm_orcid

# If not found in index, proceed with normal validation
if not orcid:
norm_id_dict = {"id": norm_orcid, "schema": "orcid"}
if norm_orcid in self.to_validated_id_list(norm_id_dict):
orcid = norm_orcid
break

return orcid


Expand Down
25 changes: 16 additions & 9 deletions oc_ds_converter/openaire/openaire_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def add_authors_to_agent_list(self, item: dict, ag_list: list) -> list:

return agent_list

def find_openaire_orcid(self, all_author_ids):
def find_openaire_orcid(self, all_author_ids, doi=None):
orcid = ""
if all_author_ids:
for id in all_author_ids:
Expand All @@ -618,19 +618,26 @@ def find_openaire_orcid(self, all_author_ids):
if isinstance(schema, str):
if schema.lower().strip() == "orcid":
if isinstance(identifier, str):
norm_orcid = self.orcid_m.normalise(identifier, include_prefix =True)
norm_orcid = self.orcid_m.normalise(identifier, include_prefix=True)
## Check orcid presence in memory and storage before validating the id
validity_value_orcid = self.validated_as({"identifier":norm_orcid, "schema": schema})
if validity_value_orcid is True:
orcid = norm_orcid
elif validity_value_orcid is None:
#if self.RA_redis.get(norm_orcid):
if norm_orcid in self._redis_values_ra:
orcid = norm_orcid
# if the id is not in redis db, validate it before appending
elif self.tmp_orcid_m.is_valid(norm_orcid):
orcid = norm_orcid

# Check in ORCID index using provided DOI before any API validation
if doi:
found_orcids = self.orcid_finder(doi)
if found_orcids and norm_orcid.split(':')[1] in found_orcids:
self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
orcid = norm_orcid

# If not found in index, check Redis and API
if not orcid:
if norm_orcid in self._redis_values_ra:
orcid = norm_orcid
# if the id is not in redis db, validate it before appending
elif self.tmp_orcid_m.is_valid(norm_orcid):
orcid = norm_orcid

return orcid

Expand Down
48 changes: 47 additions & 1 deletion test/datacite_processing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,4 +1071,50 @@ def test_get_agents_strings_list_inverted_names(self):
authors_strings_list, _ = datacite_processor.get_agents_strings_list('10.12753/2066-026x-14-246',
agents_list)
expected_authors_list = ['Viorel, Cojocaru', 'Cojocaru, John', 'Ciprian, Panait']
self.assertEqual(authors_strings_list, expected_authors_list)
self.assertEqual(authors_strings_list, expected_authors_list)

def test_find_datacite_orcid_with_index(self):
    """Check that find_datacite_orcid consults the ORCID index before API validation.

    An index entry linking a test DOI to a test ORCID is registered on a
    fresh DataciteProcessing instance, then find_datacite_orcid is called
    with several inputs, both with and without the DOI argument.

    The cases run in a fixed order on purpose: case 1 writes the test ORCID
    to the temporary storage, and later cases run against that state.
    Storage is deleted in a ``finally`` clause so that a failing assertion
    does not leave data behind for other tests.
    """
    # Setup
    test_doi = "10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create DataciteProcessing instance with ORCID index
    dp = DataciteProcessing()
    try:
        dp.orcid_index.add_value(test_doi, f"{test_name} [orcid:{test_orcid}]")

        # Test Case 1: ORCID found in index
        inp_1 = [test_orcid]
        out_1 = dp.find_datacite_orcid(inp_1, test_doi)
        exp_1 = f"orcid:{test_orcid}"
        self.assertEqual(out_1, exp_1)
        # Verify it was added to temporary storage
        self.assertTrue(dp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

        # Test Case 2: ORCID not in index but expected to be valid via API
        # NOTE(review): presumably requires network access to the ORCID API - confirm.
        inp_2 = ["0000-0003-4082-1500"]
        out_2 = dp.find_datacite_orcid(inp_2, test_doi)
        exp_2 = "orcid:0000-0003-4082-1500"
        self.assertEqual(out_2, exp_2)

        # Test Case 3: ORCID not in index and invalid
        inp_3 = ["0000-0000-0000-0000"]
        out_3 = dp.find_datacite_orcid(inp_3, test_doi)
        exp_3 = ""
        self.assertEqual(out_3, exp_3)

        # Test Case 4: Valid ORCID but no DOI provided (backward compatibility)
        # NOTE(review): the check digit of the made-up test ORCID does not match
        # the ISO 7064 11,2 checksum (it would be 7, not 8), so this case
        # presumably passes because case 1 cached the value in temporary
        # storage rather than through API validation - confirm.
        inp_4 = [test_orcid]
        out_4 = dp.find_datacite_orcid(inp_4)  # No DOI
        exp_4 = f"orcid:{test_orcid}"
        self.assertEqual(out_4, exp_4)

        # Test Case 5: Multiple ORCIDs, first one valid
        inp_5 = [test_orcid, "0000-0000-0000-0000"]
        out_5 = dp.find_datacite_orcid(inp_5, test_doi)
        exp_5 = f"orcid:{test_orcid}"
        self.assertEqual(out_5, exp_5)
    finally:
        # Cleanup runs even when an assertion above fails
        dp.storage_manager.delete_storage()
41 changes: 41 additions & 0 deletions test/processing_oroci_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1616,6 +1616,47 @@ def update_redis_values_real_redis(self):
def test_update_redis_values_real_redis(self):
self.real_redis_test_case(self.update_redis_values_real_redis)

def test_find_openaire_orcid_with_index(self):
    """Check that find_openaire_orcid consults the ORCID index before API validation.

    An index entry linking a test DOI to a test ORCID is registered on a
    fresh OpenaireProcessing instance, then find_openaire_orcid is called
    with identifier dicts (``identifier`` / ``schema`` keys), both with and
    without the DOI argument.

    The cases run in a fixed order on purpose: case 1 writes the test ORCID
    to the temporary storage, and later cases run against that state.
    Storage is deleted in a ``finally`` clause so that a failing assertion
    does not leave data behind for other tests.
    """
    # Setup
    test_doi = "10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create OpenaireProcessing instance with ORCID index
    op = OpenaireProcessing()
    try:
        # Format used for add_value here: DOI string -> "<name> [orcid:<id>]"
        op.orcid_index.add_value(test_doi, f"{test_name} [orcid:{test_orcid}]")

        # Test Case 1: ORCID found in index
        inp_1 = [{'identifier': test_orcid, 'schema': 'ORCID'}]
        out_1 = op.find_openaire_orcid(inp_1, test_doi)
        exp_1 = f"orcid:{test_orcid}"
        self.assertEqual(out_1, exp_1)
        # Verify it was added to temporary storage
        self.assertTrue(op.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

        # Test Case 2: ORCID not in index but expected to be valid via API
        # NOTE(review): presumably requires network access to the ORCID API - confirm.
        inp_2 = [{'identifier': '0000-0003-4082-1500', 'schema': 'ORCID'}]
        out_2 = op.find_openaire_orcid(inp_2, test_doi)
        exp_2 = "orcid:0000-0003-4082-1500"
        self.assertEqual(out_2, exp_2)

        # Test Case 3: ORCID not in index and invalid
        inp_3 = [{'identifier': '0000-0000-0000-0000', 'schema': 'ORCID'}]
        out_3 = op.find_openaire_orcid(inp_3, test_doi)
        exp_3 = ""
        self.assertEqual(out_3, exp_3)

        # Test Case 4: Valid ORCID but no DOI provided
        # NOTE(review): the check digit of the made-up test ORCID does not match
        # the ISO 7064 11,2 checksum (it would be 7, not 8), so this case
        # presumably passes because case 1 cached the value in temporary
        # storage rather than through API validation - confirm.
        inp_4 = [{'identifier': test_orcid, 'schema': 'ORCID'}]
        out_4 = op.find_openaire_orcid(inp_4)  # No DOI
        exp_4 = f"orcid:{test_orcid}"
        self.assertEqual(out_4, exp_4)
    finally:
        # Cleanup runs even when an assertion above fails
        op.storage_manager.delete_storage()



if __name__ == '__main__':
Expand Down

0 comments on commit 26d24f5

Please sign in to comment.