Skip to content

Commit

Permalink
Merge pull request #10 from eliarizzetto/main
Browse files Browse the repository at this point in the history
Add support for Crossref member IDs to oc_idmanager
  • Loading branch information
ariannamorettj authored Apr 8, 2024
2 parents d57581a + 55b842f commit accbd54
Show file tree
Hide file tree
Showing 4 changed files with 318 additions and 1 deletion.
1 change: 1 addition & 0 deletions oc_ds_converter/oc_idmanager/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@
from oc_ds_converter.oc_idmanager.wikidata import WikidataManager
from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager
from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager
from oc_ds_converter.oc_idmanager.crossref import CrossrefManager

107 changes: 107 additions & 0 deletions oc_ds_converter/oc_idmanager/crossref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!python
# Copyright 2019, Silvio Peroni <essepuntato@gmail.com>
# Copyright 2022, Giuseppe Grieco <giuseppe.grieco3@unibo.it>, Arianna Moretti <arianna.moretti4@unibo.it>, Elia Rizzetto <elia.rizzetto@studio.unibo.it>, Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

from oc_ds_converter.oc_idmanager.base import IdentifierManager
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
from oc_ds_converter.oc_idmanager.support import call_api
from re import sub, match
from typing import Optional


class CrossrefManager(IdentifierManager):
    """Identifier manager for Crossref member identifiers.

    A Crossref member ID is handled as a string of digits, optionally
    carrying the ``crossref:`` prefix (e.g. ``crossref:297``). Validation
    results are cached in the configured storage manager.
    """

    def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None):
        """Crossref member ID manager constructor.

        Args:
            use_api_service: when True, ``exists`` queries the Crossref
                members API; when False, ``exists`` performs no request.
            storage_manager: storage used to cache validation results. When
                None, a fresh ``InMemoryStorageManager`` is created.
        """
        super(CrossrefManager, self).__init__()
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._api = "https://api.crossref.org/members/"
        # NOTE(review): the three OpenAlex values below are not referenced
        # anywhere in this class; they look like copy-paste leftovers and are
        # kept only so that external code reading them does not break.
        self._api_works_route = r"https://api.openalex.org/works/"
        self._api_sources_route = r"https://api.openalex.org/sources/"
        self._url_id_pref = "https://openalex.org/"
        self._use_api_service = use_api_service
        self._p = "crossref:"

    def is_valid(self, cr_member_id, get_extra_info=False):
        """Tell whether a Crossref member ID is valid, using the cache first.

        Args:
            cr_member_id: the member ID, with or without the ``crossref:``
                prefix.
            get_extra_info: when True and the ID is not already cached, the
                result is a ``(bool, dict)`` tuple instead of a bare bool.

        Returns:
            ``False`` when the ID cannot be normalised; the cached boolean
            when the storage manager already holds one (even if
            ``get_extra_info`` is True); otherwise the freshly computed
            validity, as a bool or as a ``(bool, dict)`` tuple.
        """
        cr_member_id = self.normalise(cr_member_id, include_prefix=True)
        if cr_member_id is None:
            return False

        id_validation_value = self.storage_manager.get_value(cr_member_id)
        if isinstance(id_validation_value, bool):
            return id_validation_value

        if get_extra_info:
            info = self.exists(cr_member_id, get_extra_info=True)
            self.storage_manager.set_full_value(cr_member_id, info[1])
            return (info[0] and self.syntax_ok(cr_member_id)), info[1]

        validity_check = self.exists(cr_member_id) and self.syntax_ok(cr_member_id)
        self.storage_manager.set_value(cr_member_id, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Reduce an input string to the digits of a Crossref member ID.

        The ``crossref:`` prefix is dropped if present, then every non-digit
        character is removed (so a full API URL or a padded string collapses
        to its digits).

        Args:
            id_string: the raw identifier.
            include_prefix: when True the result is prefixed with
                ``crossref:``.

        Returns:
            The normalised string, or None when ``id_string`` is not a
            string-like value that can be processed. Note that an input
            without any digit yields an empty ID (``""`` or ``"crossref:"``),
            not None.
        """
        try:
            if id_string.startswith(self._p):
                member_digits = id_string[len(self._p):]
            else:
                member_digits = id_string

            member_digits = sub(r"\D", "", member_digits)

            return "%s%s" % (
                self._p if include_prefix else "",
                member_digits.strip(),
            )
        except (AttributeError, TypeError):
            # Non-string input (None, int, bytes, ...) cannot be a Crossref
            # member ID: signal the failure with None instead of raising.
            return None

    def syntax_ok(self, id_string):
        """Check that the ID is ``crossref:`` followed by one or more digits.

        Args:
            id_string: the identifier; the prefix is added when missing.

        Returns:
            True when the (prefixed) identifier matches ``^crossref:\\d+$``.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^crossref:\d+$", id_string))

    def exists(self, cr_member_id_full, get_extra_info=False, allow_extra_api=None):
        """Check whether a Crossref member ID exists according to the API.

        Args:
            cr_member_id_full: the member ID, with or without prefix.
            get_extra_info: when True, return ``(bool, dict)`` where the dict
                has the keys ``id`` and ``valid``.
            allow_extra_api: accepted for signature compatibility; not used
                by this method.

        Returns:
            When the API service is disabled, True (no check is possible).
            When it is enabled: False if the ID normalises to nothing or the
            API call yields a falsy response; otherwise whether the ID
            returned by the API equals the requested one.
        """
        valid_bool = True
        if not cr_member_id_full.startswith(self._p):
            cr_member_id_full = self._p + cr_member_id_full

        if self._use_api_service:
            # Unprefixed digits, or None/"" when nothing usable is left.
            cr_member_id = self.normalise(cr_member_id_full)
            if not cr_member_id:
                return (False, {'id': None, 'valid': False}) if get_extra_info else False

            pref_cr_member_id = self._p + cr_member_id
            # NOTE(review): self._headers is not set in this class; presumably
            # it is provided by IdentifierManager.__init__ - confirm.
            json_res = call_api(url=self._api + cr_member_id, headers=self._headers)
            if json_res:
                # Tolerate an unexpected payload shape: treat it as "not
                # confirmed" rather than crashing the validation.
                try:
                    returned_id = json_res['message']['id']
                except (KeyError, TypeError):
                    returned_id = None
                # Compared as strings, so the requested digits must match the
                # returned ID verbatim (leading zeros included).
                valid_bool = returned_id is not None and str(returned_id) == cr_member_id
                if get_extra_info:
                    extra_info_result = {'id': pref_cr_member_id, 'valid': valid_bool}
                    return valid_bool, extra_info_result
                return valid_bool
            # Falsy API response: the ID could not be confirmed.
            valid_bool = False

        return (valid_bool, {'id': cr_member_id_full, 'valid': valid_bool}) if get_extra_info else valid_bool

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dictionary for an API response.

        Currently a stub: none of the arguments is inspected and the result
        is always ``{"valid": True}``.

        Args:
            api_response: the API response (currently unused).
            choose_api: currently unused.
            info_dict: currently unused; defaults to None rather than a
                shared mutable dict.

        Returns:
            A new dict ``{"valid": True}``.
        """
        result = {}
        result["valid"] = True
        # to be implemented
        return result
8 changes: 7 additions & 1 deletion test/data/glob.json
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,13 @@

"openalex:W7836728310": {"id": "openalex:W7836728310", "valid": false},

"openalex:S4263287381": {"id": "openalex:S4263287381", "valid": false}
"openalex:S4263287381": {"id": "openalex:S4263287381", "valid": false},

"crossref:297": {"id": "crossref:297", "valid": true},

"crossref:4443": {"id": "crossref:4443", "valid": true},

"crossref:342427": {"id": "crossref:342427", "valid": false},

"crossref:0123": {"id": "crossref:0123", "valid": false}
}
203 changes: 203 additions & 0 deletions test/idm_crossref_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import json
import sqlite3
import os.path
import unittest
from os import makedirs
from os.path import exists, join

import xmltodict
from oc_ds_converter.oc_idmanager import *
from oc_ds_converter.oc_idmanager.base import IdentifierManager
from requests import ReadTimeout, get
from requests.exceptions import ConnectionError
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager

class CrossrefIdentifierManagerTest(unittest.TestCase):
    """Tests for the Crossref member ID manager (``CrossrefManager``).

    Several tests create a manager with the API service enabled, and all of
    them read the fixture ``test/data/glob.json`` in ``setUp``.
    """

    def setUp(self):
        """Load the JSON fixture and define the sample member IDs."""
        if not exists("tmp"):
            makedirs("tmp")

        self.test_dir = join("test", "data")
        self.test_json_path = join(self.test_dir, "glob.json")
        with open(self.test_json_path, encoding="utf-8") as fp:
            self.data = json.load(fp)

        self.valid_crmid1 = "297"
        self.valid_crmid2 = "4443"
        self.invalid_crmid1 = "342427"
        self.invalid_crmid2 = "0123"

    def test_crossref_is_valid(self):
        """is_valid with the API, and with a support file but no API."""
        crmngr_nofile = CrossrefManager()
        self.assertTrue(crmngr_nofile.is_valid(self.valid_crmid1))
        self.assertTrue(crmngr_nofile.is_valid(self.valid_crmid2))
        self.assertFalse(crmngr_nofile.is_valid(self.invalid_crmid1))
        self.assertFalse(crmngr_nofile.is_valid(self.invalid_crmid2))

        crmngr_file = CrossrefManager(use_api_service=False, storage_manager=InMemoryStorageManager(self.test_json_path))
        self.assertIn(crmngr_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(crmngr_file.normalise(self.invalid_crmid1, include_prefix=True), self.data)
        self.assertTrue(crmngr_file.is_valid(self.valid_crmid1))
        self.assertFalse(crmngr_file.is_valid(self.invalid_crmid1))

        crmngr_nofile_noapi = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertTrue(crmngr_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(crmngr_nofile_noapi.is_valid(self.valid_crmid2))

    def test_exists(self):
        """exists returns a tuple or a bare bool depending on get_extra_info."""
        with self.subTest(msg="get_extra_info=True, allow_extra_api=None"):
            crmngr = CrossrefManager()
            output = crmngr.exists(self.valid_crmid1, get_extra_info=True, allow_extra_api=None)
            expected_output = (True, {'id': 'crossref:' + self.valid_crmid1, 'valid': True})
            self.assertEqual(expected_output[0], output[0])
            self.assertEqual(expected_output[1], output[1])

        with self.subTest(msg="get_extra_info=False, allow_extra_api=None"):
            crmngr = CrossrefManager()
            output = crmngr.exists(self.valid_crmid2, get_extra_info=False, allow_extra_api=None)
            expected_output = True
            self.assertEqual(output, expected_output)

    def test_crossref_normalise(self):
        """normalise strips the prefix, whitespace and any URL around the digits."""
        crmngr = CrossrefManager()

        self.assertEqual(
            self.valid_crmid1, crmngr.normalise("crossref:" + self.valid_crmid1)
        )
        # "297".replace("", " ") interleaves spaces: " 2 9 7 "
        self.assertEqual(
            self.valid_crmid1, crmngr.normalise(self.valid_crmid1.replace("", " "))
        )
        self.assertEqual(
            self.valid_crmid1,
            crmngr.normalise("https://api.crossref.org/members/" + self.valid_crmid1),
        )
        self.assertEqual(
            crmngr.normalise(self.valid_crmid1),
            crmngr.normalise(' ' + self.valid_crmid1),
        )
        self.assertEqual(
            crmngr.normalise(self.valid_crmid2),
            crmngr.normalise("https://api.crossref.org/members/" + self.valid_crmid2),
        )

        dm_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(dm_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(dm_file.normalise(self.invalid_crmid2, include_prefix=True), self.data)
        self.assertTrue(dm_file.is_valid(self.valid_crmid1))
        self.assertFalse(dm_file.is_valid(self.invalid_crmid2))

    def test_crossref_default(self):
        """Default manager: in-memory storage, API enabled, file written on demand."""
        mngr = CrossrefManager()
        # No support files (it generates it)
        # Default storage manager : in Memory + generates file on method call (not automatically)
        # uses API
        self.assertTrue(mngr.is_valid(self.valid_crmid1))
        self.assertTrue(mngr.is_valid(self.valid_crmid2))
        self.assertFalse(mngr.is_valid(self.invalid_crmid2))
        self.assertFalse(mngr.is_valid(self.invalid_crmid1))
        mngr.storage_manager.store_file()
        validated_ids = [self.valid_crmid1, self.valid_crmid2, self.invalid_crmid1, self.invalid_crmid2]
        validated = [mngr.normalise(x, include_prefix=True) for x in validated_ids if mngr.normalise(x, include_prefix=True)]
        # check that the support file was correctly created
        self.assertTrue(os.path.exists("storage/id_value.json"))
        # the context manager closes the file even if json.load raises
        with open("storage/id_value.json") as lj:
            load_dict = json.load(lj)
        stored = [mngr.normalise(x, include_prefix=True) for x in load_dict if mngr.normalise(x, include_prefix=True)]

        # check that all the validated ids are stored in the json file
        self.assertTrue(all(x in stored for x in validated))
        mngr.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_value.json"))

    def test_crossref_memory_file_noapi(self):
        """In-memory storage loaded from the support file, API disabled."""
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # does not use API (so a syntactically correct id is considered to be valid)
        am_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(am_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(am_file.normalise(self.invalid_crmid2, include_prefix=True), self.data)
        self.assertFalse(am_file.is_valid(self.invalid_crmid2))  # is stored in support file as invalid

    def test_crossref_memory_file_api(self):
        """In-memory storage loaded from the support file, API enabled."""
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # uses API (so a syntactically correct id which is not valid is considered to be invalid)
        am_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=True)
        self.assertFalse(am_file.is_valid(self.invalid_crmid1))

    def test_crossref_memory_nofile_noapi(self):
        """Empty in-memory storage, API disabled: only the syntax is checked."""
        # Does not use support file
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = CrossrefManager(storage_manager=InMemoryStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_crmid1))
        am_nofile_noapi.storage_manager.delete_storage()

    def test_crossref_sqlite_nofile_api(self):
        """SQLite storage created from scratch, API enabled."""
        # No support files (it generates it)
        # storage manager : SqliteStorageManager
        # uses API
        sql_am_nofile = CrossrefManager(storage_manager=SqliteStorageManager())
        self.assertTrue(sql_am_nofile.is_valid(self.valid_crmid1))
        self.assertTrue(sql_am_nofile.is_valid(self.valid_crmid2))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_crmid1))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_crmid2))
        # check that the support db was correctly created and that it contains all the validated ids
        self.assertTrue(os.path.exists("storage/id_valid_dict.db"))
        validated_ids = [self.valid_crmid1, self.valid_crmid2, self.invalid_crmid1, self.invalid_crmid2]
        all_ids_stored = sql_am_nofile.storage_manager.get_all_keys()
        # check that all the validated ids are stored in the db
        stored = [x for x in all_ids_stored]
        validated = [sql_am_nofile.normalise(x, include_prefix=True) for x in validated_ids if sql_am_nofile.normalise(x, include_prefix=True)]
        self.assertTrue(all(x in stored for x in validated))
        sql_am_nofile.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_valid_dict.db"))

    def test_crossref_sqlite_file_api(self):
        """SQLite support db populated with the API, then read back without it."""
        # Step 1: build a fresh support db, validating the ids through the API
        # Step 2: re-open the same db with the API disabled, so stored ids keep
        # their stored validity and unknown ids are judged on syntax only
        test_sqlite_db = os.path.join(self.test_dir, "database.db")
        if os.path.exists(test_sqlite_db):
            os.remove(test_sqlite_db)
        to_insert = [self.invalid_crmid1, self.valid_crmid1]
        sql_file = CrossrefManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=True)
        for raw_id in to_insert:
            norm_id = sql_file.normalise(raw_id, include_prefix=True)
            is_valid = 1 if sql_file.is_valid(norm_id) else 0
            insert_tup = (norm_id, is_valid)
            sql_file.storage_manager.cur.execute("INSERT OR REPLACE INTO info VALUES (?,?)", insert_tup)
            sql_file.storage_manager.con.commit()
        sql_file.storage_manager.con.close()

        sql_no_api = CrossrefManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=False)
        all_db_keys = sql_no_api.storage_manager.get_all_keys()
        # check that all the normalised ids in the list were correctly inserted in the db
        self.assertTrue(all(sql_no_api.normalise(x, include_prefix=True) in all_db_keys for x in to_insert))
        self.assertTrue(sql_no_api.is_valid(self.valid_crmid1))  # is stored in support file as valid
        self.assertFalse(sql_no_api.is_valid(self.invalid_crmid1))  # is stored in support file as invalid
        self.assertTrue(sql_no_api.is_valid(sql_no_api.normalise(self.invalid_crmid2, include_prefix=True)))  # is not stored in support file as invalid, does not exist but has correct syntax
        sql_no_api.storage_manager.delete_storage()

    def test_crossref_sqlite_nofile_noapi(self):
        """Fresh SQLite storage, API disabled: only the syntax is checked."""
        # Does not use support file
        # Uses SqliteStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = CrossrefManager(storage_manager=SqliteStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_crmid2))
        am_nofile_noapi.storage_manager.delete_storage()

0 comments on commit accbd54

Please sign in to comment.