-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from eliarizzetto/main
Add support for Crossref member IDs to oc_idmanager
- Loading branch information
Showing
4 changed files
with
318 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!python | ||
# Copyright 2019, Silvio Peroni <essepuntato@gmail.com> | ||
# Copyright 2022, Giuseppe Grieco <giuseppe.grieco3@unibo.it>, Arianna Moretti <arianna.moretti4@unibo.it>, Elia Rizzetto <elia.rizzetto@studio.unibo.it>, Arcangelo Massari <arcangelo.massari@unibo.it> | ||
# | ||
# Permission to use, copy, modify, and/or distribute this software for any purpose | ||
# with or without fee is hereby granted, provided that the above copyright notice | ||
# and this permission notice appear in all copies. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | ||
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND | ||
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, | ||
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, | ||
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS | ||
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS | ||
# SOFTWARE. | ||
|
||
from oc_ds_converter.oc_idmanager.base import IdentifierManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager | ||
from oc_ds_converter.oc_idmanager.support import call_api | ||
from re import sub, match | ||
from typing import Optional | ||
|
||
|
||
class CrossrefManager(IdentifierManager):
    """Identifier manager for Crossref member IDs.

    A normalised identifier is a string of digits, optionally preceded by the
    ``crossref:`` prefix. Validation results are cached in ``storage_manager``.
    """

    def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None):
        """Crossref member ID manager constructor.

        Args:
            use_api_service: when true, ``exists`` queries the Crossref
                members API; when false, ``exists`` reports every ID as
                existing without any lookup.
            storage_manager: cache for validation results. A new
                ``InMemoryStorageManager`` is created when omitted.
        """
        super().__init__()
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._api = "https://api.crossref.org/members/"
        # NOTE(review): the two routes and the URL prefix below point at
        # OpenAlex and are not read anywhere in this class; they look like
        # leftovers from the OpenAlex manager. Kept so that the instance
        # attributes do not change -- confirm and remove.
        self._api_works_route = r"https://api.openalex.org/works/"
        self._api_sources_route = r"https://api.openalex.org/sources/"
        self._use_api_service = use_api_service
        self._p = "crossref:"
        self._url_id_pref = "https://openalex.org/"

    def is_valid(self, cr_member_id, get_extra_info=False):
        """Tell whether a Crossref member ID is valid, using the cache first.

        Args:
            cr_member_id: the identifier, with or without the ``crossref:``
                prefix.
            get_extra_info: when true and the ID is not cached yet, return a
                ``(validity, info_dict)`` tuple instead of a plain boolean.

        Returns:
            ``False`` when the ID cannot be normalised; the cached boolean
            when one is stored; otherwise the freshly computed validity (or
            the tuple described above).

        NOTE(review): a cached hit returns a bare boolean even when
        ``get_extra_info`` is true -- callers must handle both shapes.
        """
        cr_member_id = self.normalise(cr_member_id, include_prefix=True)
        if cr_member_id is None:
            return False

        cached_value = self.storage_manager.get_value(cr_member_id)
        if isinstance(cached_value, bool):
            return cached_value

        if get_extra_info:
            info = self.exists(cr_member_id, get_extra_info=True)
            self.storage_manager.set_full_value(cr_member_id, info[1])
            return (info[0] and self.syntax_ok(cr_member_id)), info[1]

        # Run the cheap local syntax check first so that a malformed ID never
        # triggers the existence lookup; the resulting boolean is unchanged.
        validity_check = self.syntax_ok(cr_member_id) and self.exists(cr_member_id)
        self.storage_manager.set_value(cr_member_id, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False) -> Optional[str]:
        """Reduce an identifier to its digits, optionally re-adding the prefix.

        A leading ``crossref:`` prefix is dropped and every non-digit
        character is removed from what remains.

        Args:
            id_string: the raw identifier.
            include_prefix: when true, prepend ``crossref:`` to the result.

        Returns:
            The normalised ID, or ``None`` when ``id_string`` is not a string
            or contains no digit at all.
        """
        try:
            if id_string.startswith(self._p):
                digits = id_string[len(self._p):]
            else:
                digits = id_string
            digits = sub(r"\D", "", digits)
        except (AttributeError, TypeError):
            # Non-string input (None, numbers, bytes, ...) cannot be a
            # Crossref member ID.
            return None

        if not digits:
            # Nothing usable is left: report failure instead of handing back
            # an empty string or a bare "crossref:" prefix.
            return None

        prefix = self._p if include_prefix else ""
        return f"{prefix}{digits}"

    def syntax_ok(self, id_string) -> bool:
        """Check that ``id_string`` has the form ``crossref:<digits>``.

        The prefix is added before matching when it is missing.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^crossref:\d+$", id_string))

    def exists(self, cr_member_id_full, get_extra_info=False, allow_extra_api=None):
        """Check whether a Crossref member ID exists.

        Args:
            cr_member_id_full: the identifier, with or without prefix.
            get_extra_info: when true, return ``(bool, dict)`` where the dict
                has the keys ``id`` and ``valid``.
            allow_extra_api: accepted but not used by this implementation.

        Returns:
            With the API service disabled, always ``True`` (no lookup is
            made). Otherwise ``True`` only when the API response carries a
            ``message.id`` equal to the normalised ID; ``False`` when the ID
            cannot be normalised, the response is empty, or the response
            lacks that field.
        """
        valid_bool = True
        if not cr_member_id_full.startswith(self._p):
            cr_member_id_full = self._p + cr_member_id_full

        if self._use_api_service:
            # normalise() is called without prefix: unprefixed digits or None.
            cr_member_id = self.normalise(cr_member_id_full)
            pref_cr_member_id = self._p + cr_member_id if cr_member_id else None
            if pref_cr_member_id is None:
                return (False, {'id': None, 'valid': False}) if get_extra_info else False

            # NOTE(review): self._headers is not set in this class; presumably
            # it comes from IdentifierManager.__init__ -- confirm.
            json_res = call_api(url=self._api + cr_member_id, headers=self._headers)
            if json_res:
                # Tolerate payloads without 'message'/'id' instead of raising:
                # a missing field simply means the ID was not confirmed.
                message = json_res.get('message') if isinstance(json_res, dict) else None
                returned_id = message.get('id') if isinstance(message, dict) else None
                valid_bool = str(returned_id) == cr_member_id
                if get_extra_info:
                    extra_info_result = {'id': pref_cr_member_id, 'valid': valid_bool}
                    return valid_bool, extra_info_result
                return valid_bool
            valid_bool = False

        if get_extra_info:
            return valid_bool, {'id': cr_member_id_full, 'valid': valid_bool}
        return valid_bool

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dictionary for an API response.

        Not implemented yet: every argument is currently ignored and the
        result is always ``{"valid": True}``.
        """
        result = {}
        result["valid"] = True
        # to be implemented
        return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
import json | ||
import sqlite3 | ||
import os.path | ||
import unittest | ||
from os import makedirs | ||
from os.path import exists, join | ||
|
||
import xmltodict | ||
from oc_ds_converter.oc_idmanager import * | ||
from oc_ds_converter.oc_idmanager.base import IdentifierManager | ||
from requests import ReadTimeout, get | ||
from requests.exceptions import ConnectionError | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager | ||
|
||
class CrossrefIdentifierManagerTest(unittest.TestCase):
    """Tests for the Crossref member identifier manager."""

    def setUp(self):
        """Load the JSON fixture and define the member IDs used by the tests."""
        if not exists("tmp"):
            makedirs("tmp")

        self.test_dir = join("test", "data")
        self.test_json_path = join(self.test_dir, "glob.json")
        with open(self.test_json_path, encoding="utf-8") as fp:
            self.data = json.load(fp)

        self.valid_crmid1 = "297"
        self.valid_crmid2 = "4443"
        self.invalid_crmid1 = "342427"
        self.invalid_crmid2 = "0123"

    def test_crossref_is_valid(self):
        # Default manager: in-memory storage, API enabled.
        crmngr_nofile = CrossrefManager()
        self.assertTrue(crmngr_nofile.is_valid(self.valid_crmid1))
        self.assertTrue(crmngr_nofile.is_valid(self.valid_crmid2))
        self.assertFalse(crmngr_nofile.is_valid(self.invalid_crmid1))
        self.assertFalse(crmngr_nofile.is_valid(self.invalid_crmid2))

        # Storage loaded from the JSON fixture, API disabled.
        crmngr_file = CrossrefManager(use_api_service=False, storage_manager=InMemoryStorageManager(self.test_json_path))
        self.assertIn(crmngr_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(crmngr_file.normalise(self.invalid_crmid1, include_prefix=True), self.data)
        self.assertTrue(crmngr_file.is_valid(self.valid_crmid1))
        self.assertFalse(crmngr_file.is_valid(self.invalid_crmid1))

        crmngr_nofile_noapi = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertTrue(crmngr_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(crmngr_nofile_noapi.is_valid(self.valid_crmid2))

    def test_exists(self):
        with self.subTest(msg="get_extra_info=True, allow_extra_api=None"):
            crmngr = CrossrefManager()
            output = crmngr.exists(self.valid_crmid1, get_extra_info=True, allow_extra_api=None)
            expected_output = (True, {'valid': True})
            # Only the boolean part of the result is compared.
            self.assertEqual(expected_output[0], output[0])

        with self.subTest(msg="get_extra_info=False, allow_extra_api=None"):
            crmngr = CrossrefManager()
            output = crmngr.exists(self.valid_crmid2, get_extra_info=False, allow_extra_api=None)
            expected_output = True
            self.assertEqual(output, expected_output)

    # NOTE(review): the method name mentions OpenAlex but the test exercises
    # CrossrefManager.normalise; the name is kept so that test IDs do not
    # change -- consider renaming to test_crossref_normalise.
    def test_openalex_normalise(self):
        crmngr = CrossrefManager()

        self.assertEqual(
            self.valid_crmid1, crmngr.normalise("crossref:" + self.valid_crmid1)
        )
        # str.replace("", " ") puts a space around every character.
        self.assertEqual(
            self.valid_crmid1, crmngr.normalise(self.valid_crmid1.replace("", " "))
        )
        self.assertEqual(
            self.valid_crmid1,
            crmngr.normalise("https://api.crossref.org/members/" + self.valid_crmid1),
        )
        self.assertEqual(
            crmngr.normalise(self.valid_crmid1),
            crmngr.normalise(' ' + self.valid_crmid1),
        )
        self.assertEqual(
            crmngr.normalise(self.valid_crmid2),
            crmngr.normalise("https://api.crossref.org/members/" + self.valid_crmid2),
        )

        dm_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(dm_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(dm_file.normalise(self.invalid_crmid2, include_prefix=True), self.data)
        self.assertTrue(dm_file.is_valid(self.valid_crmid1))
        self.assertFalse(dm_file.is_valid(self.invalid_crmid2))

    def test_crossref_default(self):
        mngr = CrossrefManager()
        # No support files (it generates it)
        # Default storage manager: in memory + generates file on method call (not automatically)
        # uses API
        self.assertTrue(mngr.is_valid(self.valid_crmid1))
        self.assertTrue(mngr.is_valid(self.valid_crmid2))
        self.assertFalse(mngr.is_valid(self.invalid_crmid2))
        self.assertFalse(mngr.is_valid(self.invalid_crmid1))
        mngr.storage_manager.store_file()
        validated_ids = [self.valid_crmid1, self.valid_crmid2, self.invalid_crmid1, self.invalid_crmid2]
        validated = [mngr.normalise(x, include_prefix=True) for x in validated_ids if mngr.normalise(x, include_prefix=True)]
        # check that the support file was correctly created
        self.assertTrue(os.path.exists("storage/id_value.json"))
        # The context manager closes the file even if json.load raises.
        with open("storage/id_value.json", encoding="utf-8") as lj:
            load_dict = json.load(lj)
        stored = [mngr.normalise(x, include_prefix=True) for x in load_dict if mngr.normalise(x, include_prefix=True)]

        # check that all the validated ids are stored in the json file
        self.assertTrue(all(x in stored for x in validated))
        mngr.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_value.json"))

    def test_crossref_memory_file_noapi(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # does not use API (so a syntactically correct id is considered to be valid)
        am_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(am_file.normalise(self.valid_crmid1, include_prefix=True), self.data)
        self.assertIn(am_file.normalise(self.invalid_crmid2, include_prefix=True), self.data)
        self.assertFalse(am_file.is_valid(self.invalid_crmid2))  # is stored in support file as invalid

    def test_crossref_memory_file_api(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # uses API (so a syntactically correct id which is not valid is considered to be invalid)
        am_file = CrossrefManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=True)
        self.assertFalse(am_file.is_valid(self.invalid_crmid1))

    def test_crossref_memory_nofile_noapi(self):
        # Does not use support file
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = CrossrefManager(storage_manager=InMemoryStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_crmid1))
        am_nofile_noapi.storage_manager.delete_storage()

    def test_crossref_sqlite_nofile_api(self):
        # No support files (it generates it)
        # storage manager : SqliteStorageManager
        # uses API
        sql_am_nofile = CrossrefManager(storage_manager=SqliteStorageManager())
        self.assertTrue(sql_am_nofile.is_valid(self.valid_crmid1))
        self.assertTrue(sql_am_nofile.is_valid(self.valid_crmid2))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_crmid1))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_crmid2))
        # check that the support db was correctly created and that it contains all the validated ids
        self.assertTrue(os.path.exists("storage/id_valid_dict.db"))
        validated_ids = [self.valid_crmid1, self.valid_crmid2, self.invalid_crmid1, self.invalid_crmid2]
        all_ids_stored = sql_am_nofile.storage_manager.get_all_keys()
        # check that all the validated ids are stored in the db
        stored = list(all_ids_stored)
        validated = [sql_am_nofile.normalise(x, include_prefix=True) for x in validated_ids if sql_am_nofile.normalise(x, include_prefix=True)]
        self.assertTrue(all(x in stored for x in validated))
        sql_am_nofile.storage_manager.delete_storage()
        # check that the support db was correctly deleted
        self.assertFalse(os.path.exists("storage/id_valid_dict.db"))

    def test_crossref_sqlite_file_api(self):
        # Uses SqliteStorageManager storage manager
        # Step 1: build the support db with the API enabled
        # Step 2: reopen the same db with the API disabled (so an id that is
        #         not stored but is syntactically correct is considered valid)
        test_sqlite_db = os.path.join(self.test_dir, "database.db")
        if os.path.exists(test_sqlite_db):
            os.remove(test_sqlite_db)
        to_insert = [self.invalid_crmid1, self.valid_crmid1]
        sql_file = CrossrefManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=True)
        for member_id in to_insert:
            norm_id = sql_file.normalise(member_id, include_prefix=True)
            validity_flag = 1 if sql_file.is_valid(norm_id) else 0
            insert_tup = (norm_id, validity_flag)
            sql_file.storage_manager.cur.execute("INSERT OR REPLACE INTO info VALUES (?,?)", insert_tup)
            sql_file.storage_manager.con.commit()
        sql_file.storage_manager.con.close()

        sql_no_api = CrossrefManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=False)
        all_db_keys = sql_no_api.storage_manager.get_all_keys()
        # check that all the normalised ids in the list were correctly inserted in the db
        self.assertTrue(all(sql_no_api.normalise(x, include_prefix=True) in all_db_keys for x in to_insert))
        self.assertTrue(sql_no_api.is_valid(self.valid_crmid1))  # is stored in support db as valid
        self.assertFalse(sql_no_api.is_valid(self.invalid_crmid1))  # is stored in support db as invalid
        # is not stored in support db, does not exist but has correct syntax
        self.assertTrue(sql_no_api.is_valid(sql_no_api.normalise(self.invalid_crmid2, include_prefix=True)))
        sql_no_api.storage_manager.delete_storage()

    def test_crossref_sqlite_nofile_noapi(self):
        # Does not use support file
        # Uses SqliteStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = CrossrefManager(storage_manager=SqliteStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_crmid1))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_crmid2))
        am_nofile_noapi.storage_manager.delete_storage()