-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from eliarizzetto/main
Identifier manager: add support for OpenAlex IDs
- Loading branch information
Showing
4 changed files
with
360 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
#!python | ||
# Copyright 2019, Silvio Peroni <essepuntato@gmail.com> | ||
# Copyright 2022, Giuseppe Grieco <giuseppe.grieco3@unibo.it>, Arianna Moretti <arianna.moretti4@unibo.it>, Elia Rizzetto <elia.rizzetto@studio.unibo.it>, Arcangelo Massari <arcangelo.massari@unibo.it> | ||
# | ||
# Permission to use, copy, modify, and/or distribute this software for any purpose | ||
# with or without fee is hereby granted, provided that the above copyright notice | ||
# and this permission notice appear in all copies. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | ||
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND | ||
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, | ||
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, | ||
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS | ||
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS | ||
# SOFTWARE. | ||
|
||
from oc_ds_converter.oc_idmanager.base import IdentifierManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager | ||
from re import sub, match | ||
from requests import ReadTimeout, get | ||
from requests.exceptions import ConnectionError | ||
from json import loads | ||
from time import sleep | ||
from typing import Optional | ||
|
||
|
||
class OpenAlexManager(IdentifierManager):
    """Identifier manager for OpenAlex IDs.

    Only work IDs (``W`` followed by digits) and source IDs (``S`` followed
    by digits) are accepted by :meth:`syntax_ok`. Validation results are
    read from and written to ``self.storage_manager`` so that an ID is
    checked at most once per storage.
    """

    def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None):
        """OpenAlex manager constructor.

        Args:
            use_api_service: when true, :meth:`exists` queries the OpenAlex
                API; when false, :meth:`exists` reports every ID as existing.
            storage_manager: where validation results are kept. An
                ``InMemoryStorageManager`` is created when omitted.
        """
        super(OpenAlexManager, self).__init__()
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._api = "https://api.openalex.org/"
        self._api_works_route = "https://api.openalex.org/works/"
        self._api_sources_route = "https://api.openalex.org/sources/"
        self._use_api_service = use_api_service
        self._p = "openalex:"
        self._url_id_pref = "https://openalex.org/"
        self._headers = {
            "User-Agent": "Identifier Manager / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }

    def is_valid(self, oal_id, get_extra_info=False):
        """Tell whether ``oal_id`` is a valid OpenAlex ID.

        The ID is normalised, then looked up in the storage manager; only
        when no boolean is stored for it are the syntax check and
        :meth:`exists` performed, and the outcome is stored.

        Args:
            oal_id: the identifier, with or without the ``openalex:`` prefix
                or one of the OpenAlex URL prefixes.
            get_extra_info: when true and the ID is not already stored,
                return a ``(valid, info)`` tuple instead of a bare boolean.

        Returns:
            ``False`` when the ID cannot be normalised; the stored boolean
            when one exists (a bare boolean even if ``get_extra_info`` is
            true); otherwise the validity, optionally paired with the info
            dictionary.
        """
        oal_id = self.normalise(oal_id, include_prefix=True)
        if oal_id is None:
            return False

        stored_value = self.storage_manager.get_value(oal_id)
        if isinstance(stored_value, bool):
            return stored_value

        if not self.syntax_ok(oal_id):
            # A malformed ID can never be valid: skip the API round trip and
            # make sure the stored info does not claim otherwise.
            if get_extra_info:
                info = {'id': oal_id, 'valid': False}
                self.storage_manager.set_full_value(oal_id, info)
                return False, info
            self.storage_manager.set_value(oal_id, False)
            return False

        if get_extra_info:
            found, info = self.exists(oal_id, get_extra_info=True)
            self.storage_manager.set_full_value(oal_id, info)
            return found, info

        validity_check = self.exists(oal_id)
        self.storage_manager.set_value(oal_id, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Return the canonical form of an OpenAlex ID.

        The ``openalex:`` prefix, all whitespace and NUL characters, and a
        leading OpenAlex API or landing-page URL are removed; the remainder
        is upper-cased.

        Args:
            id_string: the raw identifier.
            include_prefix: when true, prepend ``openalex:`` to the result.

        Returns:
            The normalised string, or ``None`` when ``id_string`` is not a
            string (e.g. ``None`` or ``bytes``).
        """
        try:
            if id_string.startswith(self._p):
                oal_string = id_string[len(self._p):]
            else:
                oal_string = id_string

            oal_string = sub(r"\0+", "", sub(r"\s+", "", oal_string))

            # The more specific routes must be stripped before the bare API
            # root, otherwise "works/" or "sources/" would be left behind.
            for url_prefix in (
                self._api_works_route,
                self._api_sources_route,
                self._api,
                self._url_id_pref,
            ):
                oal_string = oal_string.removeprefix(url_prefix)

            prefix = self._p if include_prefix else ""
            return prefix + oal_string.upper()
        except (AttributeError, TypeError):
            # Non-string input cannot be an OpenAlex ID.
            return None

    def syntax_ok(self, id_string):
        """Check that ``id_string`` has the shape of a work or source ID.

        Args:
            id_string: the identifier, with or without the ``openalex:``
                prefix.

        Returns:
            ``True`` when the prefixed ID is ``openalex:`` followed by ``W``
            or ``S`` and an ASCII decimal number without leading zeros,
            ``False`` otherwise.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        # "\Z" (not "$") so that a trailing newline is rejected, and "[0-9]"
        # (not "\d") so that only ASCII digits are accepted.
        return bool(match(r"^openalex:[WS][1-9][0-9]*\Z", id_string))

    def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None):
        """Check whether an OpenAlex ID exists.

        With the API service disabled every ID is reported as existing.
        Otherwise the OpenAlex API is queried up to three times; the ID
        exists when a 200 response carries an ``id`` field equal to the
        expected ``https://openalex.org/<ID>`` URL.

        Args:
            openalex_id_full: the identifier, with or without the
                ``openalex:`` prefix.
            get_extra_info: when true, return ``(exists, info)`` where
                ``info`` is ``{'id': ..., 'valid': exists}``.
            allow_extra_api: accepted but not used by this manager.

        Returns:
            A boolean, or a ``(boolean, dict)`` tuple when ``get_extra_info``
            is true.
        """
        if not openalex_id_full.startswith(self._p):
            openalex_id_full = self._p + openalex_id_full

        if not self._use_api_service:
            return self._exists_result(True, openalex_id_full, get_extra_info)

        oal_id = self.normalise(openalex_id_full)  # unprefixed ID, or None
        if not oal_id:
            return self._exists_result(False, None, get_extra_info)

        pref_oalid = self._p + oal_id
        expected_url = self._url_id_pref + oal_id
        tentative = 3
        while tentative:
            tentative -= 1
            try:
                r = get(self._api + oal_id, headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Back off before retrying; pointless after the last attempt
                if tentative:
                    sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                try:
                    found = loads(r.text)['id'] == expected_url
                except (ValueError, KeyError, TypeError):
                    # Body is not JSON, is not an object, or has no "id"
                    found = False
                return self._exists_result(found, pref_oalid, get_extra_info)
            if r.status_code == 429:
                # only handles per-second rate limits (not per-day rate limits)
                if tentative:
                    sleep(1)
            elif 400 <= r.status_code < 500:
                return self._exists_result(False, pref_oalid, get_extra_info)

        # Every attempt failed (timeouts, connection errors, 429 or 5xx)
        return self._exists_result(False, openalex_id_full, get_extra_info)

    @staticmethod
    def _exists_result(valid, identifier, get_extra_info):
        """Shape the return value of :meth:`exists` for the requested mode."""
        if get_extra_info:
            return valid, {'id': identifier, 'valid': valid}
        return valid

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Return the extra information extracted from an API response.

        Metadata extraction is still to be implemented: all arguments are
        ignored and the result only carries ``{"valid": True}``.
        """
        return {"valid": True}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
import json | ||
import sqlite3 | ||
import os.path | ||
import unittest | ||
from os import makedirs | ||
from os.path import exists, join | ||
|
||
import xmltodict | ||
from oc_ds_converter.oc_idmanager import * | ||
from oc_ds_converter.oc_idmanager.base import IdentifierManager | ||
from requests import ReadTimeout, get | ||
from requests.exceptions import ConnectionError | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager | ||
from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager | ||
|
||
class OpenAlexIdentifierManagerTest(unittest.TestCase):
    """Tests for the OpenAlex identifier manager.

    The tests that build a manager with the API service enabled perform
    real HTTP requests to OpenAlex. The fixture ``test/data/glob.json`` is
    loaded relative to the current working directory.
    """

    def setUp(self):
        if not exists("tmp"):
            makedirs("tmp")

        self.test_dir = join("test", "data")
        self.test_json_path = join(self.test_dir, "glob.json")
        with open(self.test_json_path, encoding="utf-8") as fp:
            self.data = json.load(fp)

        self.valid_wid = "W2013228336"
        self.valid_sid = "S4210229581"
        self.invalid_wid = "W7836728310"
        self.invalid_sid = "S4263287381"

    def test_openalex_is_valid(self):
        # No support file, API enabled
        oalm_nofile = OpenAlexManager()
        self.assertTrue(oalm_nofile.is_valid(self.valid_wid))
        self.assertTrue(oalm_nofile.is_valid(self.valid_sid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_wid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_sid))

        # Support file, API disabled: stored values decide
        oalm_file = OpenAlexManager(
            use_api_service=False,
            storage_manager=InMemoryStorageManager(self.test_json_path),
        )
        self.assertIn(oalm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(oalm_file.normalise(self.invalid_wid, include_prefix=True), self.data)
        self.assertTrue(oalm_file.is_valid(self.valid_wid))
        self.assertFalse(oalm_file.is_valid(self.invalid_wid))

        oalm_file_noapi = OpenAlexManager(
            storage_manager=InMemoryStorageManager(self.test_json_path),
            use_api_service=False,
        )
        self.assertTrue(oalm_file_noapi.is_valid(self.valid_wid))
        self.assertTrue(oalm_file_noapi.is_valid(self.valid_sid))

    def test_exists(self):
        with self.subTest(msg="get_extra_info=True, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('openalex:W748315831', get_extra_info=True, allow_extra_api=None)
            self.assertTrue(output[0])
            self.assertTrue(output[1]['valid'])

        with self.subTest(msg="get_extra_info=False, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('S4210229581', get_extra_info=False, allow_extra_api=None)
            self.assertEqual(output, True)

    def test_openalex_normalise(self):
        oalm = OpenAlexManager()

        self.assertEqual(
            self.valid_wid, oalm.normalise("openalex:" + self.valid_wid)
        )
        # replace("", " ") puts a space around every character
        self.assertEqual(
            self.valid_wid, oalm.normalise(self.valid_wid.replace("", " "))
        )
        self.assertEqual(
            self.valid_wid,
            oalm.normalise("https://openalex.org/" + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_wid),
            oalm.normalise(' ' + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_sid),
            oalm.normalise("https://api.openalex.org/sources/" + self.valid_sid),
        )

        dm_file = OpenAlexManager(
            storage_manager=InMemoryStorageManager(self.test_json_path),
            use_api_service=False,
        )
        self.assertIn(dm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(dm_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertTrue(dm_file.is_valid(self.valid_wid))
        self.assertFalse(dm_file.is_valid(self.invalid_sid))

    def test_openalex_default(self):
        # No support files (it generates it)
        # Default storage manager: in memory + generates file on method call (not automatically)
        # Uses API
        mngr = OpenAlexManager()
        self.assertTrue(mngr.is_valid(self.valid_wid))
        self.assertTrue(mngr.is_valid(self.valid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_wid))
        mngr.storage_manager.store_file()

        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        normalised_ids = (mngr.normalise(x, include_prefix=True) for x in validated_ids)
        validated = [x for x in normalised_ids if x]

        # check that the support file was correctly created
        self.assertTrue(os.path.exists("storage/id_value.json"))
        with open("storage/id_value.json", encoding="utf-8") as lj:
            load_dict = json.load(lj)
        normalised_keys = (mngr.normalise(x, include_prefix=True) for x in load_dict)
        stored = [x for x in normalised_keys if x]

        # check that all the validated ids are stored in the json file
        for validated_id in validated:
            self.assertIn(validated_id, stored)
        mngr.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_value.json"))

    def test_openalex_memory_file_noapi(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id is considered to be valid)
        am_file = OpenAlexManager(
            storage_manager=InMemoryStorageManager(self.test_json_path),
            use_api_service=False,
        )
        self.assertIn(am_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(am_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertFalse(am_file.is_valid(self.invalid_sid))  # is stored in support file as invalid

    def test_openalex_memory_file_api(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # Uses API (so a syntactically correct id which is not valid is considered to be invalid)
        am_file = OpenAlexManager(
            storage_manager=InMemoryStorageManager(self.test_json_path),
            use_api_service=True,
        )
        self.assertFalse(am_file.is_valid(self.invalid_wid))

    def test_openalex_memory_nofile_noapi(self):
        # Does not use support file
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=InMemoryStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_wid))
        am_nofile_noapi.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_api(self):
        # No support files (it generates it)
        # Storage manager: SqliteStorageManager
        # Uses API
        sql_am_nofile = OpenAlexManager(storage_manager=SqliteStorageManager())
        self.assertTrue(sql_am_nofile.is_valid(self.valid_wid))
        self.assertTrue(sql_am_nofile.is_valid(self.valid_sid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_wid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_sid))

        # check that the support db was correctly created and that it contains all the validated ids
        self.assertTrue(os.path.exists("storage/id_valid_dict.db"))
        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        stored = list(sql_am_nofile.storage_manager.get_all_keys())
        normalised_ids = (sql_am_nofile.normalise(x, include_prefix=True) for x in validated_ids)
        validated = [x for x in normalised_ids if x]
        for validated_id in validated:
            self.assertIn(validated_id, stored)

        sql_am_nofile.storage_manager.delete_storage()
        # check that the support db was correctly deleted
        self.assertFalse(os.path.exists("storage/id_valid_dict.db"))

    def test_openalex_sqlite_file_api(self):
        # Phase 1: build a fresh SQLite support db, validating the ids through the API
        # Phase 2: reopen that db with the API disabled, so stored values decide and
        #          an id that is not stored only needs a correct syntax to be valid
        test_sqlite_db = os.path.join(self.test_dir, "database.db")
        if os.path.exists(test_sqlite_db):
            os.remove(test_sqlite_db)

        to_insert = [self.invalid_wid, self.valid_wid]
        sql_file = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=True)
        for raw_id in to_insert:
            norm_id = sql_file.normalise(raw_id, include_prefix=True)
            is_valid = 1 if sql_file.is_valid(norm_id) else 0
            sql_file.storage_manager.cur.execute(
                "INSERT OR REPLACE INTO info VALUES (?,?)", (norm_id, is_valid)
            )
            sql_file.storage_manager.con.commit()
        sql_file.storage_manager.con.close()

        sql_no_api = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=False)
        all_db_keys = sql_no_api.storage_manager.get_all_keys()
        # check that all the normalised ids in the list were correctly inserted in the db
        for raw_id in to_insert:
            self.assertIn(sql_no_api.normalise(raw_id, include_prefix=True), all_db_keys)
        self.assertTrue(sql_no_api.is_valid(self.valid_wid))  # is stored in support db as valid
        self.assertFalse(sql_no_api.is_valid(self.invalid_wid))  # is stored in support db as invalid
        # is not stored in support db, does not exist but has correct syntax
        self.assertTrue(sql_no_api.is_valid(sql_no_api.normalise(self.invalid_sid, include_prefix=True)))
        sql_no_api.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_noapi(self):
        # Does not use support file
        # Uses SqliteStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=SqliteStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_sid))
        am_nofile_noapi.storage_manager.delete_storage()