diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index eaa055a..8a5a573 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -14,7 +14,6 @@ # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS # SOFTWARE. - name: Run tests on: push: @@ -27,7 +26,17 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + services: + redis: + image: redis + ports: + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 steps: - uses: actions/checkout@v3.0.2 - name: Set up Python ${{ matrix.python-version }} @@ -39,7 +48,7 @@ jobs: pip3 install poetry sudo apt install -y python-is-python3 poetry install - - name: Check the overage + - name: Check the coverage run: | poetry run coverage run --rcfile=test/coverage/.coveragerc - name: Generate the badge diff --git a/oc_ds_converter/datasource/redis.py b/oc_ds_converter/datasource/redis.py index 6e53313..8dbcc53 100644 --- a/oc_ds_converter/datasource/redis.py +++ b/oc_ds_converter/datasource/redis.py @@ -78,8 +78,11 @@ def mget(self, resources_id): # for i, v in enumerate(self._r.mget(resources_id)) # } - def flushall(self): - self._r.flushall() + def flushdb(self): + batch_size = 1000 + keys = self._r.keys('*') + for i in range(0, len(keys), batch_size): + self._r.delete(*keys[i:i+batch_size]) def delete(self, resource_id): self._r.delete(resource_id) diff --git a/oc_ds_converter/lib/master_of_regex.py b/oc_ds_converter/lib/master_of_regex.py index 69df355..382fe7e 100644 --- a/oc_ds_converter/lib/master_of_regex.py +++ b/oc_ds_converter/lib/master_of_regex.py @@ -1,116 +1,98 @@ -''' -Split by ';' outside '[]' (any spaces before and after ';'). -''' -semicolon_in_people_field = '\s*;\s*(?=[^\]]*(?:\[|$))' +# Split by ';' outside '[]' (any spaces before and after ';'). +semicolon_in_people_field = r'\s*;\s*(?=[^\]]*(?:\[|$))' -''' -It gets string inside '[]' ignoring any space between (ex: [ TARGET ] --> TARGET). -An id schema must be present, followed by a colon. -Before the colon, there must be any character that is not a square bracket -to prevent that in strings like 'Boezaart, Andr[eacute] [omid:123]' the id captured is '[eacute] [omid:123]'. -Alternatively, empty square brackets containing one or more spaces also represent a valid match. -''' -ids_inside_square_brackets = '\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]' +# It gets string inside '[]' ignoring any space between (ex: [ TARGET ] --> TARGET). +# An id schema must be present, followed by a colon. +# Before the colon, there must be any character that is not a square bracket +# to prevent that in strings like 'Boezaart, Andr[eacute] [omid:123]' the id captured is '[eacute] [omid:123]'. +# Alternatively, empty square brackets containing one or more spaces also represent a valid match. +ids_inside_square_brackets = r'\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]' -''' -It gets the name and ids in two capturing groups. -As for ids, it gets the string inside '[]' ignoring any space between (ex: [ TARGET ] --> TARGET). -An id schema must be present, followed by a colon. -''' -name_and_ids = f'\s*(.*?)\s*{ids_inside_square_brackets}' +# It gets the name and ids in two capturing groups. +# As for ids, it gets the string inside '[]' ignoring any space between (ex: [ TARGET ] --> TARGET). +# An id schema must be present, followed by a colon. +name_and_ids = fr'\s*(.*?)\s*{ids_inside_square_brackets}' -''' -It captures a colon preceded and followed by zero or more spaces. -''' -colon_and_spaces = '\s*:\s*' +# It captures a colon preceded and followed by zero or more spaces. +colon_and_spaces = r'\s*:\s*' -''' -It captures a colon preceded and followed by zero or more spaces. -''' -comma_and_spaces = '\s*,\s*' +# It captures a comma preceded and followed by zero or more spaces. +comma_and_spaces = r'\s*,\s*' -''' -It captures one or more spaces. -''' -one_or_more_spaces = '\s+' +# It captures one or more spaces. +one_or_more_spaces = r'\s+' -''' -It captures any pages range separator. -''' -pages_separator = '[^A-Za-z\d]+(?=[A-Za-z\d]+)' +# It captures any pages range separator. +pages_separator = r'[^A-Za-z\d]+(?=[A-Za-z\d]+)' -''' -It captures an ORCID -''' -orcid_pattern = '([0-9]{4}-){3}[0-9]{3}[0-9X]' +# It captures an ORCID +orcid_pattern = r'([0-9]{4}-){3}[0-9]{3}[0-9X]' -''' -A series of patterns useful to clean invalid "volume" and "issue" fields -''' -good_sep = '\-' -bad_sep = '&\/_,\.:+;\(\[\|' +# A series of patterns useful to clean invalid "volume" and "issue" fields +good_sep = r'\-' +bad_sep = r'&\/_,\.:+;\(\[\|' separators = good_sep + bad_sep -alphabets = 'a-zà-öø-ÿ\u0430-я\u0391-ω' # basic latin, latin-1 supplement, cyrillic, greek -vi_pattern = f'((?:[{alphabets}]*\d[{alphabets}\d]*|[ivxlcdm]+)(?:\s?(?:[{separators}]|and|\()\s?[{alphabets}\d]+\)?)*?)' -numero = '(?:№|no?(?!v)\.?|n[º°]\.?|n[uú]m(?:[eé]ro)?|number)' -year_pattern = '(\d{4})' +alphabets = r'a-zà-öø-ÿ\u0430-я\u0391-ω' # basic latin, latin-1 supplement, cyrillic, greek +vi_pattern = fr'((?:[{alphabets}]*\d[{alphabets}\d]*|[ivxlcdm]+)(?:\s?(?:[{separators}]|and|\()\s?[{alphabets}\d]+\)?)*?)' +numero = r'(?:№|no?(?!v)\.?|n[º°]\.?|n[uú]m(?:[eé]ro)?|number)' +year_pattern = r'(\d{4})' valid_vi_patterns = [ vi_pattern, - f'[‹<\()]?[{alphabets}]+?([{separators}\s]?[{alphabets}])*[\)›>]?', - f'[{alphabets}{separators}\s]+{vi_pattern}', - f"[{alphabets}\d\-'/]*\d[{alphabets}\d\-'/]*(,?\s[{alphabets}\d\-'/]+)+", - f'\(?s(uppl([eéi]ment(ary|um)?))?\)?\s?(part)?\s?(s?{vi_pattern})?', - f'({vi_pattern}_)?({vi_pattern}\s)?[\(_]?supp?(plement|pl)?l?[\s\._]*({vi_pattern}|[{alphabets}])?\)?\.?', - f'{vi_pattern}*,?\s?part[\s_]{vi_pattern}(\sof\s{vi_pattern})?(,\sno?\.\s?{vi_pattern})?', - f'{vi_pattern}*[_\s]?pt?[_\s\.]{vi_pattern}', + fr'[‹<\()]?[{alphabets}]+?([{separators}\s]?[{alphabets}])*[\)›>]?', + fr'[{alphabets}{separators}\s]+{vi_pattern}', + fr"[{alphabets}\d\-'/]*\d[{alphabets}\d\-'/]*(,?\s[{alphabets}\d\-'/]+)+", + fr'\(?s(uppl([eéi]ment(ary|um)?))?\)?\s?(part)?\s?(s?{vi_pattern})?', + fr'({vi_pattern}_)?({vi_pattern}\s)?[\(_]?supp?(plement|pl)?l?[\s\._]*({vi_pattern}|[{alphabets}])?\)?\.?', + fr'{vi_pattern}*,?\s?part[\s_]{vi_pattern}(\sof\s{vi_pattern})?(,\sno?\.\s?{vi_pattern})?', + fr'{vi_pattern}*[_\s]?pt?[_\s\.]{vi_pattern}', '(ed|pt|d)\sinside(d|r)', 'p(ublish\s)?a(head\sof\s)?p(rint)?', '預刊文章', '[\u0621-\u064A]+', - f'\[{year_pattern}\]\s(\d\s)?[{alphabets}]+', - f'{vi_pattern}\s\[\+CDROM\]', - f'{vi_pattern}[{separators}\s]?\({vi_pattern}\)(\s{vi_pattern})?', - f'([{alphabets}]+\.)?[{alphabets}]+\.?', - f'[{alphabets}]+-\d+', - f'[{alphabets}]+(_[{alphabets}]+)+', - f'{numero}:?\s?{vi_pattern}(,?\s({year_pattern}|\({vi_pattern}\)))?', + fr'\[{year_pattern}\]\s(\d\s)?[{alphabets}]+', + fr'{vi_pattern}\s\[\+CDROM\]', + fr'{vi_pattern}[{separators}\s]?\({vi_pattern}\)(\s{vi_pattern})?', + fr'([{alphabets}]+\.)?[{alphabets}]+\.?', + fr'[{alphabets}]+-\d+', + fr'[{alphabets}]+(_[{alphabets}]+)+', + fr'{numero}:?\s?{vi_pattern}(,?\s({year_pattern}|\({vi_pattern}\)))?', 'historica\svol\.\s\d+(,\d+(-\d+)?)?', '\d+\(\d+\)\d{2,4}', - f'(\[{year_pattern}\]\s)?(\d+\s)?vl?r(\s\([a-z]+\))?', - f'\({vi_pattern}\/{vi_pattern}\)\s[{alphabets}]+(-[{alphabets}]+)?' + fr'(\[{year_pattern}\]\s)?(\d+\s)?vl?r(\s\([a-z]+\))?', + fr'\({vi_pattern}\/{vi_pattern}\)\s[{alphabets}]+(-[{alphabets}]+)?' ] volumes_valid_patterns = [ 'original\sseries,\svolume\s\d+', - f'(vol(ume)?|tome|cilt)\s?[{separators}]?\s?{vi_pattern}' + fr'(vol(ume)?|tome|cilt)\s?[{separators}]?\s?{vi_pattern}' ] issues_valid_patterns = [ - f'issue[\.,]?\s{vi_pattern}', - f'({vi_pattern}\s)?e?sp?e?(ecial)?[\s_\-\.](issue)?(_number_)?[\s_-]?({vi_pattern})?(["“][{alphabets}\s]+?["”])?', - f'ö(zel)?(\ss(ayı)?|\(special\))?(\s?{vi_pattern})?', - f'({numero}[{separators}\s]?)?hors[{separators}\s]série[{separators}\s]{vi_pattern}', + fr'issue[\.,]?\s{vi_pattern}', + fr'({vi_pattern}\s)?e?sp?e?(ecial)?[\s_\-\.](issue)?(_number_)?[\s_-]?({vi_pattern})?(["“][{alphabets}\s]+?["”])?', + fr'ö(zel)?(\ss(ayı)?|\(special\))?(\s?{vi_pattern})?', + fr'({numero}[{separators}\s]?)?hors[{separators}\s]série[{separators}\s]{vi_pattern}', '특별호', - f'([{alphabets}]+\s{year_pattern}\s)?\(?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|(nov|dec)(ember)?|spring|summer|autumn|winter)(\s{year_pattern})?\)?', - f'{vi_pattern},\spart\s{vi_pattern}\sof\s{vi_pattern}', - f'sayı[{separators}\s]\s?{vi_pattern}', - f'issues?\s{vi_pattern},\s(supplement|part)\s{vi_pattern}', - f'issues?\s{vi_pattern}\.?\spp\.\s[a-z\d]+[^a-z\d]+[a-z\d]+' + fr'([{alphabets}]+\s{year_pattern}\s)?\(?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|(nov|dec)(ember)?|spring|summer|autumn|winter)(\s{year_pattern})?\)?', + fr'{vi_pattern},\spart\s{vi_pattern}\sof\s{vi_pattern}', + fr'sayı[{separators}\s]\s?{vi_pattern}', + fr'issues?\s{vi_pattern},\s(supplement|part)\s{vi_pattern}', + fr'issues?\s{vi_pattern}\.?\spp\.\s[a-z\d]+[^a-z\d]+[a-z\d]+' ] invalid_vi_patterns = { - f'.*?(?:vol\.?(?:ume)?|tome)(?:[{separators}]?\s?){vi_pattern}[\-&\/_,\.:+;\(\)\[\]|\s]*(?:{numero}|issues?)[{separators}|\s]*(?:sp[eé]cial\s)?{vi_pattern}': 'vol_iss', - f'{vi_pattern},\s?{numero}\s?{vi_pattern}': 'vol_iss', - f'tập\s?{vi_pattern},?\s?số\s?{vi_pattern}': 'vol_iss', - f'issues?\s{vi_pattern}\svol\.?(?:ume)?\s{vi_pattern}(?:.*?{year_pattern}.*?)?': 'iss_vol_year', - f"{vi_pattern}\s?\({vi_pattern}'{year_pattern}\)": 'vol_iss_year', - f'cilt[{separators}\s]\s?{vi_pattern}[{separators}\s]sayı[{separators}\s]\s?{vi_pattern}(?:[{separators}\s]\s?temmuz\s{year_pattern})?': 'vol_iss_year', - '&na;|n\/a|not\savailable': 'del', - '[\:\-\.`ё/]': 'del', - f'\${{[{alphabets}]+(\.[{alphabets}]+)?}}': 'del', - f"[&\/_,:+;\|`'#]\s*{vi_pattern}": 'all', - f'[\->+]{vi_pattern}': 'do_nothing', - f"{vi_pattern}[\.+]": "do_nothing", - f"{numero}?[{separators}]?\s?{vi_pattern}[&\/_,:;\|`'\(\[\{{]": 'all', - f'{vi_pattern}\(\)': 'all', - f'n[�?]+{vi_pattern}': 'all', - f'{vi_pattern}(?:â\x80[\x92\x93\x94]|�+|â|\?+){vi_pattern}': 'sep', - f'{vi_pattern}\s?\(first\sserie': 's)' + fr'.*?(?:vol\.?(?:ume)?|tome)(?:[{separators}]?\s?){vi_pattern}[\-&\/_,\.:+;\(\)\[\]|\s]*(?:{numero}|issues?)[{separators}|\s]*(?:sp[eé]cial\s)?{vi_pattern}': 'vol_iss', + fr'{vi_pattern},\s?{numero}\s?{vi_pattern}': 'vol_iss', + fr'tập\s?{vi_pattern},?\s?số\s?{vi_pattern}': 'vol_iss', + fr'issues?\s{vi_pattern}\svol\.?(?:ume)?\s{vi_pattern}(?:.*?{year_pattern}.*?)?': 'iss_vol_year', + fr"{vi_pattern}\s?\({vi_pattern}'{year_pattern}\)": 'vol_iss_year', + fr'cilt[{separators}\s]\s?{vi_pattern}[{separators}\s]sayı[{separators}\s]\s?{vi_pattern}(?:[{separators}\s]\s?temmuz\s{year_pattern})?': 'vol_iss_year', + r'&na;|n\/a|not\savailable': 'del', + r'[\:\-\.`ё/]': 'del', + fr'\${{[{alphabets}]+(\.[{alphabets}]+)?}}': 'del', + fr"[&\/_,:+;\|`'#]\s*{vi_pattern}": 'all', + fr'[\->+]{vi_pattern}': 'do_nothing', + fr"{vi_pattern}[\.+]": "do_nothing", + fr"{numero}?[{separators}]?\s?{vi_pattern}[&\/_,:;\|`'\(\[\{{]": 'all', + fr'{vi_pattern}\(\)': 'all', + fr'n[�?]+{vi_pattern}': 'all', + fr'{vi_pattern}(?:â\x80[\x92\x93\x94]|�+|â|\?+){vi_pattern}': 'sep', + fr'{vi_pattern}\s?\(first\sserie': 's)' } \ No newline at end of file diff --git a/oc_ds_converter/oc_idmanager/arxiv.py b/oc_ds_converter/oc_idmanager/arxiv.py index 49dbe07..c550226 100644 --- a/oc_ds_converter/oc_idmanager/arxiv.py +++ b/oc_ds_converter/oc_idmanager/arxiv.py @@ -83,7 +83,7 @@ def is_valid(self, id_string, get_extra_info=False): info = self.exists(arxiv, get_extra_info=True) self.storage_manager.set_full_value(arxiv,info[1]) return (info[0] and self.syntax_ok(arxiv)), info[1] - validity_check = self.exists(arxiv) and self.syntax_ok(arxiv) + validity_check = self.syntax_ok(arxiv) and self.exists(arxiv) self.storage_manager.set_value(arxiv, validity_check) return validity_check diff --git a/oc_ds_converter/oc_idmanager/crossref.py b/oc_ds_converter/oc_idmanager/crossref.py index cf6c3ed..0c27614 100644 --- a/oc_ds_converter/oc_idmanager/crossref.py +++ b/oc_ds_converter/oc_idmanager/crossref.py @@ -53,7 +53,7 @@ def is_valid(self, cr_member_id, get_extra_info=False): info = self.exists(cr_member_id, get_extra_info=True) self.storage_manager.set_full_value(cr_member_id, info[1]) return (info[0] and self.syntax_ok(cr_member_id)), info[1] - validity_check = self.exists(cr_member_id) and self.syntax_ok(cr_member_id) + validity_check = self.syntax_ok(cr_member_id) and self.exists(cr_member_id) self.storage_manager.set_value(cr_member_id, validity_check) return validity_check diff --git a/oc_ds_converter/oc_idmanager/doi.py b/oc_ds_converter/oc_idmanager/doi.py index df2d564..7af16c0 100644 --- a/oc_ds_converter/oc_idmanager/doi.py +++ b/oc_ds_converter/oc_idmanager/doi.py @@ -124,14 +124,14 @@ def is_valid(self, id_string, get_extra_info=False): info = self.exists(doi, get_extra_info=True) self.storage_manager.set_full_value(doi,info[1]) return (info[0] and self.syntax_ok(doi)), info[1] - validity_check = self.exists(doi) and self.syntax_ok(doi) + validity_check = self.syntax_ok(doi) and self.exists(doi) self.storage_manager.set_value(doi, validity_check) return validity_check def base_normalise(self, id_string): try: id_string = sub( - "\0+", "", sub("\s+", "", unquote(id_string[id_string.index("10.") :])) + r"\0+", "", sub(r"\s+", "", unquote(id_string[id_string.index("10.") :])) ) return id_string.lower().strip() if id_string else None except: @@ -191,7 +191,7 @@ def clean_doi(self, doi:str) -> Tuple[str, dict]: def syntax_ok(self, id_string): if not id_string.startswith(self._p): id_string = self._p+id_string - return True if match("^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$", id_string, re.IGNORECASE) else False + return True if match(r"^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$", id_string, re.IGNORECASE) else False def exists(self, doi_full, get_extra_info=False, allow_extra_api=None): valid_bool = True diff --git a/oc_ds_converter/oc_idmanager/issn.py b/oc_ds_converter/oc_idmanager/issn.py index c9f9c30..f2c66c5 100644 --- a/oc_ds_converter/oc_idmanager/issn.py +++ b/oc_ds_converter/oc_idmanager/issn.py @@ -65,7 +65,7 @@ def check_digit(self,issn): issn = issn[spl:] issn = issn.replace('-', '') if len(issn) != 8: - raise ValueError('ISSN of len 8 or 9 required (e.g. 00000949 or 0000-0949)') + return False ss = sum([int(digit) * f for digit, f in zip(issn, range(8, 1, -1))]) _, mod = divmod(ss, 11) checkdigit = 0 if mod == 0 else 11 - mod diff --git a/oc_ds_converter/oc_idmanager/jid.py b/oc_ds_converter/oc_idmanager/jid.py index d46ef68..965b8b3 100644 --- a/oc_ds_converter/oc_idmanager/jid.py +++ b/oc_ds_converter/oc_idmanager/jid.py @@ -62,7 +62,7 @@ def is_valid(self, jid, get_extra_info=False): info = self.exists(jid, get_extra_info=True) self.storage_manager.set_full_value(jid, info[1]) return (info[0] and self.syntax_ok(jid)), info[1] - validity_check = self.exists(jid) and self.syntax_ok(jid) + validity_check = self.syntax_ok(jid) and self.exists(jid) self.storage_manager.set_value(jid, validity_check) return validity_check diff --git a/oc_ds_converter/oc_idmanager/oc_data_storage/redis_manager.py b/oc_ds_converter/oc_idmanager/oc_data_storage/redis_manager.py index 99e132e..bb23152 100644 --- a/oc_ds_converter/oc_idmanager/oc_data_storage/redis_manager.py +++ b/oc_ds_converter/oc_idmanager/oc_data_storage/redis_manager.py @@ -118,7 +118,7 @@ def del_value(self, id: str) -> None: def delete_storage(self): - self.PROCESS_redis.flushall() + self.PROCESS_redis.flushdb() def get_all_keys(self): result = [x for x in self.PROCESS_redis.scan_iter('*')] diff --git a/oc_ds_converter/oc_idmanager/openalex.py b/oc_ds_converter/oc_idmanager/openalex.py index ca127a3..49d07fe 100644 --- a/oc_ds_converter/oc_idmanager/openalex.py +++ b/oc_ds_converter/oc_idmanager/openalex.py @@ -60,7 +60,7 @@ def is_valid(self, oal_id, get_extra_info=False): info = self.exists(oal_id, get_extra_info=True) self.storage_manager.set_full_value(oal_id,info[1]) return (info[0] and self.syntax_ok(oal_id)), info[1] - validity_check = self.exists(oal_id) and self.syntax_ok(oal_id) + validity_check = self.syntax_ok(oal_id) and self.exists(oal_id) self.storage_manager.set_value(oal_id, validity_check) return validity_check @@ -72,7 +72,7 @@ def normalise(self, id_string, include_prefix=False): else: oal_string = id_string - oal_string = sub("\0+", "", (sub("\s+", "", oal_string))) + oal_string = sub(r"\0+", "", (sub(r"\s+", "", oal_string))) oal_string = oal_string.replace(self._api_works_route, '', 1) oal_string = oal_string.replace(self._api_sources_route, '', 1) diff --git a/oc_ds_converter/oc_idmanager/orcid.py b/oc_ds_converter/oc_idmanager/orcid.py index ebfe533..90ec7cf 100644 --- a/oc_ds_converter/oc_idmanager/orcid.py +++ b/oc_ds_converter/oc_idmanager/orcid.py @@ -70,7 +70,7 @@ def is_valid(self, id_string, get_extra_info=False): info = self.exists(orcid, get_extra_info=True) self.storage_manager.set_full_value(orcid,info[1]) return (info[0] and self.check_digit(orcid) and self.syntax_ok(orcid)), info[1] - validity_check = self.exists(orcid) and self.syntax_ok(orcid) and self.check_digit(orcid) + validity_check = self.syntax_ok(orcid) and self.check_digit(orcid) and self.exists(orcid) self.storage_manager.set_value(orcid, validity_check) return validity_check diff --git a/oc_ds_converter/oc_idmanager/pmcid.py b/oc_ds_converter/oc_idmanager/pmcid.py index b16025f..96d9717 100644 --- a/oc_ds_converter/oc_idmanager/pmcid.py +++ b/oc_ds_converter/oc_idmanager/pmcid.py @@ -72,7 +72,7 @@ def is_valid(self, pmcid, get_extra_info=False): info = self.exists(pmcid, get_extra_info=True) self.storage_manager.set_full_value(pmcid,info[1]) return (info[0] and self.syntax_ok(pmcid)), info[1] - validity_check = self.exists(pmcid) and self.syntax_ok(pmcid) + validity_check = self.syntax_ok(pmcid) and self.exists(pmcid) self.storage_manager.set_value(pmcid, validity_check) return validity_check @@ -85,7 +85,7 @@ def normalise(self, id_string, include_prefix=False): id_string = id_string pmcid_string = sub( - "\0+", "", sub("\s+", "", unquote(id_string[id_string.index("PMC"):])) + r"\0+", "", sub(r"\s+", "", unquote(id_string[id_string.index("PMC"):])) ) return "%s%s" % ( self._p if include_prefix else "", diff --git a/oc_ds_converter/oc_idmanager/pmid.py b/oc_ds_converter/oc_idmanager/pmid.py index 9d7f849..5fba668 100644 --- a/oc_ds_converter/oc_idmanager/pmid.py +++ b/oc_ds_converter/oc_idmanager/pmid.py @@ -85,7 +85,7 @@ def is_valid(self, pmid, get_extra_info=False): info = self.exists(pmid, get_extra_info=True) self.storage_manager.set_full_value(pmid,info[1]) return (info[0] and self.syntax_ok(pmid)), info[1] - validity_check = self.exists(pmid) and self.syntax_ok(pmid) + validity_check = self.syntax_ok(pmid) and self.exists(pmid) self.storage_manager.set_value(pmid, validity_check) return validity_check @@ -95,7 +95,7 @@ def is_valid(self, pmid, get_extra_info=False): def normalise(self, id_string, include_prefix=False): id_string = str(id_string) try: - pmid_string = sub("^0+", "", sub("\0+", "", (sub("[^\d+]", "", id_string)))) + pmid_string = sub(r"^0+", "", sub(r"\0+", "", (sub(r"[^\d+]", "", id_string)))) return "%s%s" % (self._p if include_prefix else "", pmid_string) except: # Any error in processing the PMID will return None @@ -104,7 +104,7 @@ def normalise(self, id_string, include_prefix=False): def syntax_ok(self, id_string): if not id_string.startswith(self._p): id_string = self._p + id_string - return True if match("^pmid:[1-9]\d*$", id_string) else False + return True if match(r"^pmid:[1-9]\d*$", id_string) else False def exists(self, pmid_full, get_extra_info=False, allow_extra_api=None): valid_bool = True @@ -166,8 +166,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_tit, match_tit in enumerate(match_title, start=1): m_title = match_tit.group() if m_title: - ts = re.sub("\s+", " ", m_title) - t = re.sub("\n", " ", ts) + ts = re.sub(r"\s+", " ", m_title) + t = re.sub(r"\n", " ", ts) norm_title = t.strip() if norm_title is not None: title = norm_title @@ -183,8 +183,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_aut, match_au in enumerate(fa_aut, start=1): m_aut = match_au.group() if m_aut: - fau = re.sub("\s+", " ", m_aut) - nlfau = re.sub("\n", " ", fau) + fau = re.sub(r"\s+", " ", m_aut) + nlfau = re.sub(r"\n", " ", fau) norm_fau = nlfau.strip() if norm_fau is not None: authors.add(norm_fau) @@ -200,7 +200,7 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): re.IGNORECASE, ).group(1) re_search = re.search( - "(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))", + r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))", date, re.IGNORECASE, ) @@ -210,7 +210,7 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): pmid_date = datetime.strftime(datetime_object, "%Y-%m-%d") else: re_search = re.search( - "(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", + r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", date, re.IGNORECASE, ) @@ -219,9 +219,9 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): datetime_object = datetime.strptime(src, "%Y %b") pmid_date = datetime.strftime(datetime_object, "%Y-%m") else: - re_search = re.search("(\d{4})", date) + re_search = re.search(r"(\d{4})", date) if re_search is not None: - src = re.search("(\d{4})", date).group(0) + src = re.search(r"(\d{4})", date).group(0) datetime_object = datetime.strptime(src, "%Y") pmid_date = datetime.strftime(datetime_object, "%Y") else: @@ -251,8 +251,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_title, match_tit in enumerate(fa_jur_title, start=1): m_title = match_tit.group() if m_title: - s_jt = re.sub("\s+", " ", m_title) - n_jt = re.sub("\n", " ", s_jt) + s_jt = re.sub(r"\s+", " ", m_title) + n_jt = re.sub(r"\n", " ", s_jt) norm_jour = n_jt.strip() if norm_jour is not None: jur_title = norm_jour @@ -269,7 +269,7 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_volume, match_vol in enumerate(fa_volume, start=1): m_vol = match_vol.group() if m_vol: - vol = re.sub("\s+", " ", m_vol) + vol = re.sub(r"\s+", " ", m_vol) norm_volume = vol.strip() if norm_volume is not None: volume = norm_volume @@ -285,8 +285,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_issue, match_issue in enumerate(fa_issue, start=1): m_issue = match_issue.group() if m_issue: - s_issue = re.sub("\s+", " ", m_issue) - n_issue = re.sub("\n", " ", s_issue) + s_issue = re.sub(r"\s+", " ", m_issue) + n_issue = re.sub(r"\n", " ", s_issue) norm_issue = n_issue.strip() if norm_issue is not None: issue = norm_issue @@ -302,8 +302,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_pag, match_pag in enumerate(fa_pag, start=1): m_pag = match_pag.group() if m_pag: - s_pg = re.sub("\s+", " ", m_pag) - n_pg = re.sub("\n", " ", s_pg) + s_pg = re.sub(r"\s+", " ", m_pag) + n_pg = re.sub(r"\n", " ", s_pg) norm_pag = n_pg.strip() if norm_pag is not None: pag = norm_pag @@ -319,8 +319,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_types, match_types in enumerate(types, start=1): m_type = match_types.group() if m_type: - s_ty = re.sub("\s+", " ", m_type) - b_ty = re.sub("\n", " ", s_ty) + s_ty = re.sub(r"\s+", " ", m_type) + b_ty = re.sub(r"\n", " ", s_ty) norm_type = b_ty.strip().lower() if norm_type is not None: pub_types.add(norm_type) @@ -336,8 +336,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_publishers, match_publishers in enumerate(publishers, start=1): m_publishers = match_publishers.group() if m_publishers: - s_pbs = re.sub("\s+", " ", m_publishers) - n_pbs = re.sub("\n", " ", s_pbs) + s_pbs = re.sub(r"\s+", " ", m_publishers) + n_pbs = re.sub(r"\n", " ", s_pbs) norm_pbs = n_pbs.strip() if norm_pbs is not None: publisher.add(norm_pbs) @@ -353,8 +353,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_editors, match_editors in enumerate(editors, start=1): m_editors = match_editors.group() if m_editors: - s_ed = re.sub("\s+", " ", m_editors) - n_ed = re.sub("\n", " ", s_ed) + s_ed = re.sub(r"\s+", " ", m_editors) + n_ed = re.sub(r"\n", " ", s_ed) norm_ed = n_ed.strip() if norm_ed is not None: editor.add(norm_ed) @@ -370,8 +370,8 @@ def extra_info(self, api_response, choose_api=None, info_dict={}): for matchNum_doi, match_doi in enumerate(map_doi, start=1): m_doi = match_doi.group() if m_doi: - id = re.sub("\s+", " ", m_doi) - n_id = re.sub("\n", " ", id) + id = re.sub(r"\s+", " ", m_doi) + n_id = re.sub(r"\n", " ", id) n_id_strip = n_id.strip() if n_id_strip.endswith('[doi]'): diff --git a/oc_ds_converter/oc_idmanager/ror.py b/oc_ds_converter/oc_idmanager/ror.py index 28ad8a3..f6261ab 100644 --- a/oc_ds_converter/oc_idmanager/ror.py +++ b/oc_ds_converter/oc_idmanager/ror.py @@ -48,7 +48,7 @@ def is_valid(self, ror_id, get_extra_info=False): self._data[ror_id] = info[1] return (info[0] and self.syntax_ok(ror_id)), info[1] self._data[ror_id] = dict() - self._data[ror_id]["valid"] = True if (self.exists(ror_id) and self.syntax_ok(ror_id)) else False + self._data[ror_id]["valid"] = True if (self.syntax_ok(ror_id) and self.exists(ror_id)) else False return self._data[ror_id].get("valid") if get_extra_info: return self._data[ror_id].get("valid"), self._data[ror_id] @@ -61,7 +61,7 @@ def normalise(self, id_string, include_prefix=False): else: ror_id_string = id_string # normalize + remove protocol and domain name if they are included in the ID - ror_id_string = sub("\0+", "", sub("(https://)?ror\\.org/", "", sub('\s+', "", unquote(ror_id_string)))) + ror_id_string = sub(r"\0+", "", sub(r"^(https?://)?(www\.)?(ror\.org/)?", "", sub(r'\s+', "", unquote(ror_id_string)))) return "%s%s" % ( self._p if include_prefix else "", @@ -70,12 +70,13 @@ def normalise(self, id_string, include_prefix=False): except: # Any error in processing the ROR ID will return None return None - + def syntax_ok(self, id_string): if not id_string.startswith("ror:"): id_string = self._p + id_string - # the regex admits the identifier with or without the protocol and the domain name - return True if match(r"^ror:((https:\/\/)?ror\.org\/)?0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}$", id_string) else False + + # Check if the ID matches the correct format without protocol or domain + return True if match(r"^ror:0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}$", id_string) else False def exists(self, ror_id_full, get_extra_info=False, allow_extra_api=None): valid_bool = True diff --git a/oc_ds_converter/oc_idmanager/url.py b/oc_ds_converter/oc_idmanager/url.py index 7d4e1d2..dd61116 100644 --- a/oc_ds_converter/oc_idmanager/url.py +++ b/oc_ds_converter/oc_idmanager/url.py @@ -48,7 +48,7 @@ def is_valid(self, url, get_extra_info=False): self._data[url] = info[1] return (info[0] and self.syntax_ok(url)), info[1] self._data[url] = dict() - self._data[url]["valid"] = True if (self.exists(url) and self.syntax_ok(url)) else False + self._data[url]["valid"] = True if (self.syntax_ok(url) and self.exists(url)) else False return self._data[url].get("valid") if get_extra_info: @@ -85,49 +85,37 @@ def exists(self, url_full, get_extra_info=False, allow_extra_api=None): if self._use_api_service: url = self.normalise(url_full) if url is not None: - tentative = 3 - while tentative: - tentative -= 1 - try: - r = get(self._scheme_https + url, - headers=self._headers, - timeout=30, - ) - if r.status_code == 200: - if get_extra_info: - return True, {"valid": True} - return True - elif r.status_code == 404: - if get_extra_info: - return False, {"valid": False} - return False - - except ReadTimeout: - # Do nothing, just try again - pass - except ConnectionError: - # Sleep 5 seconds, then try again - sleep(5) - - try: - r = get(self._scheme_http + url, - headers=self._headers, - timeout=30, + variations = [ + f"https://www.{url}", + f"https://{url}", + f"http://www.{url}", + f"http://{url}" + ] + + for variation in variations: + tentative = 3 + while tentative: + tentative -= 1 + try: + r = get(variation, + headers=self._headers, + timeout=30, ) - if r.status_code == 200: - if get_extra_info: - return True, {"valid": True} - return True - elif r.status_code == 404: - if get_extra_info: - return False, {"valid": False} - return False - except ReadTimeout: - # Do nothing, just try again - pass - except ConnectionError: - # Sleep 5 seconds, then try again - sleep(5) + if r.status_code == 200: + if get_extra_info: + return True, {"valid": True} + return True + elif r.status_code == 404: + if get_extra_info: + return False, {"valid": False} + return False + + except ReadTimeout: + # Do nothing, just try again + pass + except ConnectionError: + # Sleep 5 seconds, then try again + sleep(5) valid_bool = False diff --git a/oc_ds_converter/oc_idmanager/viaf.py b/oc_ds_converter/oc_idmanager/viaf.py index 4efe047..802a4f2 100644 --- a/oc_ds_converter/oc_idmanager/viaf.py +++ b/oc_ds_converter/oc_idmanager/viaf.py @@ -67,7 +67,7 @@ def is_valid(self, viaf_id, get_extra_info=False): info = self.exists(viaf, get_extra_info=True) self.storage_manager.set_full_value(viaf,info[1]) return (info[0] and self.syntax_ok(viaf)), info[1] - validity_check = self.exists(viaf) and self.syntax_ok(viaf) + validity_check = self.syntax_ok(viaf) and self.exists(viaf) self.storage_manager.set_value(viaf, validity_check) return validity_check diff --git a/oc_ds_converter/oc_idmanager/wikidata.py b/oc_ds_converter/oc_idmanager/wikidata.py index 0e0529e..0213fcf 100644 --- a/oc_ds_converter/oc_idmanager/wikidata.py +++ b/oc_ds_converter/oc_idmanager/wikidata.py @@ -48,7 +48,7 @@ def is_valid(self, wikidata_id, get_extra_info=False): self._data[wikidata_id] = info[1] return (info[0] and self.syntax_ok(wikidata_id)), info[1] self._data[wikidata_id] = dict() - self._data[wikidata_id]["valid"] = True if (self.exists(wikidata_id) and self.syntax_ok(wikidata_id)) else False + self._data[wikidata_id]["valid"] = True if (self.syntax_ok(wikidata_id) and self.exists(wikidata_id)) else False return self._data[wikidata_id].get("valid") if get_extra_info: return self._data[wikidata_id].get("valid"), self._data[wikidata_id] diff --git a/oc_ds_converter/oc_idmanager/wikipedia.py b/oc_ds_converter/oc_idmanager/wikipedia.py index fda232a..61db67e 100644 --- a/oc_ds_converter/oc_idmanager/wikipedia.py +++ b/oc_ds_converter/oc_idmanager/wikipedia.py @@ -49,8 +49,7 @@ def is_valid(self, wikipedia_id, get_extra_info=False): self._data[wikipedia_id] = info[1] return (info[0] and self.syntax_ok(wikipedia_id)), info[1] self._data[wikipedia_id] = dict() - self._data[wikipedia_id]["valid"] = True if (self.exists(wikipedia_id) and self.syntax_ok( - wikipedia_id)) else False + self._data[wikipedia_id]["valid"] = True if (self.syntax_ok(wikipedia_id) and self.exists(wikipedia_id)) else False return self._data[wikipedia_id].get("valid") if get_extra_info: return self._data[wikipedia_id].get("valid"), self._data[wikipedia_id] diff --git a/oc_ds_converter/pubmed/finder_nih.py b/oc_ds_converter/pubmed/finder_nih.py index 0621cbe..a1218f1 100644 --- a/oc_ds_converter/pubmed/finder_nih.py +++ b/oc_ds_converter/pubmed/finder_nih.py @@ -69,7 +69,7 @@ def _get_date(self, txt_obj): re.IGNORECASE, ).group(1) re_search = re.search( - "(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))", + r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+((3[0-1])|([1-2][0-9])|([0]?[1-9]))", date, re.IGNORECASE, ) @@ -79,7 +79,7 @@ def _get_date(self, txt_obj): pmid_date = datetime.strftime(datetime_object, "%Y-%m-%d") else: re_search = re.search( - "(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", + r"(\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", date, re.IGNORECASE, ) @@ -88,9 +88,9 @@ def _get_date(self, txt_obj): datetime_object = datetime.strptime(src, "%Y %b") pmid_date = datetime.strftime(datetime_object, "%Y-%m") else: - re_search = re.search("(\d{4})", date) + re_search = re.search(r"(\d{4})", date) if re_search is not None: - src = re.search("(\d{4})", date).group(0) + src = re.search(r"(\d{4})", date).group(0) datetime_object = datetime.strptime(src, "%Y") pmid_date = datetime.strftime(datetime_object, "%Y") return pmid_date diff --git a/oc_ds_converter/run/crossref_process.py b/oc_ds_converter/run/crossref_process.py index 94b9377..29b0f8a 100644 --- a/oc_ds_converter/run/crossref_process.py +++ b/oc_ds_converter/run/crossref_process.py @@ -130,7 +130,6 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat if os.path.exists(lock_file): os.remove(lock_file) pbar.close() if verbose else None - # added to avoid order-releted issues in sequential tests runs if testing: storage_manager = get_storage_manager(storage_path, redis_storage_manager, testing=testing) diff --git a/pyproject.toml b/pyproject.toml index 981d996..cdaba29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "oc-ds-converter" -version = "1.0.3" +version = "1.0.4" description = "A library for converting metadata provided by various data sources, e.g. Crossref, DataCite, JaLC, and mEDRA, into the format used by OpenCitations Meta." authors = ["arcangelo7 "] license = "ISC" @@ -8,7 +8,7 @@ readme = "README.md" packages = [{include = "oc_ds_converter"}] [tool.poetry.dependencies] -python = "^3.8" +python = "^3.8,<3.14" beautifulsoup4 = "^4.12.1" requests = "^2.28.2" tqdm = "^4.65.0" diff --git a/test/crossref_process_test.py b/test/crossref_process_test.py index 7a6ecfb..9c439ef 100644 --- a/test/crossref_process_test.py +++ b/test/crossref_process_test.py @@ -227,14 +227,12 @@ def test_any_db_creation_redis_no_testing(self): except: run_test = False print("test skipped: 'test_any_db_creation_redis_no_testing': Connect to redis before running the test") - if run_test: rsm.del_value("TEST VALUE") if not len(rsm.get_all_keys()): preprocess(crossref_json_dir=self.targz_cited_input, publishers_filepath=self.publisher_mapping, orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=True, - storage_path=self.db, cache=self.cache) - + storage_path=self.db, cache=self.cache, verbose=True) rsm.delete_storage() diff --git a/test/idm_orcid_test.py b/test/idm_orcid_test.py index cab2459..6c8021e 100644 --- a/test/idm_orcid_test.py +++ b/test/idm_orcid_test.py @@ -1,20 +1,19 @@ -from oc_ds_converter.oc_idmanager.orcid import ORCIDManager import json -import sqlite3 import os.path +import re import unittest from os import makedirs from os.path import exists, join -import xmltodict from oc_ds_converter.oc_idmanager import * -from oc_ds_converter.oc_idmanager.base import IdentifierManager -from requests import ReadTimeout, get -from requests.exceptions import ConnectionError -from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager -from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager -from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager -from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import \ + InMemoryStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import \ + RedisStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import \ + SqliteStorageManager +from oc_ds_converter.oc_idmanager.orcid import ORCIDManager + class orcidIdentifierManagerTest(unittest.TestCase): """This class aim at testing identifiers manager.""" @@ -75,8 +74,18 @@ def test_exists(self): with self.subTest(msg="get_extra_info=True, allow_extra_api=None"): orcid_manager = ORCIDManager() output = orcid_manager.exists(self.valid_orcid_2, get_extra_info=True, allow_extra_api=None) - expected_output = (True, {'id': '0000-0001-5506-523X', 'valid': True, 'family_name': 'Shotton', 'given_name': 'David', 'email': "", 'external_identifiers': {}, 'submission_date': '2012-10-31', 'update_date': '2024-03-19'}) - self.assertEqual(output, expected_output) + self.assertTrue(output[0]) # Check if exists + info = output[1] + self.assertEqual(info['id'], '0000-0001-5506-523X') + self.assertTrue(info['valid']) + self.assertEqual(info['family_name'], 'Shotton') + self.assertEqual(info['given_name'], 'David') + self.assertEqual(info['email'], "") + self.assertEqual(info['external_identifiers'], {}) + self.assertEqual(info['submission_date'], '2012-10-31') + # Check if update_date is a valid date string and not earlier than submission_date + self.assertTrue(re.match(r'\d{4}-\d{2}-\d{2}', info['update_date'])) + self.assertGreaterEqual(info['update_date'], info['submission_date']) with self.subTest(msg="get_extra_info=False, allow_extra_api=None"): orcid_manager = ORCIDManager() output = orcid_manager.exists(orcid_manager.normalise(self.valid_orcid_1), get_extra_info=False, allow_extra_api=None) diff --git a/test/pubmed_processing_test.py b/test/pubmed_processing_test.py index a384a6a..b863e6a 100644 --- a/test/pubmed_processing_test.py +++ b/test/pubmed_processing_test.py @@ -364,8 +364,8 @@ def test_redis_db(self): self.assertEqual(tabular_data_w_redis_data['id'], 'pmid:5 doi:10.1016/a_fake_doi') pubmed_processor.doi_m.storage_manager.delete_storage() - pubmed_processor.BR_redis.flushall() - pubmed_processor.RA_redis.flushall() + pubmed_processor.BR_redis.flushdb() + pubmed_processor.RA_redis.flushdb() def test_get_citations(self): inp_ent = {'pmid': '5', 'doi': '10.1016/0006-291x(75)90508-2',