From 024e94713adee46fc6eebb75b0602bbf414fbcc2 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 29 Feb 2024 14:57:42 +0100 Subject: [PATCH 01/19] Add code that retrieves citations --- asreviewcontrib/datatools/snowballing.py | 119 +++++++++++++++++++++++ setup.py | 2 +- tests/test_snowballing.py | 36 +++++++ 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 asreviewcontrib/datatools/snowballing.py create mode 100644 tests/test_snowballing.py diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py new file mode 100644 index 0000000..8c76f95 --- /dev/null +++ b/asreviewcontrib/datatools/snowballing.py @@ -0,0 +1,119 @@ +import os + +import pyalex +from dotenv import load_dotenv +from pyalex import Works + +load_dotenv() + +# OpenAlex polite pool: +# https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool +pyalex.config.email = os.environ.get("OPENALEX_EMAIL") +# Maximum number of statements joined by a logical OR in a call to OpenAlex. +OPENALEX_MAX_OR_LENGTH = 100 +OPENALEX_MAX_PAGE_LENGTH = 200 +# OpenAlex data fields to retrieve. +USED_FIELDS = [ + "id", + "doi", + "title", + "abstract_inverted_index", + "referenced_works", + "publication_date", +] + + +def forwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: + """Get all works citing a work with the OpenAlex identifier from the list. + + Parameters + ---------- + identifiers : list[str] + List of OpenAlex identifiers. + + Returns + ------- + dict[str, list[dict]] + Dictionary of the form + `{input OpenAlex identifier : list of OpenAlex works}` + where each work in the list references the work with the input identifier and + it is a dictionary of the form `{field_name : field_value}`. + """ + citing_works = {} + for idx, openalex_id in enumerate(identifiers): + print(f"{idx}. 
Getting cited works for {openalex_id}") + works_citing_id = Works().filter(cites=openalex_id).select(USED_FIELDS).get() + citing_works[openalex_id] = [ + { + key: work[key] + for key in [ + col if col != "abstract_inverted_index" else "abstract" + for col in USED_FIELDS + ] + } + for work in works_citing_id + ] + return citing_works + + +def backwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: + """Get all works cited by a work with the OpenAlex identifier from the list. + + Parameters + ---------- + identifiers : list[str] + List of OpenAlex identifiers. + + Returns + ------- + dict[str, list[dict]] + Dictionary of the form + `{input OpenAlex identifier : list of OpenAlex works}` + where each work in the list is referenced by the work with the input identifier + and it is a dictionary of the form `{field_name : field_value}`. + """ + # Get the referenced works. + referenced_works = {} + page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) + for i in range(0, len(identifiers), page_length): + fltr = "|".join(identifiers[i : i + page_length]) + pager = ( + Works() + .filter(openalex=fltr) + .select("id,referenced_works") + .paginate(per_page=page_length) + ) + for page in pager: + for work in page: + referenced_works[work["id"]] = work["referenced_works"] + + # Get the fields for the referenced works. + all_identifiers = [] + for reference_list in referenced_works.values(): + all_identifiers += reference_list + + all_referenced_works = {} + for i in range(0, len(all_identifiers), page_length): + fltr = "|".join(all_identifiers[i : i + page_length]) + pager = ( + Works() + .filter(openalex=fltr) + .select(USED_FIELDS) + .paginate(per_page=page_length) + ) + for page in pager: + for work in page: + all_referenced_works[work["id"]] = { + key: work[key] + for key in [ + col if col != "abstract_inverted_index" else "abstract" + for col in USED_FIELDS + ] + } + + # Connect the referenced works back to the input works. 
+ for identifier, ref_id_list in referenced_works.items(): + referenced_works[identifier] = [ + all_referenced_works[ref_id] for ref_id in ref_id_list + ] + return referenced_works diff --git a/setup.py b/setup.py index 96798d4..ef61ffe 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ keywords="asreview datatools", packages=find_namespace_packages(include=["asreviewcontrib.*"]), python_requires=">=3.7", - install_requires=["asreview>=1.1,<2", "pandas"], + install_requires=["asreview>=1.1,<2", "pandas", "pyalex"], extras_require={}, entry_points={ "asreview.entry_points": [ diff --git a/tests/test_snowballing.py b/tests/test_snowballing.py new file mode 100644 index 0000000..59a537b --- /dev/null +++ b/tests/test_snowballing.py @@ -0,0 +1,36 @@ +from asreviewcontrib.datatools.snowballing import ( + backwards_snowballing, + forwards_snowballing, +) + + +def test_backwards_snowballing(): + identifiers = [ + "https://openalex.org/W4281483266", + "https://openalex.org/W2008620264", + ] + + backwards_citations = backwards_snowballing(identifiers) + + assert "https://openalex.org/W1864285629" in [ + field_dict["id"] for field_dict in backwards_citations[identifiers[0]] + ] + assert "https://openalex.org/W950821216" in [ + field_dict["id"] for field_dict in backwards_citations[identifiers[1]] + ] + + +def test_forwards_snowballing(): + identifiers = [ + "https://openalex.org/W4281483266", + "https://openalex.org/W2008620264", + ] + + forwards_citations = forwards_snowballing(identifiers) + + assert "https://openalex.org/W4386305682" in [ + field_dict["id"] for field_dict in forwards_citations[identifiers[0]] + ] + assert "https://openalex.org/W2124637492" in [ + field_dict["id"] for field_dict in forwards_citations[identifiers[1]] + ] From e560b43a7cb504110520b5594934169400e09ab2 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 29 Feb 2024 17:21:34 +0100 Subject: [PATCH 02/19] Connect to entrypoint --- asreviewcontrib/datatools/entrypoint.py | 12 ++- 
asreviewcontrib/datatools/snowballing.py | 97 +++++++++++++++++++++--- tests/test_snowballing.py | 12 +-- 3 files changed, 102 insertions(+), 19 deletions(-) diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index c23cafa..0a9c517 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -9,10 +9,12 @@ from asreviewcontrib.datatools.convert import convert from asreviewcontrib.datatools.describe import _parse_arguments_describe from asreviewcontrib.datatools.describe import describe +from asreviewcontrib.datatools.snowballing import _parse_arguments_snowballing +from asreviewcontrib.datatools.snowballing import snowballing from asreviewcontrib.datatools.stack import _parse_arguments_vstack from asreviewcontrib.datatools.stack import vstack -DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack"] +DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowballing"] class DataEntryPoint(BaseEntryPoint): @@ -27,9 +29,8 @@ def __init__(self): self.version = __version__ def execute(self, argv): - + print(argv) if len(argv) > 1 and argv[0] in DATATOOLS: - if argv[0] == "describe": args_describe_parser = _parse_arguments_describe() args_describe = vars(args_describe_parser.parse_args(argv[1:])) @@ -97,7 +98,10 @@ def execute(self, argv): order=args_compose.hierarchy, resolve=args_compose.conflict_resolve, ) - + if argv[0] == "snowballing": + args_snowballing_parser = _parse_arguments_snowballing() + args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:])) + snowballing(**args_snowballing) if argv[0] == "vstack": args_vstack_parser = _parse_arguments_vstack() args_vstack = args_vstack_parser.parse_args(argv[1:]) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 8c76f95..0b9650c 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -1,14 +1,11 @@ 
-import os +import argparse +from pathlib import Path +import pandas as pd import pyalex -from dotenv import load_dotenv +from asreview import ASReviewData, load_data from pyalex import Works -load_dotenv() - -# OpenAlex polite pool: -# https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool -pyalex.config.email = os.environ.get("OPENALEX_EMAIL") # Maximum number of statements joined by a logical OR in a call to OpenAlex. OPENALEX_MAX_OR_LENGTH = 100 OPENALEX_MAX_PAGE_LENGTH = 200 @@ -23,7 +20,7 @@ ] -def forwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: +def forward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: """Get all works citing a work with the OpenAlex identifier from the list. Parameters @@ -56,7 +53,7 @@ def forwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: return citing_works -def backwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: +def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: """Get all works cited by a work with the OpenAlex identifier from the list. Parameters @@ -117,3 +114,85 @@ def backwards_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: all_referenced_works[ref_id] for ref_id in ref_id_list ] return referenced_works + + +def snowballing( + input_path: Path, + output_path: Path, + forward: bool, + backward: bool, + use_all: bool = False, + email: str = None, +) -> None: + data = load_data(input_path) + + if not (forward or backward): + raise ValueError("At least one of 'forward' or 'backward' should be True.") + + if "openalex_id" not in data.df.columns: + raise ValueError( + "Dataset should contain a column 'openalex_id' containing OpenAlex" + " identifiers." 
+ ) + + if not use_all: + identifiers = data.df.loc[ + data.included & data.df.openalex_id.notna(), "openalex_id" + ].to_list() + else: + identifiers = data.df["openalex_id"].dropna().to_list() + + if email is not None: + pyalex.config.email = email + + if forward: + forward_data = forward_snowballing(identifiers) + else: + forward_data = {} + if backward: + backward_data = backward_snowballing(identifiers) + else: + backward_data = {} + + all_works = [] + for works_list in forward_data.values(): + all_works += works_list + for works_list in backward_data.values(): + all_works += works_list + output_data = pd.DataFrame(all_works) + output_data.drop_duplicates(subset=["id"], inplace=True) + output_data.rename({"id": "openalex_id"}, axis=1, inplace=True) + output_data = ASReviewData(output_data) + output_data.to_file(output_path) + + +def _parse_arguments_snowballing(): + parser = argparse.ArgumentParser(prog="asreview data snowballing") + parser.add_argument( + "input_path", type=str, help="The file path of the input dataset." + ) + parser.add_argument( + "output_path", type=str, help="The file path of the output dataset." + ) + parser.add_argument("--forward", type=bool, help="Do forward snowballing.") + parser.add_argument("--backward", type=bool, help="Do backward snowballing.") + parser.add_argument( + "--use_all", + type=bool, + default=False, + required=False, + help=( + "Do snowballing on all records in the dataset, not just the included ones." + ), + ) + parser.add_argument( + "--email", + type=str, + required=False, + help=( + "Email address to send along with requests to OpenAlex. This will make" + " requests faster. 
See also " + "https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool" + ), + ) + return parser diff --git a/tests/test_snowballing.py b/tests/test_snowballing.py index 59a537b..f5041de 100644 --- a/tests/test_snowballing.py +++ b/tests/test_snowballing.py @@ -1,16 +1,16 @@ from asreviewcontrib.datatools.snowballing import ( - backwards_snowballing, - forwards_snowballing, + backward_snowballing, + forward_snowballing, ) -def test_backwards_snowballing(): +def test_backward_snowballing(): identifiers = [ "https://openalex.org/W4281483266", "https://openalex.org/W2008620264", ] - backwards_citations = backwards_snowballing(identifiers) + backwards_citations = backward_snowballing(identifiers) assert "https://openalex.org/W1864285629" in [ field_dict["id"] for field_dict in backwards_citations[identifiers[0]] @@ -20,13 +20,13 @@ def test_backwards_snowballing(): ] -def test_forwards_snowballing(): +def test_forward_snowballing(): identifiers = [ "https://openalex.org/W4281483266", "https://openalex.org/W2008620264", ] - forwards_citations = forwards_snowballing(identifiers) + forwards_citations = forward_snowballing(identifiers) assert "https://openalex.org/W4386305682" in [ field_dict["id"] for field_dict in forwards_citations[identifiers[0]] From bc94d9df2fa05ef54bb8f41b4f6f60d85e571276 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 06:56:08 +0100 Subject: [PATCH 03/19] Add `openalex_from_doi` --- asreviewcontrib/datatools/snowballing.py | 67 ++++++++++++++++-------- tests/test_snowballing.py | 15 ++++++ 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 0b9650c..19c5ced 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -4,7 +4,6 @@ import pandas as pd import pyalex from asreview import ASReviewData, load_data -from pyalex import Works # Maximum number of 
statements joined by a logical OR in a call to OpenAlex. OPENALEX_MAX_OR_LENGTH = 100 @@ -39,7 +38,9 @@ def forward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: citing_works = {} for idx, openalex_id in enumerate(identifiers): print(f"{idx}. Getting cited works for {openalex_id}") - works_citing_id = Works().filter(cites=openalex_id).select(USED_FIELDS).get() + works_citing_id = ( + pyalex.Works().filter(cites=openalex_id).select(USED_FIELDS).get() + ) citing_works[openalex_id] = [ { key: work[key] @@ -74,15 +75,13 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) for i in range(0, len(identifiers), page_length): fltr = "|".join(identifiers[i : i + page_length]) - pager = ( - Works() + for work in ( + pyalex.Works() .filter(openalex=fltr) .select("id,referenced_works") .paginate(per_page=page_length) - ) - for page in pager: - for work in page: - referenced_works[work["id"]] = work["referenced_works"] + ): + referenced_works[work["id"]] = work["referenced_works"] # Get the fields for the referenced works. all_identifiers = [] @@ -92,21 +91,19 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: all_referenced_works = {} for i in range(0, len(all_identifiers), page_length): fltr = "|".join(all_identifiers[i : i + page_length]) - pager = ( - Works() + for work in ( + pyalex.Works() .filter(openalex=fltr) .select(USED_FIELDS) - .paginate(per_page=page_length) - ) - for page in pager: - for work in page: - all_referenced_works[work["id"]] = { - key: work[key] - for key in [ - col if col != "abstract_inverted_index" else "abstract" - for col in USED_FIELDS - ] - } + .get(per_page=page_length) + ): + all_referenced_works[work["id"]] = { + key: work[key] + for key in [ + col if col != "abstract_inverted_index" else "abstract" + for col in USED_FIELDS + ] + } # Connect the referenced works back to the input works. 
for identifier, ref_id_list in referenced_works.items(): @@ -116,6 +113,34 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: return referenced_works +def openalex_from_doi(dois: list[str]) -> dict[str, str]: + """Get the OpenAlex identifiers corresponding to a list of DOIs. + + Parameters + ---------- + dois : list[str] + List of DOIs. + + Returns + ------- + dict[str, str] + Dictionary {doi: openalex_id}. If there was no OpenAlex identifier found for a + DOI, the corresponding value will be None. + """ + page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) + id_mapping = {doi: None for doi in dois} + for i in range(0, len(dois), page_length): + fltr = "|".join(dois[i : i + page_length]) + for work in ( + pyalex.Works() + .filter(doi=fltr) + .select(["id", "doi"]) + .get(per_page=page_length) + ): + id_mapping[work["doi"]] = work["id"] + return id_mapping + + def snowballing( input_path: Path, output_path: Path, diff --git a/tests/test_snowballing.py b/tests/test_snowballing.py index f5041de..949cfbb 100644 --- a/tests/test_snowballing.py +++ b/tests/test_snowballing.py @@ -1,9 +1,24 @@ from asreviewcontrib.datatools.snowballing import ( backward_snowballing, forward_snowballing, + openalex_from_doi, ) +def test_openalex_from_doi(): + dois = [ + "https://doi.org/10.1042/cs20220150", + "https://doi.org/10.1042/bst20220734", + "not_a_doi", + ] + + assert openalex_from_doi(dois) == { + "https://doi.org/10.1042/cs20220150": "https://openalex.org/W4386305682", + "https://doi.org/10.1042/bst20220734": "https://openalex.org/W4312006214", + "not_a_doi": None, + } + + def test_backward_snowballing(): identifiers = [ "https://openalex.org/W4281483266", From 8ae51cb6b740b8315a5a71c49bc7c15d402e541f Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 07:09:25 +0100 Subject: [PATCH 04/19] Use `openalex_from_doi` in snowballing --- asreviewcontrib/datatools/snowballing.py | 19 ++++++++++++++++--- 1 file changed, 16 
insertions(+), 3 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 19c5ced..d3643f0 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -154,11 +154,24 @@ def snowballing( if not (forward or backward): raise ValueError("At least one of 'forward' or 'backward' should be True.") + # Add OpenAlex identifiers if not available. if "openalex_id" not in data.df.columns: - raise ValueError( - "Dataset should contain a column 'openalex_id' containing OpenAlex" - " identifiers." + if "doi" not in data.df.columns: + raise ValueError( + "Dataset should contain a column 'openalex_id' containing OpenAlex" + " identifiers or a column 'doi' containing DOIs." + ) + id_mapping = openalex_from_doi(data.df.doi.to_list()) + n_openalex_ids = len( + openalex_id + for openalex_id in id_mapping.values() + if openalex_id is not None ) + print( + f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(id_mapping)}" + " records. Performing snowballing for those records." + ) + data["openalex_id"] = [id_mapping[doi] for doi in data.df.doi] if not use_all: identifiers = data.df.loc[ From a1ae54724c901f479785ea4a00eb433f1b88496d Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 07:16:25 +0100 Subject: [PATCH 05/19] Change order of operations If snowballing is only performed on the included records, we now also only look for OpenAlex identifier for those records. 
--- asreviewcontrib/datatools/snowballing.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index d3643f0..d72dc3e 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -149,19 +149,23 @@ def snowballing( use_all: bool = False, email: str = None, ) -> None: - data = load_data(input_path) - if not (forward or backward): raise ValueError("At least one of 'forward' or 'backward' should be True.") + data = load_data(input_path) + if use_all: + data = data.df + else: + data = data.df.loc[data.included] + # Add OpenAlex identifiers if not available. - if "openalex_id" not in data.df.columns: - if "doi" not in data.df.columns: + if "openalex_id" not in data.columns: + if "doi" not in data.columns: raise ValueError( "Dataset should contain a column 'openalex_id' containing OpenAlex" " identifiers or a column 'doi' containing DOIs." ) - id_mapping = openalex_from_doi(data.df.doi.to_list()) + id_mapping = openalex_from_doi(data.doi.to_list()) n_openalex_ids = len( openalex_id for openalex_id in id_mapping.values() @@ -171,14 +175,9 @@ def snowballing( f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(id_mapping)}" " records. Performing snowballing for those records." 
) - data["openalex_id"] = [id_mapping[doi] for doi in data.df.doi] + data["openalex_id"] = [id_mapping[doi] for doi in data.doi] - if not use_all: - identifiers = data.df.loc[ - data.included & data.df.openalex_id.notna(), "openalex_id" - ].to_list() - else: - identifiers = data.df["openalex_id"].dropna().to_list() + identifiers = data.df["openalex_id"].dropna().to_list() if email is not None: pyalex.config.email = email From 8969ad89dd6adb4222c7970b0db23b526c5f6be6 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 07:19:46 +0100 Subject: [PATCH 06/19] Add docstring --- asreviewcontrib/datatools/snowballing.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index d72dc3e..875d017 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -149,6 +149,33 @@ def snowballing( use_all: bool = False, email: str = None, ) -> None: + """Perform snowballing on an ASReview dataset. + + Parameters + ---------- + input_path : Path + Location of the input ASReview dataset. + output_path : Path + Location where to save the output dataset. + forward : bool + Perform forward snowballing. At least one of `forward` or `backward` should be + True. + backward : bool + Perform backward snowballing. At least one of `forward` or `backward` should be + True. + use_all : bool, optional + Perform snowballing on all records in the dataset or only the included + records, by default False + email : str, optional + Email address to send along with request to OpenAlex, by default None + + Raises + ------ + ValueError + If `forward` and `backward` are both False. + ValueError + If the dataset contains no column name `openalex_id` and no column names `doi`. 
+ """ if not (forward or backward): raise ValueError("At least one of 'forward' or 'backward' should be True.") From 54d44492314f2562b3a6351821e4d84b454af37e Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 09:28:19 +0100 Subject: [PATCH 07/19] Fix bugs --- asreviewcontrib/datatools/snowballing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 875d017..45a256a 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -79,7 +79,7 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: pyalex.Works() .filter(openalex=fltr) .select("id,referenced_works") - .paginate(per_page=page_length) + .get(per_page=page_length) ): referenced_works[work["id"]] = work["referenced_works"] @@ -193,18 +193,18 @@ def snowballing( " identifiers or a column 'doi' containing DOIs." ) id_mapping = openalex_from_doi(data.doi.to_list()) - n_openalex_ids = len( + n_openalex_ids = len([ openalex_id for openalex_id in id_mapping.values() if openalex_id is not None - ) + ]) print( f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(id_mapping)}" " records. Performing snowballing for those records." 
) data["openalex_id"] = [id_mapping[doi] for doi in data.doi] - identifiers = data.df["openalex_id"].dropna().to_list() + identifiers = data["openalex_id"].dropna().to_list() if email is not None: pyalex.config.email = email From 0b64ef7ae138f49e35d92b507d6eb9141af330d6 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 09:52:58 +0100 Subject: [PATCH 08/19] Improve argparse --- asreviewcontrib/datatools/snowballing.py | 29 +++++++++++++++--------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 45a256a..7d2a5f3 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -193,11 +193,13 @@ def snowballing( " identifiers or a column 'doi' containing DOIs." ) id_mapping = openalex_from_doi(data.doi.to_list()) - n_openalex_ids = len([ - openalex_id - for openalex_id in id_mapping.values() - if openalex_id is not None - ]) + n_openalex_ids = len( + [ + openalex_id + for openalex_id in id_mapping.values() + if openalex_id is not None + ] + ) print( f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(id_mapping)}" " records. Performing snowballing for those records." @@ -238,19 +240,24 @@ def _parse_arguments_snowballing(): parser.add_argument( "output_path", type=str, help="The file path of the output dataset." ) - parser.add_argument("--forward", type=bool, help="Do forward snowballing.") - parser.add_argument("--backward", type=bool, help="Do backward snowballing.") parser.add_argument( - "--use_all", - type=bool, - default=False, - required=False, + "--forward", "-f", action="store_true", help="Do forward snowballing." + ) + parser.add_argument( + "--backward", "-b", action="store_true", help="Do backward snowballing." + ) + parser.add_argument( + "--all", + "-a", + action="store_true", + dest="use_all", help=( "Do snowballing on all records in the dataset, not just the included ones." 
), ) parser.add_argument( "--email", + "-e", type=str, required=False, help=( From 6eb85a95fb2d2a01bcfee88fb5307e5ab1f6df15 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 10:29:36 +0100 Subject: [PATCH 09/19] Fix bug --- asreviewcontrib/datatools/snowballing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 7d2a5f3..afe0f8f 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -183,7 +183,7 @@ def snowballing( if use_all: data = data.df else: - data = data.df.loc[data.included] + data = data.df.loc[data.included.astype(bool)] # Add OpenAlex identifiers if not available. if "openalex_id" not in data.columns: From 1029dbbb1459c6dad7a1850fe3e38e16b857a0aa Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 7 Mar 2024 10:49:16 +0100 Subject: [PATCH 10/19] Deal with missing DOIs --- asreviewcontrib/datatools/snowballing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index afe0f8f..3fb0159 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -192,7 +192,7 @@ def snowballing( "Dataset should contain a column 'openalex_id' containing OpenAlex" " identifiers or a column 'doi' containing DOIs." ) - id_mapping = openalex_from_doi(data.doi.to_list()) + id_mapping = openalex_from_doi(data.doi.dropna().to_list()) n_openalex_ids = len( [ openalex_id @@ -201,10 +201,13 @@ def snowballing( ] ) print( - f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(id_mapping)}" + f"Found OpenAlex identifiers for {n_openalex_ids} out of {len(data)}" " records. Performing snowballing for those records." 
) - data["openalex_id"] = [id_mapping[doi] for doi in data.doi] + data["openalex_id"] = None + data.loc[data.doi.notna(), "openalex_id"] = data.loc[ + data.doi.notna(), "doi" + ].apply(lambda doi: id_mapping[doi]) identifiers = data["openalex_id"].dropna().to_list() From 13ad5159a59ba21229f40d04e881189e3b24a850 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 14 Mar 2024 10:07:11 +0100 Subject: [PATCH 11/19] Fix backwards output format and long URL bug --- asreviewcontrib/datatools/entrypoint.py | 1 - asreviewcontrib/datatools/snowballing.py | 35 ++++++++++++++++++++---- tests/__init__.py | 0 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 tests/__init__.py diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index 0a9c517..acad1f2 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -29,7 +29,6 @@ def __init__(self): self.version = __version__ def execute(self, argv): - print(argv) if len(argv) > 1 and argv[0] in DATATOOLS: if argv[0] == "describe": args_describe_parser = _parse_arguments_describe() diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 3fb0159..2d14071 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -73,8 +73,15 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: # Get the referenced works. referenced_works = {} page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) + OPENALEX_PREFIX = "https://openalex.org/" + for i in range(0, len(identifiers), page_length): - fltr = "|".join(identifiers[i : i + page_length]) + print(f"Getting works citing records {i}-{i+page_length}") + # We need to remove the prefix here because otherwise the URL is too long. 
+ fltr = "|".join( + identifier.removeprefix(OPENALEX_PREFIX) + for identifier in identifiers[i : i + page_length] + ) for work in ( pyalex.Works() .filter(openalex=fltr) @@ -87,10 +94,16 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: all_identifiers = [] for reference_list in referenced_works.values(): all_identifiers += reference_list + all_identifiers = list(set(all_identifiers)) + print(f"Found {len(all_identifiers)} records") all_referenced_works = {} for i in range(0, len(all_identifiers), page_length): - fltr = "|".join(all_identifiers[i : i + page_length]) + # We need to remove the prefix here because otherwise the URL is too long. + fltr = "|".join( + identifier.removeprefix(OPENALEX_PREFIX) + for identifier in all_identifiers[i : i + page_length] + ) for work in ( pyalex.Works() .filter(openalex=fltr) @@ -106,11 +119,19 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: } # Connect the referenced works back to the input works. + output = {} for identifier, ref_id_list in referenced_works.items(): - referenced_works[identifier] = [ - all_referenced_works[ref_id] for ref_id in ref_id_list + # We need the last check if 'ref_id' is in 'all_referenced_works': If a work + # references an ID that redirects to another ID, it won't be present here. 
+ # Example: https://openalex.org/W2015370450 has in the references the identifier + # https://openalex.org/W2008744335, but this redirects to + # https://openalex.org/W4233569835 + output[identifier] = [ + all_referenced_works[ref_id] + for ref_id in ref_id_list + if ref_id in all_referenced_works ] - return referenced_works + return output def openalex_from_doi(dois: list[str]) -> dict[str, str]: @@ -215,10 +236,12 @@ def snowballing( pyalex.config.email = email if forward: + print("Starting forward snowballing") forward_data = forward_snowballing(identifiers) else: forward_data = {} if backward: + print("Starting backward snowballing") backward_data = backward_snowballing(identifiers) else: backward_data = {} @@ -228,11 +251,13 @@ def snowballing( all_works += works_list for works_list in backward_data.values(): all_works += works_list + output_data = pd.DataFrame(all_works) output_data.drop_duplicates(subset=["id"], inplace=True) output_data.rename({"id": "openalex_id"}, axis=1, inplace=True) output_data = ASReviewData(output_data) output_data.to_file(output_path) + print("Saved dataset") def _parse_arguments_snowballing(): diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 27112ec43e70f72051a0e66e3407ee2c28658e00 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 14 Mar 2024 14:04:09 +0100 Subject: [PATCH 12/19] Add tests and improve pagination --- asreviewcontrib/datatools/snowballing.py | 31 ++++++----- tests/demo_data/snowballing_doi.csv | 3 ++ tests/demo_data/snowballing_openalex.csv | 3 ++ tests/test_snowballing.py | 68 ++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 13 deletions(-) create mode 100644 tests/demo_data/snowballing_doi.csv create mode 100644 tests/demo_data/snowballing_openalex.csv diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index 2d14071..fd9303a 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ 
b/asreviewcontrib/datatools/snowballing.py @@ -37,20 +37,25 @@ def forward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: """ citing_works = {} for idx, openalex_id in enumerate(identifiers): - print(f"{idx}. Getting cited works for {openalex_id}") - works_citing_id = ( - pyalex.Works().filter(cites=openalex_id).select(USED_FIELDS).get() + print(f"{idx}. Getting works citing {openalex_id}") + pager = ( + pyalex.Works() + .filter(cites=openalex_id) + .select(USED_FIELDS) + .paginate(per_page=OPENALEX_MAX_PAGE_LENGTH, n_max=None) ) - citing_works[openalex_id] = [ - { - key: work[key] - for key in [ - col if col != "abstract_inverted_index" else "abstract" - for col in USED_FIELDS - ] - } - for work in works_citing_id - ] + citing_works[openalex_id] = [] + for page in pager: + citing_works[openalex_id] += [ + { + key: work[key] + for key in [ + col if col != "abstract_inverted_index" else "abstract" + for col in USED_FIELDS + ] + } + for work in page + ] return citing_works diff --git a/tests/demo_data/snowballing_doi.csv b/tests/demo_data/snowballing_doi.csv new file mode 100644 index 0000000..b9b24cf --- /dev/null +++ b/tests/demo_data/snowballing_doi.csv @@ -0,0 +1,3 @@ +,title,doi,included +0,"Social Networks Analysis: Tools, Measures and Visualization",https://doi.org/10.1007/978-1-4471-4054-2_1,1 +1,"Genome-wide Association Study of Alcohol Dependence",https://doi.org/10.1001/archgenpsychiatry.2009.83,0 diff --git a/tests/demo_data/snowballing_openalex.csv b/tests/demo_data/snowballing_openalex.csv new file mode 100644 index 0000000..2a30570 --- /dev/null +++ b/tests/demo_data/snowballing_openalex.csv @@ -0,0 +1,3 @@ +,openalex_id,title,included +0,https://openalex.org/W2234238252,"Social Networks Analysis: Tools, Measures and Visualization",1 +1,https://openalex.org/W1977467968,"Genome-wide Association Study of Alcohol Dependence",0 diff --git a/tests/test_snowballing.py b/tests/test_snowballing.py index 949cfbb..f9becb3 100644 --- 
a/tests/test_snowballing.py +++ b/tests/test_snowballing.py @@ -1,9 +1,16 @@ +from pathlib import Path + +import pandas as pd + from asreviewcontrib.datatools.snowballing import ( backward_snowballing, forward_snowballing, openalex_from_doi, + snowballing, ) +INPUT_DIR = Path(__file__).parent / "demo_data" + def test_openalex_from_doi(): dois = [ @@ -49,3 +56,64 @@ def test_forward_snowballing(): assert "https://openalex.org/W2124637492" in [ field_dict["id"] for field_dict in forwards_citations[identifiers[1]] ] + + +def test_openalex_id_forward(tmpdir): + out_fp = Path(tmpdir, "forward_all.csv") + snowballing( + input_path=INPUT_DIR / "snowballing_openalex.csv", + output_path=out_fp, + forward=True, + backward=False, + use_all=False, + ) + df = pd.read_csv(out_fp) + assert len(df) >= 23 + + all_out_fp = Path(tmpdir, "forward_all.csv") + snowballing( + input_path=INPUT_DIR / "snowballing_openalex.csv", + output_path=all_out_fp, + forward=True, + backward=False, + use_all=True, + ) + df_all = pd.read_csv(all_out_fp) + assert len(df_all) >= 387 + + +def test_openalex_id_backward(tmpdir): + out_fp = Path(tmpdir, "forward_all.csv") + snowballing( + input_path=INPUT_DIR / "snowballing_openalex.csv", + output_path=out_fp, + forward=False, + backward=True, + use_all=False, + ) + df = pd.read_csv(out_fp) + assert len(df) == 31 + + all_out_fp = Path(tmpdir, "backward_all.csv") + snowballing( + input_path=INPUT_DIR / "snowballing_openalex.csv", + output_path=all_out_fp, + forward=False, + backward=True, + use_all=True, + ) + df_all = pd.read_csv(all_out_fp) + assert len(df_all) == 117 + + +def test_snowballing_from_doi(tmpdir): + out_fp = Path(tmpdir, "doi_all.csv") + snowballing( + input_path=INPUT_DIR / "snowballing_doi.csv", + output_path=out_fp, + forward=False, + backward=True, + use_all=True, + ) + df = pd.read_csv(out_fp) + assert len(df) == 117 From d046509d675d358295b8fd1cc3b8e3cf35700e30 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 14 Mar 2024 14:09:56 
+0100 Subject: [PATCH 13/19] Linting --- asreviewcontrib/datatools/snowballing.py | 3 ++- tests/test_snowballing.py | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowballing.py index fd9303a..eff60cc 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowballing.py @@ -3,7 +3,8 @@ import pandas as pd import pyalex -from asreview import ASReviewData, load_data +from asreview import ASReviewData +from asreview import load_data # Maximum number of statements joined by a logical OR in a call to OpenAlex. OPENALEX_MAX_OR_LENGTH = 100 diff --git a/tests/test_snowballing.py b/tests/test_snowballing.py index f9becb3..29ee84f 100644 --- a/tests/test_snowballing.py +++ b/tests/test_snowballing.py @@ -2,12 +2,10 @@ import pandas as pd -from asreviewcontrib.datatools.snowballing import ( - backward_snowballing, - forward_snowballing, - openalex_from_doi, - snowballing, -) +from asreviewcontrib.datatools.snowballing import backward_snowballing +from asreviewcontrib.datatools.snowballing import forward_snowballing +from asreviewcontrib.datatools.snowballing import openalex_from_doi +from asreviewcontrib.datatools.snowballing import snowballing INPUT_DIR = Path(__file__).parent / "demo_data" From 08e15decf4f449d4148ce3ad447d57f7b9391240 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 14 Mar 2024 14:34:53 +0100 Subject: [PATCH 14/19] Add info to README --- README.md | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e8b3009..96a2494 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ LAB](https://github.com/asreview/asreview) that can be used to: - [**Convert**](#data-convert) file formats - [**Deduplicate**](#data-dedup) data - [**Stack**](#data-vstack-experimental) multiple datasets -- [**Compose**](#data-compose-experimental) a single (labeled, partly 
labeled, or unlabeled) dataset from multiple datasets. +- [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets +- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations. Several [tutorials](Tutorials.md) are available that show how `ASReview-Datatools` can be used in different scenarios. @@ -249,6 +250,38 @@ duplicate ambiguously labeled records exist: unlabeled is prioritized over irrelevant and relevant labels, and irrelevant labels are prioritized over relevant labels. +## Snowball + +ASReview Datatools supports snowballing via the `asreview data snowball` subcommand. +It can perform both backwards (outgoing citations) and forwards (incoming citations) +snowballing. The tool works by searching the [OpenAlex](https://openalex.org/) database +for citation data. An example usage would be: + +```bash +asreview data snowball input_dataset.csv output_dataset.csv --forward +``` + +This performs forwards snowballing on `input_dataset.csv` and writes the results to +`output_dataset.csv`. For this to work it is necessary that the input dataset contains +a column with DOIs or a column called `openalex_id` containing OpenAlex work +identifiers. The output dataset will contain the columns `id`, `doi`, `title`, `abstract`, `referenced_works` and `publication_date`. In the case of forward snowballing it will +contain all works in OpenAlex that have a reference to one of the included works in the +input dataset. In the case of backward snowballing it will contain all works in OpenAlex +that are referenced by one of the included works of the input dataset. 
+ +If you want to find references for all records in your dataset, instead of just the included works, you can include the flag `--all`, so for example: + +```bash +asreview data snowball input_dataset.csv output_dataset.csv --backward --all +``` + +One thing to note is that OpenAlex will handle data requests faster if the sender sends along their email with the request (see [OpenAlex Polite Pool](https://docs.openalex.org/how-to-use-the-api/rate-limits-and-authentication#the-polite-pool +)), you can do this using the `--email` argument. An example would be: + +```bash +asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com +``` + ## License This extension is published under the [MIT license](/LICENSE). From 0c487f324b1546ae4ec4bb46e5b276da282e9a34 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 14 Mar 2024 14:38:18 +0100 Subject: [PATCH 15/19] Rename `snowballing` to `snowball` This aligns better with the other arguments. --- asreviewcontrib/datatools/entrypoint.py | 8 ++++---- .../datatools/{snowballing.py => snowball.py} | 4 ++-- .../{test_snowballing.py => test_snowball.py} | 18 +++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) rename asreviewcontrib/datatools/{snowballing.py => snowball.py} (99%) rename tests/{test_snowballing.py => test_snowball.py} (89%) diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index acad1f2..5026f7e 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -9,8 +9,8 @@ from asreviewcontrib.datatools.convert import convert from asreviewcontrib.datatools.describe import _parse_arguments_describe from asreviewcontrib.datatools.describe import describe -from asreviewcontrib.datatools.snowballing import _parse_arguments_snowballing -from asreviewcontrib.datatools.snowballing import snowballing +from asreviewcontrib.datatools.snowball import _parse_arguments_snowball +from 
asreviewcontrib.datatools.snowball import snowball from asreviewcontrib.datatools.stack import _parse_arguments_vstack from asreviewcontrib.datatools.stack import vstack @@ -98,9 +98,9 @@ def execute(self, argv): resolve=args_compose.conflict_resolve, ) if argv[0] == "snowballing": - args_snowballing_parser = _parse_arguments_snowballing() + args_snowballing_parser = _parse_arguments_snowball() args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:])) - snowballing(**args_snowballing) + snowball(**args_snowballing) if argv[0] == "vstack": args_vstack_parser = _parse_arguments_vstack() args_vstack = args_vstack_parser.parse_args(argv[1:]) diff --git a/asreviewcontrib/datatools/snowballing.py b/asreviewcontrib/datatools/snowball.py similarity index 99% rename from asreviewcontrib/datatools/snowballing.py rename to asreviewcontrib/datatools/snowball.py index eff60cc..2569961 100644 --- a/asreviewcontrib/datatools/snowballing.py +++ b/asreviewcontrib/datatools/snowball.py @@ -168,7 +168,7 @@ def openalex_from_doi(dois: list[str]) -> dict[str, str]: return id_mapping -def snowballing( +def snowball( input_path: Path, output_path: Path, forward: bool, @@ -266,7 +266,7 @@ def snowballing( print("Saved dataset") -def _parse_arguments_snowballing(): +def _parse_arguments_snowball(): parser = argparse.ArgumentParser(prog="asreview data snowballing") parser.add_argument( "input_path", type=str, help="The file path of the input dataset." 
diff --git a/tests/test_snowballing.py b/tests/test_snowball.py similarity index 89% rename from tests/test_snowballing.py rename to tests/test_snowball.py index 29ee84f..ed23716 100644 --- a/tests/test_snowballing.py +++ b/tests/test_snowball.py @@ -2,10 +2,10 @@ import pandas as pd -from asreviewcontrib.datatools.snowballing import backward_snowballing -from asreviewcontrib.datatools.snowballing import forward_snowballing -from asreviewcontrib.datatools.snowballing import openalex_from_doi -from asreviewcontrib.datatools.snowballing import snowballing +from asreviewcontrib.datatools.snowball import backward_snowballing +from asreviewcontrib.datatools.snowball import forward_snowballing +from asreviewcontrib.datatools.snowball import openalex_from_doi +from asreviewcontrib.datatools.snowball import snowball INPUT_DIR = Path(__file__).parent / "demo_data" @@ -58,7 +58,7 @@ def test_forward_snowballing(): def test_openalex_id_forward(tmpdir): out_fp = Path(tmpdir, "forward_all.csv") - snowballing( + snowball( input_path=INPUT_DIR / "snowballing_openalex.csv", output_path=out_fp, forward=True, @@ -69,7 +69,7 @@ def test_openalex_id_forward(tmpdir): assert len(df) >= 23 all_out_fp = Path(tmpdir, "forward_all.csv") - snowballing( + snowball( input_path=INPUT_DIR / "snowballing_openalex.csv", output_path=all_out_fp, forward=True, @@ -82,7 +82,7 @@ def test_openalex_id_forward(tmpdir): def test_openalex_id_backward(tmpdir): out_fp = Path(tmpdir, "forward_all.csv") - snowballing( + snowball( input_path=INPUT_DIR / "snowballing_openalex.csv", output_path=out_fp, forward=False, @@ -93,7 +93,7 @@ def test_openalex_id_backward(tmpdir): assert len(df) == 31 all_out_fp = Path(tmpdir, "backward_all.csv") - snowballing( + snowball( input_path=INPUT_DIR / "snowballing_openalex.csv", output_path=all_out_fp, forward=False, @@ -106,7 +106,7 @@ def test_openalex_id_backward(tmpdir): def test_snowballing_from_doi(tmpdir): out_fp = Path(tmpdir, "doi_all.csv") - snowballing( + snowball( 
input_path=INPUT_DIR / "snowballing_doi.csv", output_path=out_fp, forward=False, From e3cbbe29e5c53514e5abf4d5cc4688acde5204c7 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 21 Mar 2024 10:21:10 +0100 Subject: [PATCH 16/19] Fix bug after renaming snowballing to snowball --- asreviewcontrib/datatools/entrypoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index 5026f7e..86c83a0 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -14,7 +14,7 @@ from asreviewcontrib.datatools.stack import _parse_arguments_vstack from asreviewcontrib.datatools.stack import vstack -DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowballing"] +DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball"] class DataEntryPoint(BaseEntryPoint): @@ -97,7 +97,7 @@ def execute(self, argv): order=args_compose.hierarchy, resolve=args_compose.conflict_resolve, ) - if argv[0] == "snowballing": + if argv[0] == "snowball": args_snowballing_parser = _parse_arguments_snowball() args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:])) snowball(**args_snowballing) From dfeb81282a1a13e8082baa4db53d76944386a8c1 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 21 Mar 2024 11:01:47 +0100 Subject: [PATCH 17/19] Fix bug due to missing prefix of dois --- asreviewcontrib/datatools/snowball.py | 10 ++++++---- tests/test_snowball.py | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/asreviewcontrib/datatools/snowball.py b/asreviewcontrib/datatools/snowball.py index 2569961..27d44ee 100644 --- a/asreviewcontrib/datatools/snowball.py +++ b/asreviewcontrib/datatools/snowball.py @@ -9,6 +9,9 @@ # Maximum number of statements joined by a logical OR in a call to OpenAlex. 
OPENALEX_MAX_OR_LENGTH = 100 OPENALEX_MAX_PAGE_LENGTH = 200 +OPENALEX_PREFIX = "https://openalex.org/" +DOI_PREFIX = "https://doi.org/" + # OpenAlex data fields to retrieve. USED_FIELDS = [ "id", @@ -79,7 +82,6 @@ def backward_snowballing(identifiers: list[str]) -> dict[str, list[dict]]: # Get the referenced works. referenced_works = {} page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) - OPENALEX_PREFIX = "https://openalex.org/" for i in range(0, len(identifiers), page_length): print(f"Getting works citing records {i}-{i+page_length}") @@ -155,7 +157,7 @@ def openalex_from_doi(dois: list[str]) -> dict[str, str]: DOI, the corresponding value will be None. """ page_length = min(OPENALEX_MAX_OR_LENGTH, OPENALEX_MAX_PAGE_LENGTH) - id_mapping = {doi: None for doi in dois} + id_mapping = {doi.removeprefix(DOI_PREFIX): None for doi in dois} for i in range(0, len(dois), page_length): fltr = "|".join(dois[i : i + page_length]) for work in ( @@ -164,7 +166,7 @@ def openalex_from_doi(dois: list[str]) -> dict[str, str]: .select(["id", "doi"]) .get(per_page=page_length) ): - id_mapping[work["doi"]] = work["id"] + id_mapping[work["doi"].removeprefix(DOI_PREFIX)] = work["id"] return id_mapping @@ -234,7 +236,7 @@ def snowball( data["openalex_id"] = None data.loc[data.doi.notna(), "openalex_id"] = data.loc[ data.doi.notna(), "doi" - ].apply(lambda doi: id_mapping[doi]) + ].str.removeprefix(DOI_PREFIX).apply(lambda doi: id_mapping[doi]) identifiers = data["openalex_id"].dropna().to_list() diff --git a/tests/test_snowball.py b/tests/test_snowball.py index ed23716..eb840f8 100644 --- a/tests/test_snowball.py +++ b/tests/test_snowball.py @@ -18,8 +18,8 @@ def test_openalex_from_doi(): ] assert openalex_from_doi(dois) == { - "https://doi.org/10.1042/cs20220150": "https://openalex.org/W4386305682", - "https://doi.org/10.1042/bst20220734": "https://openalex.org/W4312006214", + "10.1042/cs20220150": "https://openalex.org/W4386305682", + "10.1042/bst20220734": 
"https://openalex.org/W4312006214", "not_a_doi": None, } From e3e355fa45ac11566ae775fb489d84eef9861086 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Mon, 25 Mar 2024 14:10:49 +0100 Subject: [PATCH 18/19] Fix type annotations for older python versions --- asreviewcontrib/datatools/snowball.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/asreviewcontrib/datatools/snowball.py b/asreviewcontrib/datatools/snowball.py index 27d44ee..5c2d018 100644 --- a/asreviewcontrib/datatools/snowball.py +++ b/asreviewcontrib/datatools/snowball.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import argparse from pathlib import Path From 255b803da7ff828cf1362d0b8d7a00580cd09212 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Mon, 25 Mar 2024 15:18:58 +0100 Subject: [PATCH 19/19] Handle no `included` data --- asreviewcontrib/datatools/snowball.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asreviewcontrib/datatools/snowball.py b/asreviewcontrib/datatools/snowball.py index 5c2d018..3108695 100644 --- a/asreviewcontrib/datatools/snowball.py +++ b/asreviewcontrib/datatools/snowball.py @@ -211,7 +211,7 @@ def snowball( raise ValueError("At least one of 'forward' or 'backward' should be True.") data = load_data(input_path) - if use_all: + if (use_all or (data.included is None)): data = data.df else: data = data.df.loc[data.included.astype(bool)]