Pubmed get_citations and get_citing_pmid
New methods for retrieving the list of citations from a CSV row of the PubMed dump and for getting the citing PMID from a META dictionary; a usage sketch of the two methods follows the pubmed_processing.py diff below.
ariannamorettj committed Apr 1, 2024
1 parent 548f77f commit 856aec6
Showing 5 changed files with 562 additions and 14 deletions.
42 changes: 41 additions & 1 deletion oc_ds_converter/pubmed/pubmed_processing.py
@@ -28,6 +28,7 @@ def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_file
        super(PubmedProcessing, self).__init__(orcid_index, doi_csv)
        self.nihrf = NIHResourceFinder()
        self.doi_m = DOIManager()
+       self.pmid_m = PMIDManager()
        if testing:
            self.BR_redis= fakeredis.FakeStrictRedis()
            self.RA_redis= fakeredis.FakeStrictRedis()
@@ -93,7 +94,7 @@ def prefix_to_publisher_to_cache(self, pref_pub_dict, path):
    def csv_creator(self, item: dict) -> dict:
        row = dict()
        doi = ""
-       pmid = PMIDManager().normalise(str(item['pmid']))
+       pmid = self.pmid_m.normalise(str(item['pmid']))
        if (pmid and self.doi_set and pmid in self.doi_set) or (pmid and not self.doi_set):
            # create empty row
            keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type',
@@ -608,6 +609,45 @@ def add_editors_to_agent_list(self, item: dict, ag_list: list) -> list:
        ''' NO INFO IN DUMP: to be updated with API DATA'''
        return agent_list

+   def get_citing_pmid(self, meta_dict: dict) -> str:
+       citing_pmid = ""
+       id_string = meta_dict.get("id")
+       if id_string:
+           id_list = id_string.split()
+           pmid_list = [x for x in id_list if x.startswith("pmid:")]
+           if len(pmid_list) == 1:
+               citing_pmid = pmid_list[0]  # we expect only one pmid for each entity
+       return citing_pmid
+
+   def get_citations(self, validated_pmid, item: dict) -> list:
+       addressed_citations = set()
+
+       citing = validated_pmid
+       if not citing or not citing.startswith("pmid:"):
+           try:
+               int_pmid = int(citing)
+               citing = "pmid:" + str(int_pmid)
+           except (TypeError, ValueError):
+               # the citing identifier is neither a prefixed pmid nor a bare integer
+               return []
+
+       references_string = item.get("references")
+       if not references_string:
+           # no references field in this row: no citations to extract
+           return []
+       cited_ids = references_string.split()
+
+       for cited_id in cited_ids:
+           try:
+               id_n = int(cited_id)
+           except (TypeError, ValueError):
+               # skip references that are not plain integer PMIDs
+               continue
+           if id_n:
+               norm_cited = self.pmid_m.normalise(str(id_n), include_prefix=True)
+               if norm_cited:
+                   addressed_citations.add((citing, norm_cited))
+
+       addressed_citations_list = list(addressed_citations)
+
+       return addressed_citations_list


    def get_best_match(self, target_agent_dict, report_dicts):
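Taken together, the two new methods cover both ends of a citation link: get_citing_pmid recovers the citing entity's PMID from a META dictionary whose "id" field may hold several space-separated identifiers, and get_citations pairs that PMID with every reference listed in a row of the PubMed dump. A minimal usage sketch (the example identifiers and the constructor arguments are illustrative, not taken from this commit):

processing = PubmedProcessing(testing=True)

# META dictionary: the "id" field holds space-separated identifiers
meta_dict = {"id": "doi:10.1000/example pmid:12345678"}
citing_pmid = processing.get_citing_pmid(meta_dict)  # "pmid:12345678"

# PubMed dump row: "references" is a space-separated list of cited PMIDs
row = {"pmid": "12345678", "references": "11111111 22222222"}
citations = processing.get_citations(citing_pmid, row)
# [("pmid:12345678", "pmid:11111111"), ("pmid:12345678", "pmid:22222222")]
# (the pairs come from a set, so their order is not guaranteed)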
4 changes: 1 addition & 3 deletions oc_ds_converter/run/crossref_process.py
@@ -64,16 +64,14 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat
    # create output dir for citation data
    preprocessed_citations_dir = csv_dir + "_citations"
    if not os.path.exists(preprocessed_citations_dir):
-       makedirs(preprocessed_citations_dir)
+       os.makedirs(preprocessed_citations_dir)

    if verbose:
        print(f'[INFO: crossref_process] Getting all files from {crossref_json_dir}')
    all_files, targz_fd = get_all_files_by_type(crossref_json_dir, ".json", cache)
    if verbose:
        pbar = tqdm(total=len(all_files))
-
-

    if not redis_storage_manager or max_workers == 1:
        for filename in all_files:
            # skip elements starting with ._
27 changes: 17 additions & 10 deletions oc_ds_converter/run/pubmed_process.py
@@ -31,7 +31,6 @@


def to_meta_file(cur_n, lines, interval, csv_dir):
-
    if int(cur_n) != 0 and int(cur_n) % int(interval) == 0:
        filename = "CSVFile_" + str(cur_n // interval)
        filepath = os.path.join(csv_dir, f'{os.path.basename(filename)}.csv')
@@ -52,7 +51,9 @@ def to_meta_file(cur_n, lines, interval, csv_dir):
    return lines


-def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:str, csv_dir:str, journals_filepath:str, wanted_doi_filepath:str=None, verbose:bool=False, interval = 1000, testing=True, cache: str = None) -> None:
+def preprocess(pubmed_csv_dir: str, publishers_filepath: str, orcid_doi_filepath: str, csv_dir: str,
+              journals_filepath: str, wanted_doi_filepath: str = None, verbose: bool = False, interval=1000,
+              testing=True, cache: str = None) -> None:
    if not interval:
        interval = 1000
    else:
@@ -63,7 +64,7 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s

    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
-
+   filter = ["pmid", "doi", "title", "authors", "year", "journal", "references"]
    if verbose:
        if publishers_filepath or orcid_doi_filepath or wanted_doi_filepath:
@@ -77,11 +78,12 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
        log = '[INFO: pubmed_process] Processing: ' + '; '.join(what)
        print(log)

-   pubmed_csv = PubmedProcessing(orcid_index=orcid_doi_filepath, doi_csv=wanted_doi_filepath, publishers_filepath_pubmed=publishers_filepath, journals_filepath=journals_filepath, testing=testing)
+   pubmed_csv = PubmedProcessing(orcid_index=orcid_doi_filepath, doi_csv=wanted_doi_filepath,
+                                 publishers_filepath_pubmed=publishers_filepath, journals_filepath=journals_filepath,
+                                 testing=testing)
    if verbose:
        print(f'[INFO: pubmed_process] Getting all files from {pubmed_csv_dir}')

-
    all_files, targz_fd = get_all_files_by_type(pubmed_csv_dir, ".csv")
    lines = []
    count = 0
@@ -93,7 +95,7 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
            f.write('0')
        with open(cache, 'r', encoding='utf8') as f:
            count = f.read().splitlines()[0]
-   dtype={'pmid': str, 'doi': str, 'title': str, 'authors': str, 'year': str, 'journal': str, 'references': str}
+   dtype = {'pmid': str, 'doi': str, 'title': str, 'authors': str, 'year': str, 'journal': str, 'references': str}
    for file in all_files:
        chunksize = 100000
        with open(file, 'r', encoding='utf8') as f:
@@ -126,12 +128,14 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
    pubmed_csv.save_updated_pref_publishers_map()


-def pathoo(path:str) -> None:
+def pathoo(path: str) -> None:
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))


if __name__ == '__main__':
-   arg_parser = ArgumentParser('pubmed_process.py', description='This script creates meta CSV files from pubmed preprocessed dump, enriching data through of a DOI-ORCID index')
+   arg_parser = ArgumentParser('pubmed_process.py',
+                               description='This script creates meta CSV files from pubmed preprocessed dump, enriching data through of a DOI-ORCID index')
    arg_parser.add_argument('-c', '--config', dest='config', required=False,
                            help='Configuration file path')
    required = not any(arg in sys.argv for arg in {'--config', '-c'})
@@ -149,7 +153,7 @@ def pathoo(path:str) -> None:
                            help='A CSV filepath containing what DOI to process, not mandatory')
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', required=False,
                            help='Show a loading bar, elapsed time and estimated time')
-   arg_parser.add_argument('-int', '--interval', dest='interval',type=int, required=False, default=1000,
+   arg_parser.add_argument('-int', '--interval', dest='interval', type=int, required=False, default=1000,
                            help='int number of lines for each output csv. If nothing is declared, the default is 1000')
    arg_parser.add_argument('-t', '--testing', dest='testing', action='store_true', required=False,
                            help='testing flag to define what to use for data validation (fakeredis instance or real redis DB)')
@@ -177,4 +181,7 @@ def pathoo(path:str) -> None:
    verbose = settings['verbose'] if settings else args.verbose
    testing = settings['testing'] if settings else args.testing
    print("Data Preprocessing Phase: started")
-   preprocess(pubmed_csv_dir=pubmed_csv_dir, publishers_filepath=publishers_filepath, journals_filepath=journals_filepath, orcid_doi_filepath=orcid_doi_filepath, csv_dir=csv_dir, wanted_doi_filepath=wanted_doi_filepath, verbose=verbose, interval=interval, testing=testing, cache=args.cache)
+   preprocess(pubmed_csv_dir=pubmed_csv_dir, publishers_filepath=publishers_filepath,
+              journals_filepath=journals_filepath, orcid_doi_filepath=orcid_doi_filepath, csv_dir=csv_dir,
+              wanted_doi_filepath=wanted_doi_filepath, verbose=verbose, interval=interval, testing=testing,
+              cache=args.cache)