Pubmed get_citations and get_citing_pmid
New methods for retrieving the list of citations from a CSV row of the PubMed dump and for getting the citing PMID from a META dictionary; a usage sketch of the two methods follows the pubmed_processing.py diff below.
ariannamorettj committed Apr 1, 2024
1 parent 548f77f commit 856aec6
Showing 5 changed files with 562 additions and 14 deletions.
42 changes: 41 additions & 1 deletion oc_ds_converter/pubmed/pubmed_processing.py
@@ -28,6 +28,7 @@ def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_file
        super(PubmedProcessing, self).__init__(orcid_index, doi_csv)
        self.nihrf = NIHResourceFinder()
        self.doi_m = DOIManager()
+       self.pmid_m = PMIDManager()
        if testing:
            self.BR_redis= fakeredis.FakeStrictRedis()
            self.RA_redis= fakeredis.FakeStrictRedis()
@@ -93,7 +94,7 @@ def prefix_to_publisher_to_cache(self, pref_pub_dict, path):
    def csv_creator(self, item: dict) -> dict:
        row = dict()
        doi = ""
-       pmid = PMIDManager().normalise(str(item['pmid']))
+       pmid = self.pmid_m.normalise(str(item['pmid']))
        if (pmid and self.doi_set and pmid in self.doi_set) or (pmid and not self.doi_set):
            # create empty row
            keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type',
@@ -608,6 +609,45 @@ def add_editors_to_agent_list(self, item: dict, ag_list: list) -> list:
        ''' NO INFO IN DUMP: to be updated with API DATA'''
        return agent_list

+   def get_citing_pmid(self, meta_dict: dict) -> str:
+       citing_pmid = ""
+       id_string = meta_dict.get("id")
+       if id_string:
+           id_list = id_string.split()
+           pmid_list = [x for x in id_list if x.startswith("pmid:")]
+           if len(pmid_list) == 1:
+               citing_pmid = pmid_list[0]  # we expect only one pmid for each entity
+       return citing_pmid
+
+   def get_citations(self, validated_pmid, item: dict) -> list:
+       addressed_citations = set()
+
+       citing = validated_pmid
+       if not citing or not citing.startswith("pmid:"):
+           try:
+               int_pmid = int(citing)
+               citing = "pmid:" + str(int_pmid)
+           except (TypeError, ValueError):
+               # the citing identifier is neither a prefixed pmid nor a bare integer
+               return []
+
+       references_string = item.get("references")
+       if not references_string:
+           # no references field in this row: no citations to extract
+           return []
+       cited_ids = references_string.split()
+
+       for cited_id in cited_ids:
+           try:
+               id_n = int(cited_id)
+           except (TypeError, ValueError):
+               # skip references that are not plain integer PMIDs
+               continue
+           if id_n:
+               norm_cited = self.pmid_m.normalise(str(id_n), include_prefix=True)
+               if norm_cited:
+                   addressed_citations.add((citing, norm_cited))
+
+       addressed_citations_list = list(addressed_citations)
+
+       return addressed_citations_list


    def get_best_match(self, target_agent_dict, report_dicts):
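Taken together, the two new methods cover both ends of a citation link: get_citing_pmid recovers the citing entity's PMID from a META dictionary whose "id" field may hold several space-separated identifiers, and get_citations pairs that PMID with every reference listed in a row of the PubMed dump. A minimal usage sketch (the example identifiers and the constructor arguments are illustrative, not taken from this commit):

processing = PubmedProcessing(testing=True)

# META dictionary: the "id" field holds space-separated identifiers
meta_dict = {"id": "doi:10.1000/example pmid:12345678"}
citing_pmid = processing.get_citing_pmid(meta_dict)  # "pmid:12345678"

# PubMed dump row: "references" is a space-separated list of cited PMIDs
row = {"pmid": "12345678", "references": "11111111 22222222"}
citations = processing.get_citations(citing_pmid, row)
# [("pmid:12345678", "pmid:11111111"), ("pmid:12345678", "pmid:22222222")]
# (the pairs come from a set, so their order is not guaranteed)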
4 changes: 1 addition & 3 deletions oc_ds_converter/run/crossref_process.py
@@ -64,16 +64,14 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat
    # create output dir for citation data
    preprocessed_citations_dir = csv_dir + "_citations"
    if not os.path.exists(preprocessed_citations_dir):
-       makedirs(preprocessed_citations_dir)
+       os.makedirs(preprocessed_citations_dir)

    if verbose:
        print(f'[INFO: crossref_process] Getting all files from {crossref_json_dir}')
    all_files, targz_fd = get_all_files_by_type(crossref_json_dir, ".json", cache)
    if verbose:
        pbar = tqdm(total=len(all_files))
-
-

    if not redis_storage_manager or max_workers == 1:
        for filename in all_files:
            # skip elements starting with ._
27 changes: 17 additions & 10 deletions oc_ds_converter/run/pubmed_process.py
@@ -31,7 +31,6 @@


def to_meta_file(cur_n, lines, interval, csv_dir):
-
    if int(cur_n) != 0 and int(cur_n) % int(interval) == 0:
        filename = "CSVFile_" + str(cur_n // interval)
        filepath = os.path.join(csv_dir, f'{os.path.basename(filename)}.csv')
@@ -52,7 +51,9 @@ def to_meta_file(cur_n, lines, interval, csv_dir):
    return lines


-def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:str, csv_dir:str, journals_filepath:str, wanted_doi_filepath:str=None, verbose:bool=False, interval = 1000, testing=True, cache: str = None) -> None:
+def preprocess(pubmed_csv_dir: str, publishers_filepath: str, orcid_doi_filepath: str, csv_dir: str,
+              journals_filepath: str, wanted_doi_filepath: str = None, verbose: bool = False, interval=1000,
+              testing=True, cache: str = None) -> None:
    if not interval:
        interval = 1000
    else:
@@ -63,7 +64,7 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s

    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
-
+   filter = ["pmid", "doi", "title", "authors", "year", "journal", "references"]
    if verbose:
        if publishers_filepath or orcid_doi_filepath or wanted_doi_filepath:
@@ -77,11 +78,12 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
        log = '[INFO: pubmed_process] Processing: ' + '; '.join(what)
        print(log)

-   pubmed_csv = PubmedProcessing(orcid_index=orcid_doi_filepath, doi_csv=wanted_doi_filepath, publishers_filepath_pubmed=publishers_filepath, journals_filepath=journals_filepath, testing=testing)
+   pubmed_csv = PubmedProcessing(orcid_index=orcid_doi_filepath, doi_csv=wanted_doi_filepath,
+                                 publishers_filepath_pubmed=publishers_filepath, journals_filepath=journals_filepath,
+                                 testing=testing)
    if verbose:
        print(f'[INFO: pubmed_process] Getting all files from {pubmed_csv_dir}')

-
    all_files, targz_fd = get_all_files_by_type(pubmed_csv_dir, ".csv")
    lines = []
    count = 0
@@ -93,7 +95,7 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
            f.write('0')
        with open(cache, 'r', encoding='utf8') as f:
            count = f.read().splitlines()[0]
-   dtype={'pmid': str, 'doi': str, 'title': str, 'authors': str, 'year': str, 'journal': str, 'references': str}
+   dtype = {'pmid': str, 'doi': str, 'title': str, 'authors': str, 'year': str, 'journal': str, 'references': str}
    for file in all_files:
        chunksize = 100000
        with open(file, 'r', encoding='utf8') as f:
@@ -126,12 +128,14 @@ def preprocess(pubmed_csv_dir:str, publishers_filepath:str, orcid_doi_filepath:s
    pubmed_csv.save_updated_pref_publishers_map()


-def pathoo(path:str) -> None:
+def pathoo(path: str) -> None:
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))


if __name__ == '__main__':
-   arg_parser = ArgumentParser('pubmed_process.py', description='This script creates meta CSV files from pubmed preprocessed dump, enriching data through of a DOI-ORCID index')
+   arg_parser = ArgumentParser('pubmed_process.py',
+                               description='This script creates meta CSV files from pubmed preprocessed dump, enriching data through of a DOI-ORCID index')
    arg_parser.add_argument('-c', '--config', dest='config', required=False,
                            help='Configuration file path')
    required = not any(arg in sys.argv for arg in {'--config', '-c'})
@@ -149,7 +153,7 @@ def pathoo(path:str) -> None:
                            help='A CSV filepath containing what DOI to process, not mandatory')
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', required=False,
                            help='Show a loading bar, elapsed time and estimated time')
-   arg_parser.add_argument('-int', '--interval', dest='interval',type=int, required=False, default=1000,
+   arg_parser.add_argument('-int', '--interval', dest='interval', type=int, required=False, default=1000,
                            help='int number of lines for each output csv. If nothing is declared, the default is 1000')
    arg_parser.add_argument('-t', '--testing', dest='testing', action='store_true', required=False,
                            help='testing flag to define what to use for data validation (fakeredis instance or real redis DB)')
@@ -177,4 +181,7 @@ def pathoo(path:str) -> None:
    verbose = settings['verbose'] if settings else args.verbose
    testing = settings['testing'] if settings else args.testing
    print("Data Preprocessing Phase: started")
-   preprocess(pubmed_csv_dir=pubmed_csv_dir, publishers_filepath=publishers_filepath, journals_filepath=journals_filepath, orcid_doi_filepath=orcid_doi_filepath, csv_dir=csv_dir, wanted_doi_filepath=wanted_doi_filepath, verbose=verbose, interval=interval, testing=testing, cache=args.cache)
+   preprocess(pubmed_csv_dir=pubmed_csv_dir, publishers_filepath=publishers_filepath,
+              journals_filepath=journals_filepath, orcid_doi_filepath=orcid_doi_filepath, csv_dir=csv_dir,
+              wanted_doi_filepath=wanted_doi_filepath, verbose=verbose, interval=interval, testing=testing,
+              cache=args.cache)