Commit
include patient observations in specimen df
bwalsh committed Feb 12, 2025
1 parent 0797a24 commit cdf2fe5
Showing 2 changed files with 46 additions and 39 deletions.
18 changes: 5 additions & 13 deletions fhir_query/cli.py
@@ -162,6 +162,7 @@ def visualize(db_path: str, output_path: str, ignored_edges: list[str]) -> None:
try:
db = ResourceDB(db_path=db_path)
visualize_aggregation(db.aggregate(ignored_edges), output_path)
click.echo(f"Wrote: {output_path}", file=sys.stderr)
except Exception as e:
logging.error(f"Error: {e}", exc_info=True)
click.echo(f"Error: {e}", file=sys.stderr)
@@ -210,15 +211,11 @@ def dataframe(db_path: str, output_path: str, launch_dtale: bool, data_type: str
# TODO - add more data types - including condition
assert data_type in ["Specimen", "Patient"], f"Sorry {data_type} dataframe is not supported yet."

patient_df = None
specimen_df = None
file_name = None
df = None
if data_type == "Specimen":
specimen_df = pd.DataFrame(db.flattened_specimens())
df = pd.DataFrame(db.flattened_specimens())
if data_type == "Patient":
patient_df = pd.DataFrame(db.flattened_patients())

df = pd.DataFrame(db.flattened_specimens())
df = pd.DataFrame(db.flattened_patients())

if launch_dtale:
# TODO - add check that dtale is installed
@@ -228,14 +225,9 @@ def dataframe(db_path: str, output_path: str, launch_dtale: bool, data_type: str
else:
# export to csv
file_name = output_path if output_path else f"{data_type}.csv"

if data_type == "Patient" and patient_df and file_name:
patient_df.to_csv(file_name, index=False)
df.to_csv(file_name, index=False)
click.secho(f"Saved {file_name}", file=sys.stderr)

df.to_csv(output_path, index=False)
click.secho(f"Saved {output_path}", file=sys.stderr)

except Exception as e:
logging.error(f"Error: {e}", exc_info=True)
click.echo(f"Error: {e}", file=sys.stderr)
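For context, the dataframe command now builds a single df for the selected data type and writes it to one CSV path. A minimal sketch of the consolidated flow implied by the diff (not the exact file contents; export_dataframe is a hypothetical helper name):

import sys

import click
import pandas as pd


def export_dataframe(db, data_type: str, output_path: str | None) -> str:
    # Sketch only: assumes db exposes flattened_specimens() / flattened_patients()
    # as shown in the diff above.
    assert data_type in ["Specimen", "Patient"], f"Sorry {data_type} dataframe is not supported yet."
    if data_type == "Specimen":
        df = pd.DataFrame(db.flattened_specimens())
    else:
        df = pd.DataFrame(db.flattened_patients())
    file_name = output_path if output_path else f"{data_type}.csv"
    df.to_csv(file_name, index=False)
    click.secho(f"Saved {file_name}", file=sys.stderr)
    return file_name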
67 changes: 41 additions & 26 deletions fhir_query/dataframer.py
@@ -295,6 +295,10 @@ def values(self) -> dict:
# update the key if code information is available
if self.resource.get("code", {}).get("text", None):
source = self.resource["code"]["text"]
else:
source = self.resource["code"]["coding"][0].get("display", self.resource["code"]["coding"][0].get("code"))

source = inflection.underscore(inflection.parameterize(source))
return {source: value}
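The new else branch falls back to the first coding's display (or, failing that, its code) when code.text is absent, and the chosen label is then snake_cased with inflection. A small sketch of that key derivation, assuming the inflection package and an Observation-like dict (observation_key is a hypothetical helper name):

import inflection


def observation_key(resource: dict) -> str:
    # Prefer code.text; otherwise fall back to the first coding's display, then its code.
    code = resource.get("code", {})
    if code.get("text"):
        source = code["text"]
    else:
        coding = code["coding"][0]
        source = coding.get("display", coding.get("code"))
    # e.g. "Percent Tumor" -> "percent-tumor" -> "percent_tumor"
    return inflection.underscore(inflection.parameterize(source))


assert observation_key({"code": {"text": "Percent Tumor"}}) == "percent_tumor"
assert observation_key({"code": {"coding": [{"code": "example", "display": "Stage group"}]}}) == "stage_group"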


@@ -329,7 +333,6 @@ def values(self) -> dict:

# get top-level value in dict if it exists
_values = super().values

if len(_values) == 0:
assert "component" in self.resource, "no component nor top-level value found"

@@ -343,28 +346,30 @@ def values(self) -> dict:
continue
_values[source] = value

# knowing there's now at least 1 item in _values
if "component" in self.resource:
# ensure the top-level value is not duplicating a component code value
# TODO: ensure this value_key corresponds to percent_tumor on some runs due to getting display
value_key = [k for k in _values][0]
assert (
value_key not in self.resource["component"]
), """duplicate code value found, only specify the code value in the component, see Rule obs-7
https://build.fhir.org/observation.html#invs"""

# get component codes
# knowing there's now at least 1 item in _values
if "component" in self.resource:
for component in self.resource["component"]:
value, source = normalize_value(component)
if component.get("code", {}).get("text", None):
source = component["code"]["text"]
if not value:
continue
_values[source] = value
# ensure the top-level value is not duplicating a component code value
# TODO: ensure this value_key corresponds to percent_tumor on some runs due to getting display
value_key = [k for k in _values][0]
assert (
value_key not in self.resource["component"]
), """duplicate code value found, only specify the code value in the component, see Rule obs-7
https://build.fhir.org/observation.html#invs"""

# get component codes
if "component" in self.resource:
for component in self.resource["component"]:
value, source = normalize_value(component)
if component.get("code", {}).get("text", None):
source = component["code"]["text"]
if not value:
continue
_values[source] = value

if "code" in self.resource and "text" in self.resource["code"]:
_values["observation_code"] = self.resource["code"]["text"]

assert not [_ for _ in _values.keys() if _.startswith("value")], f"key misnamed {_values}\n {self.resource}"
assert len(_values) > 0, f"no values found in Observation: {self.resource}"

return _values
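Taken together, the block above captures the top-level value under its code's label, asserts it does not also appear among the component codes (FHIR Rule obs-7), then folds each component's value in under its own key. A condensed sketch of that flattening, with a simplified normalize_value standing in for the real helper:

def normalize_value(element: dict) -> tuple:
    # Simplified stand-in for the real normalize_value helper.
    if "valueQuantity" in element:
        return element["valueQuantity"]["value"], "value_quantity"
    if "valueString" in element:
        return element["valueString"], "value_string"
    return None, None


def flatten_observation(resource: dict) -> dict:
    # Flatten an Observation's top-level value plus any component values.
    values = {}

    value, source = normalize_value(resource)
    if resource.get("code", {}).get("text"):
        source = resource["code"]["text"]
    if value is not None:
        values[source] = value

    for component in resource.get("component", []):
        value, source = normalize_value(component)
        if component.get("code", {}).get("text"):
            source = component["code"]["text"]
        if value is None:
            continue
        values[source] = value

    if "text" in resource.get("code", {}):
        values["observation_code"] = resource["code"]["text"]
    return values

The real method additionally asserts that no resulting key still starts with "value", which would indicate a mislabeled column.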
@@ -515,17 +520,16 @@ def flattened_specimens(self) -> Generator[dict, None, None]:
cursor = self.connection.cursor()

# get a dict mapping focus ID to its associated observations
observations_by_focus_id = self.get_observations_by_focus(resource_type)
specimen_observations_by_focus_id = self.get_observations_by_focus(resource_type)
service_requests_by_specimen_id = self.get_resources_by_reference("ServiceRequest", "specimen", "Specimen")
document_references_by_based_on_id = self.get_resources_by_reference("DocumentReference", "basedOn", "ServiceRequest")

# flatten each document reference
cursor.execute("SELECT * FROM resources where resource_type = ?", (resource_type,))
for _, _, _, resource in cursor.fetchall():
specimen = json.loads(resource)
print(specimen)
yield self.flattened_specimen(
specimen, observations_by_focus_id, service_requests_by_specimen_id, document_references_by_based_on_id
specimen, specimen_observations_by_focus_id, service_requests_by_specimen_id, document_references_by_based_on_id
)
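The generator reads every stored row of the requested resource type and re-hydrates the JSON before flattening it with the lookup dicts built above. A minimal sketch of that read loop, assuming a resources table whose last column holds the raw resource JSON (as the four-field tuple unpacking above implies):

import json
import sqlite3


def iter_resources(connection: sqlite3.Connection, resource_type: str):
    # Yield parsed resource dicts of one type from the resources table.
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM resources WHERE resource_type = ?", (resource_type,))
    for *_, resource in cursor.fetchall():
        yield json.loads(resource)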

def flattened_specimen(
@@ -537,7 +541,17 @@ def flattened_specimen(
flat_specimen = traverse(specimen)

# extract its .subject and append its fields (including id)
flat_specimen.update(self.get_subject(specimen))
subject = self.get_subject(specimen)
if "patient_id" in subject:
assert len(self.flattened_patients()) > 1, f"Length of flattened_patients is {len(self.flattened_patients())}"
_flattened_patient = next(
iter([_ for _ in self.flattened_patients() if _["patient_id"] == subject["patient_id"]]), None
)
if not _flattened_patient:
print(f"Patient not found {subject['patient_id']} {[_['patient_id'] for _ in self.flattened_patients()]}")
else:
subject = {f"patient_{k}".replace("patient_patient_", "patient_"): v for k, v in _flattened_patient.items()}
flat_specimen.update(subject)

# populate observation codes for each associated observation
if specimen["id"] in observation_by_id:
Expand All @@ -561,7 +575,7 @@ def flattened_specimen(
return flat_specimen
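The new subject-handling block looks the specimen's patient up in the (now cached) flattened patient list and merges that patient's columns into the specimen row under a patient_ prefix, taking care not to double-prefix patient_id. A sketch of that merge with hypothetical dicts (merge_patient_into_specimen is not a name from the diff):

def merge_patient_into_specimen(flat_specimen: dict, flattened_patient: dict) -> dict:
    # Prefix each patient column with "patient_", collapsing "patient_patient_" back to "patient_".
    prefixed = {
        f"patient_{k}".replace("patient_patient_", "patient_"): v
        for k, v in flattened_patient.items()
    }
    flat_specimen.update(prefixed)
    return flat_specimen


row = merge_patient_into_specimen(
    {"specimen_id": "s1"},
    {"patient_id": "p1", "gender": "female"},  # hypothetical flattened patient
)
assert row == {"specimen_id": "s1", "patient_id": "p1", "patient_gender": "female"}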

@lru_cache(maxsize=None)
def flattened_patients(self) -> Generator[dict, None, None]:
def flattened_patients(self) -> list[dict]:
"""
Generator that yields flattened Patient records.
Each flattened Patient merges in fields from:
@@ -571,11 +585,12 @@ def flattened_patients(self) -> Generator[dict, None, None]:
cursor = self.connection.cursor()

observations_by_focus = self.get_resources_by_reference("Observation", "focus", "Patient")

cursor.execute("SELECT * FROM resources WHERE resource_type = ?", (resource_type,))
_flattened_patients = []
for _, _, _, resource in cursor.fetchall():
patient = json.loads(resource)
yield self.flattened_patient(patient, observations_by_focus)
_flattened_patients.append(self.flattened_patient(patient, observations_by_focus))
return _flattened_patients
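The return type changes from a generator to a list because flattened_patients is wrapped in @lru_cache: caching a generator means the second caller gets an already-exhausted iterator, while a cached list can be re-scanned and passed to len(), as the specimen path above now does. A small illustration of the difference:

from functools import lru_cache


@lru_cache(maxsize=None)
def cached_generator():
    # The generator object itself is cached, so it can only be consumed once.
    return (n for n in range(3))


@lru_cache(maxsize=None)
def cached_list():
    # The materialized list is cached, so every caller sees the same reusable data.
    return [n for n in range(3)]


assert list(cached_generator()) == [0, 1, 2]
assert list(cached_generator()) == []          # exhausted on the second call
assert list(cached_list()) == [0, 1, 2]
assert list(cached_list()) == [0, 1, 2]        # still populated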

@staticmethod
def flattened_patient(patient: dict, observations_by_subject: dict) -> dict:
