Merge pull request #71 from GeoscienceAustralia/NPI-3685-streamline-sp3-incorrect-timerange-unit-test

NPI-3685 Streamline SP3 incorrect timerange unit test
treefern authored Jan 31, 2025
2 parents 01bbebe + eba0da8 commit ecdde74
Showing 7 changed files with 362 additions and 8,681 deletions.
5 changes: 5 additions & 0 deletions gnssanalysis/gn_io/sp3.py
@@ -367,6 +367,10 @@ def _process_sp3_block(
    names: List[str] = _SP3_DEF_PV_NAME,
) -> _pd.DataFrame:
    """Process a single block of SP3 data.
+    NOTE: this creates a temporary DataFrame, complete with indexes etc., for *every epoch* of SP3 data read in.
+    This is expensive! Epoch count has far more impact on SP3 loading speed than the number of satellites.
+    TODO It may be possible to speed up SP3 reading by changing this logic to parse the data but not build a full
+    DataFrame from it, only converting to a DataFrame in the parent function, once all the data is concatenated.

    :param str date: The date of the SP3 data block.
@@ -378,6 +382,7 @@ def _process_sp3_block(
    if not data or len(data) == 0:
        return _pd.DataFrame()
    epochs_dt = _pd.to_datetime(_pd.Series(date).str.slice(2, 21).values.astype(str), format=r"%Y %m %d %H %M %S")
+    # NOTE: setting dtype_backend="pyarrow" currently breaks parsing.
    temp_sp3 = _pd.read_fwf(_io.StringIO(data), widths=widths, names=names)
    # TODO set datatypes per column in advance
    # TODO maybe change this after updating everything else to use actual NaNs?
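A rough sketch of the idea in the TODO above: collect plain records while parsing and build a single DataFrame at the end, rather than one per epoch. This is not the library's implementation; `blocks`, the whitespace split, and the unnamed columns are illustrative simplifications.

```python
import pandas as pd

def read_sp3_blocks(blocks):
    """Hypothetical sketch: collect plain tuples per epoch, then build ONE
    DataFrame for the whole file instead of one DataFrame per epoch block.
    `blocks` is assumed to be an iterable of (epoch_header, block_text)."""
    rows = []
    for date, data in blocks:
        for line in data.splitlines():
            if not line.strip():
                continue
            # Real SP3 records are fixed-width; a whitespace split keeps this short.
            sat, *values = line.split()
            rows.append((date, sat, *values))
    df = pd.DataFrame(rows)
    # One vectorised datetime conversion at the end, mirroring the slice and
    # format that the current per-epoch code applies to each epoch header.
    df[0] = pd.to_datetime(df[0].str.slice(2, 21), format=r"%Y %m %d %H %M %S")
    return df
```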
49 changes: 49 additions & 0 deletions gnssanalysis/gn_utils.py
@@ -2,6 +2,7 @@
import os as _os
import sys as _sys
import pathlib as _pathlib
+from time import perf_counter

import click as _click

@@ -871,3 +872,51 @@ def clkq(
        out_file.writelines(output_str)
    else:
        print(output_str)


+class ContextTimer:
+    """
+    Utility for measuring function execution time (e.g. for manually profiling which unit tests are taking
+    excessive time).
+    Use it as a context manager, e.g. (values shown are the defaults, apart from name):
+    with ContextTimer(print_time=True, name="func name", flag_if_over_sec=1.0, skip_if_under_sec=0.01) as timer:
+        some_function_to_time()
+    Based on https://stackoverflow.com/a/69156219
+    """

+    def __init__(self, **kwargs):
+        # kwargs is always a dict here, so per-key defaults replace a chain of
+        # presence checks (an "if kwargs is not None" guard can never be False).
+        self.print_time = bool(kwargs.get("print_time", True))
+        self.name = str(kwargs["name"]) if "name" in kwargs else None
+        self.flag_if_over_sec = float(kwargs.get("flag_if_over_sec", 1.0))
+        self.skip_if_under_sec = float(kwargs.get("skip_if_under_sec", 0.01))

+    def __enter__(self):
+        self.start = perf_counter()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.time = perf_counter() - self.start
+        if self.skip_if_under_sec and self.time < self.skip_if_under_sec:  # Too quick to be worth reporting
+            return
+        do_flag = self.flag_if_over_sec and self.time > self.flag_if_over_sec
+        self.readout = (
+            f"{'SLOW!! ' if do_flag else ''}{self.time:.3f} sec elapsed{f' for {self.name}' if self.name else ''}"
+        )
+        if self.print_time:
+            print(self.readout)
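A usage sketch for the new timer, following its docstring (the timed function here is illustrative):

```python
from gnssanalysis.gn_utils import ContextTimer

def slow_step():
    # Stand-in for whatever is being profiled, e.g. a unit test body.
    return sum(i * i for i in range(5_000_000))

with ContextTimer(name="slow_step", flag_if_over_sec=1.0) as timer:
    slow_step()

# timer.time now holds the elapsed seconds. Runs shorter than skip_if_under_sec
# (default 0.01 s) print nothing; runs longer than flag_if_over_sec get a
# "SLOW!!" prefix in the printed readout.
```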
6 changes: 3 additions & 3 deletions tests/test_clk.py
@@ -1,8 +1,5 @@
from pyfakefs.fake_filesystem_unittest import TestCase

-import numpy as np
-import pandas as pd
-
import gnssanalysis.gn_io.clk as clk
import gnssanalysis.gn_diffaux as gn_diffaux

@@ -17,8 +14,10 @@
class TestClk(TestCase):
    def setUp(self):
        self.setUpPyfakefs()
+        self.fs.reset()

    def test_clk_read(self):
+        self.fs.reset()
        file_paths = ["/fake/dir/file0.clk", "/fake/dir/file1.clk"]
        self.fs.create_file(file_paths[0], contents=input_data_igs)
        self.fs.create_file(file_paths[1], contents=input_data_gfz)
@@ -36,6 +35,7 @@ def test_clk_read(self):
        self.assertEqual(clk_df_gfz["EST"].iloc[-1], -0.000610553573006, msg="Check last datapoint is correct")

    def test_compare_clk(self):
+        self.fs.reset()  # Reset pyfakefs to delete any files which may have persisted from a previous test
        file_paths = ["/fake/dir/file0.clk", "/fake/dir/file1.clk"]
        self.fs.create_file(file_paths[0], contents=input_data_igs)
        self.fs.create_file(file_paths[1], contents=input_data_gfz)
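For context, the reset pattern these tests adopt looks like this in isolation (a minimal sketch; paths and contents are placeholders):

```python
from pyfakefs.fake_filesystem_unittest import TestCase

class ExampleTest(TestCase):
    def setUp(self):
        self.setUpPyfakefs()
        # Delete any fake files left behind by a previously-run test, so each
        # test starts from an empty in-memory filesystem.
        self.fs.reset()

    def test_roundtrip(self):
        self.fs.create_file("/fake/dir/data.txt", contents="hello")
        with open("/fake/dir/data.txt") as f:
            self.assertEqual(f.read(), "hello")
```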