From 05b0c93db422f35dde2af5eed1a8d6d8bde3f2ff Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:50:26 +0000
Subject: [PATCH 01/12] NPI-3683 added a function for removing satellites from
 the internal header representation, to help maintain consistency of that
 header data. This is useful for example when removing offline sats.

---
 gnssanalysis/gn_io/sp3.py | 26 ++++++++++++++++
 tests/test_sp3.py         | 62 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 77cd643..67f83ae 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -193,6 +193,32 @@ def sp3_clock_nodata_to_nan(sp3_df: _pd.DataFrame) -> None:
     sp3_df.loc[nan_mask, ("EST", "CLK")] = _np.nan
 
 
+def remove_svs_from_header(sp3_df: _pd.DataFrame, sats_to_remove: set[str]):
+    """
+    Utility function to update the internal representation of an SP3 header, when SVs are removed from the SP3
+    DataFrame. This is useful e.g. when removing offline satellites.
+    :param _pd.DataFrame sp3_df: SP3 DataFrame on which to update the header (in place).
+    :param set[str] sats_to_remove: set of SV names to remove from the header.
+    """
+    num_to_remove: int = len(sats_to_remove)
+
+    # Update header SV count (bunch of type conversion because header is stored as strings)
+    sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED = str(int(sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED) - num_to_remove)
+
+    # Remove sats from the multi-index which contains SV_INFO and HEAD. This covers both the SV list and the accuracy code list.
+    sp3_df.attrs["HEADER"].drop(level=1, labels=sats_to_remove, inplace=True)
+
+    # Notes on the multi-index update:
+
+    # The hierarchy here is:
+    # dict (HEADER) > series (unnamed) > HEAD (part of multi-index) > str (SV_COUNT_STATED)
+    # So we operate on the overall Series in the dict.
+
+    # In the above statement, Level 1 gets us to the 'column' names (like G02), not the 'section' names (like SV_INFO).
+    # Labels will apply to anything at that level (including things in HEAD - but those columns should never share a
+    # name with an SV).
+
+
 def remove_offline_sats(sp3_df: _pd.DataFrame, df_friendly_name: str = ""):
     """
     Remove any satellites that have "0.0" or NaN for all three position coordinates - this indicates satellite offline.
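For illustration, a minimal usage sketch of the new helper (the file path and SV names here are hypothetical; the header layout held in .attrs["HEADER"] is as exercised by the tests below):

    from gnssanalysis.gn_io import sp3

    # Hypothetical input: any SP3 file whose header lists SVs G02, G03 and G19.
    sp3_df = sp3.read_sp3("some_file.sp3", pOnly=False)
    print(sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED)  # "3" - the count is stored as a string
    # Drop one SV from the header's SV list and accuracy code list, decrementing the stated count:
    sp3.remove_svs_from_header(sp3_df, {"G19"})
    print(sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED)  # "2"; SV_INFO no longer lists G19

Note this updates only the header representation in .attrs; removing the SV's rows from the DataFrame itself is a separate step (remove_offline_sats() is wired up to do both in a later patch in this series).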
diff --git a/tests/test_sp3.py b/tests/test_sp3.py
index dcb2739..ecefc94 100644
--- a/tests/test_sp3.py
+++ b/tests/test_sp3.py
@@ -202,19 +202,44 @@ def test_velinterpolation(self, mock_file):
     @patch("builtins.open", new_callable=mock_open, read_data=offline_sat_test_data)
     def test_sp3_offline_sat_removal(self, mock_file):
         sp3_df = sp3.read_sp3("mock_path", pOnly=False)
+
+        # Confirm starting state of content
         self.assertEqual(
             sp3_df.index.get_level_values(1).unique().array.tolist(),
             ["G02", "G03", "G19"],
             "Should be three SVs in test file before removing offline ones",
         )
+        # Confirm header matches (this doubles as a test of the header update)
+        self.assertEqual(
+            sp3_df.attrs["HEADER"].SV_INFO.index.array.tolist(),
+            ["G02", "G03", "G19"],
+            "Should be three SVs in parsed header before removing offline ones",
+        )
+        self.assertEqual(
+            sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED, "3", "Header should have 3 SVs before removing offline"
+        )
+
+        # Now make the changes - this should also update the header
         sp3_df = sp3.remove_offline_sats(sp3_df)
+
+        # Check contents
         self.assertEqual(
             sp3_df.index.get_level_values(1).unique().array.tolist(),
             ["G02", "G03"],
             "Should be two SVs after removing offline ones",
         )
+        # Check header
+        self.assertEqual(
+            sp3_df.attrs["HEADER"].SV_INFO.index.array.tolist(),
+            ["G02", "G03"],
+            "Should be two SVs in parsed header after removing offline ones",
+        )
+        self.assertEqual(
+            sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED, "2", "Header should have 2 SVs after removing offline"
+        )
+
     # sp3_test_data_truncated_cod_final is input_data2
     @patch("builtins.open", new_callable=mock_open, read_data=input_data2)
     def test_filter_by_svs(self, mock_file):
@@ -317,6 +342,43 @@ def test_trim_df(self, mock_file):
         )
 
 
+class TestSP3Utils(TestCase):
+
+    @patch("builtins.open", new_callable=mock_open, read_data=input_data)
+    def test_get_unique_svs(self, mock_file):
+        sp3_df = sp3.read_sp3("mock_path", pOnly=True)
+
+        unique_svs = set(sp3.get_unique_svs(sp3_df).values)
+        self.assertEqual(unique_svs, set(["G01", "G02"]))
+
+    @patch("builtins.open", new_callable=mock_open, read_data=input_data)
+    def test_get_unique_epochs(self, mock_file):
+        sp3_df = sp3.read_sp3("mock_path", pOnly=True)
+
+        unique_epochs = set(sp3.get_unique_epochs(sp3_df).values)
+        self.assertEqual(unique_epochs, set([229608000, 229608900, 229609800]))
+
+    @patch("builtins.open", new_callable=mock_open, read_data=sp3c_example2_data)
+    def test_remove_svs_from_header(self, mock_file):
+        sp3_df = sp3.read_sp3("mock_path", pOnly=True)
+        self.assertEqual(sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED, "5", "Header should have 5 SVs to start with")
+        self.assertEqual(
+            set(sp3_df.attrs["HEADER"].SV_INFO.index.values),
+            set(["G01", "G02", "G03", "G04", "G05"]),
+            "Header SV list should have the 5 SVs expected to start with",
+        )
+
+        # Remove two specific SVs
+        sp3.remove_svs_from_header(sp3_df, set(["G02", "G04"]))
+
+        self.assertEqual(sp3_df.attrs["HEADER"].HEAD.SV_COUNT_STATED, "3", "Header should have 3 SVs after removal")
+        self.assertEqual(
+            set(sp3_df.attrs["HEADER"].SV_INFO.index.values),
+            set(["G01", "G03", "G05"]),
+            "Header SV list should have the 3 SVs expected",
+        )
+
+
 class TestMergeSP3(TestCase):
     def setUp(self):
         self.setUpPyfakefs()

From 1c6b980edb365ffe88b9f5d6b2e92a9af2d342f0 Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:51:28 +0000
Subject: [PATCH 02/12] NPI-3683 added simple check and logging of SP3 file
 version detected by header parser,
 including a warning for old versions of the format which we may not have
 tested for

---
 gnssanalysis/gn_io/sp3.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 67f83ae..7d97016 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -751,6 +751,22 @@ def parse_sp3_header(header: bytes, warn_on_negative_sv_acc_values: bool = True)
             "SV_COUNT_STATED",  # (Here for convenience) parsed earlier with _RE_SP3_HEADER_SV, not by above regex
         ],
     )
+    # SP3 file versions:
+    # - a: Oct 1995 - Single constellation.
+    # - b: Oct 1998 - Adds GLONASS support.
+    # - c: ~2006 to Aug 2010 - Appears to add multiple new constellations, Japan's QZSS added in 2010.
+    # - d: Feb 2016 - Max number of satellites raised from 85 to 999.
+
+    header_version = str(header_array[0])
+    if header_version in ("a", "b"):
+        logger.warning(f"SP3 file is old version: '{header_version}', you may experience parsing issues")
+    elif header_version in ("c", "d"):
+        logger.info(f"SP3 header states SP3 file version is: {header_version}")
+    else:
+        logger.warning(
+            f"SP3 header is of an unknown version, or failed to parse! Version appears to be: '{header_version}'"
+        )
+
 except AttributeError as e:  # Make the exception slightly clearer.
     raise AttributeError("Failed to parse SP3 header. Regex likely returned no match.", e)

From 5bcca65320cc5a5b541fd9d2946a3012aaad98ed Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Fri, 10 Jan 2025 09:59:10 +0000
Subject: [PATCH 03/12] NPI-3683 added test for bug where EV and EP rows were
 being included in the list of SVs - the only impact was in validation code,
 because rows were not well ordered

---
 tests/test_sp3.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/test_sp3.py b/tests/test_sp3.py
index ecefc94..36fe1ca 100644
--- a/tests/test_sp3.py
+++ b/tests/test_sp3.py
@@ -138,6 +138,19 @@ def test_read_sp3_validation_sv_count_mismatch_header_vs_content(self, mock_file
             "Header says there should be 1 epochs, however there are 2 (unique) epochs in the content (duplicate epoch check comes later).",
             "Loading SP3 with mismatch between SV count in header and in content, should raise exception",
         )
+
+    @patch("builtins.open", new_callable=mock_open, read_data=sp3c_example2_data)
+    def test_read_sp3_correct_svs_read_when_ev_ep_present(self, mock_file):
+        # This should not raise an exception; SV count should match header if parsed correctly.
+        result = sp3.read_sp3(
+            "testfile.SP3",
+            pOnly=False,
+            check_header_vs_filename_vs_content_discrepancies=True,  # Actually enable the checks for this one
+            skip_filename_in_discrepancy_check=True,
+        )
+        parsed_svs_content = sp3.get_unique_svs(result).astype(str).values
+        self.assertEqual(set(parsed_svs_content), set(["G01", "G02", "G03", "G04", "G05"]))
+
     # TODO Add test(s) for correctly reading header fundamentals (ACC, ORB_TYPE, etc.)
     # TODO add tests for correctly reading the actual content of the SP3 in addition to the header.
    # TODO add tests for correctly generating sp3 output content with gen_sp3_content() and gen_sp3_header()

From 94a9b233511b352ef73968860a2c2028e3e37052 Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Fri, 10 Jan 2025 10:04:05 +0000
Subject: [PATCH 04/12] NPI-3683 updated remove_offline_sats() to update
 internal representation of SP3 header when it removes an SV from the
 dataframe

---
 gnssanalysis/gn_io/sp3.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 7d97016..94d8567 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -226,7 +226,7 @@ def remove_offline_sats(sp3_df: _pd.DataFrame, df_friendly_name: str = ""):
     observations will get removed entirely.
 
     Added in version 0.0.57
-    :param _pd.DataFrame sp3_df: SP3 DataFrame to remove offline / nodata satellites from
+    :param _pd.DataFrame sp3_df: SP3 DataFrame to remove offline / nodata satellites from.
     :param str df_friendly_name: Name to use when referring to the DataFrame in log output (purely for clarity). Empty
         string by default.
     :return _pd.DataFrame: the SP3 DataFrame, with the offline / nodata satellites removed
@@ -241,14 +241,25 @@ def remove_offline_sats(sp3_df: _pd.DataFrame, df_friendly_name: str = ""):
     mask_nan = (sp3_df.EST.X.isna()) & (sp3_df.EST.Y.isna()) & (sp3_df.EST.Z.isna())
     mask_either = _np.logical_or(mask_zero, mask_nan)
 
+    # We expect EV, EP rows to have already been filtered out
+    if "PV_FLAG" in sp3_df.index.names:
+        raise Exception(
+            "PV_FLAG index level found in dataframe while trying to remove offline sats. "
+            "De-interlacing of Pos, Vel and EV / EP rows must be performed before running this"
+        )
     # With mask, filter for entries with no POS value, then get the sat name (SVs) from the entry, then dedupe:
     offline_sats = sp3_df[mask_either].index.get_level_values(1).unique()
 
     # Using that list of offline / partially offline sats, remove all entries for those sats from the SP3 DataFrame:
     sp3_df = sp3_df.drop(offline_sats, level=1, errors="ignore")
+    # TODO should the following be removed?!:
    sp3_df.attrs["HEADER"].HEAD.ORB_TYPE = "INT"  # Allow the file to be read again by read_sp3 - File ORB_TYPE changes
     if len(offline_sats) > 0:
-        logger.info(f"Dropped offline / nodata sats from {df_friendly_name} SP3 DataFrame: {offline_sats.values}")
+        # Update the internal representation of the SP3 header to match the change
+        remove_svs_from_header(sp3_df, offline_sats.values)
+        logger.info(
+            f"Dropped offline / nodata sats from {df_friendly_name} SP3 DataFrame (including header): {offline_sats.values}"
+        )
     else:
         logger.info(f"No offline / nodata sats detected to be dropped from {df_friendly_name} SP3 DataFrame")
     return sp3_df

From cd716e11108e8b65dfd749ca58e60c9c9e1f0ce6 Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Mon, 13 Jan 2025 00:22:54 +0000
Subject: [PATCH 05/12] NPI-3683 small docstring and comment clarity
 improvements

---
 gnssanalysis/gn_io/sp3.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 94d8567..8b521a9 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -30,8 +30,10 @@
 )
 _RE_SP3_COMMENT_STRIP = _re.compile(rb"^(\/\*.*$\n)", _re.MULTILINE)
 
+
 # Regex to extract Satellite Vehicle (SV) names (E.g. G02). In SP3-d (2016) up to 999 satellites can be included.
 # Regex options/flags: multiline, findall.
Updated to extract expected SV count too. +# Note this extracts both the stated number of SVs AND the actual SV names. _RE_SP3_HEADER_SV = _re.compile(rb"^\+[ ]+([\d]+|)[ ]+((?:[A-Z]\d{2})+)\W", _re.MULTILINE) # Regex for orbit accuracy codes (E.g. ' 15' - space padded, blocks are three chars wide). @@ -184,6 +186,7 @@ def sp3_pos_nodata_to_nan(sp3_df: _pd.DataFrame) -> None: def sp3_clock_nodata_to_nan(sp3_df: _pd.DataFrame) -> None: """ Converts the SP3 Clock column's nodata values (999999 or 999999.999999 - fractional component optional) to NaNs. + Operates on the DataFrame in place. See https://files.igs.org/pub/data/format/sp3_docu.txt :param _pd.DataFrame sp3_df: SP3 data frame to filter nodata values for From 2597f9aaccf4dd7045bea3fff0a38905b0b77f1f Mon Sep 17 00:00:00 2001 From: Nathan <95725385+treefern@users.noreply.github.com> Date: Mon, 13 Jan 2025 00:36:52 +0000 Subject: [PATCH 06/12] NPI-3683 switch over to utility functions for unique SV and epoch lists --- gnssanalysis/gn_io/sp3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py index 8b521a9..e8d79c3 100644 --- a/gnssanalysis/gn_io/sp3.py +++ b/gnssanalysis/gn_io/sp3.py @@ -292,7 +292,7 @@ def filter_by_svs( """ # Get all SV names - all_sv_names = sp3_df.index.get_level_values(1).unique().array + all_sv_names = get_unique_svs(sp3_df) total_svs = len(all_sv_names) logger.info(f"Total SVs: {total_svs}") @@ -427,7 +427,7 @@ def check_epoch_counts_for_discrepancies( sp3_filename = try_get_sp3_filename(sp3_path_or_bytes) # Could potentially check has_duplicates first for speed? - content_unique_epoch_count = draft_sp3_df.index.get_level_values(0).unique().size + content_unique_epoch_count = get_unique_epochs(draft_sp3_df).size # content_total_epoch_count = draft_sp3_df.index.get_level_values(0).size header_epoch_count = int(parsed_sp3_header.HEAD.N_EPOCHS) @@ -920,7 +920,7 @@ def gen_sp3_header(sp3_df: _pd.DataFrame) -> str: # Check that number of SVs the header metadata says should be here, matches the number of SVs *listed* in # header metadata (which we use to output the new header). Then check that this in turn matches the number of # unique SVs in the SP3 DataFrame itself. - dataframe_sv_count = sp3_df.index.get_level_values(1).unique().size + dataframe_sv_count = get_unique_svs(sp3_df).size header_sv_stated_count = int(head["SV_COUNT_STATED"]) header_sv_actual_count = int(n_sats) From 74a1f30885c7024dacfe991b96134889336f0722 Mon Sep 17 00:00:00 2001 From: Nathan <95725385+treefern@users.noreply.github.com> Date: Mon, 13 Jan 2025 00:42:47 +0000 Subject: [PATCH 07/12] NPI-3683 update comments for clarity --- gnssanalysis/gn_io/sp3.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py index e8d79c3..74ec819 100644 --- a/gnssanalysis/gn_io/sp3.py +++ b/gnssanalysis/gn_io/sp3.py @@ -859,19 +859,21 @@ def get_unique_svs(sp3_df: _pd.DataFrame) -> _pd.Index: """ # Are we dealing with a DF which is early in processing: with Position, Velocity, and EV / EP rows in it? - # This will have the PV_FLAG index level. It *may* contain EV / EP rows, though until we support these they - # should be promptly removed at that point. - # Alternativley, has this DF had Position and Velocity data merged (into columns, not interlaced rows). - # In this case the PV_FLAG index level will have been dropped. + # -> This will have the PV_FLAG index level. 
It *may* contain EV / EP rows, though until we support these they
+    #    should be promptly removed at that point.
+
+    # Alternatively, has this DF had Position and Velocity data merged (into columns, not interlaced rows)?
+    #  -> In this case the PV_FLAG index level will have been dropped.
     if "PV_FLAG" in sp3_df.index.names:
         if "E" in sp3_df.index.get_level_values("PV_FLAG").unique():
             logger.warning(
                 "EV/EP record found late in SP3 processing. Until we actually support them, "
                 "they should be removed by the EV/EP check earlier on! Filtering out while determining unique SVs."
             )
-            # Filter on data that doesn't relate to EV / EP rows
+            # Filter to data that doesn't relate to EV / EP rows, then get SVs from index (level 1).
             return sp3_df.loc[sp3_df.index.get_level_values("PV_FLAG") != "E"].index.get_level_values(1).unique()
+    # Get index values from level 1 (which is SVs)
     return sp3_df.index.get_level_values(1).unique()

From 8f348a3d859a168c34641f2327082d03425c284e Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Mon, 13 Jan 2025 00:52:44 +0000
Subject: [PATCH 08/12] NPI-3683 stop setting SP3 header ORB_TYPE to INT when
 removing offline sats. Update docstring

---
 gnssanalysis/gn_io/sp3.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 74ec819..2ac0d13 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -227,6 +227,7 @@ def remove_offline_sats(sp3_df: _pd.DataFrame, df_friendly_name: str = ""):
     Remove any satellites that have "0.0" or NaN for all three position coordinates - this indicates satellite offline.
     Note that this removes a satellite which has *any* missing coordinate data, meaning a satellite with partial
     observations will get removed entirely.
+    Note: the internal representation of the SP3 header will be updated to reflect the removal.
 
     Added in version 0.0.57
     :param _pd.DataFrame sp3_df: SP3 DataFrame to remove offline / nodata satellites from.
@@ -255,8 +256,7 @@ def remove_offline_sats(sp3_df: _pd.DataFrame, df_friendly_name: str = ""):
     # Using that list of offline / partially offline sats, remove all entries for those sats from the SP3 DataFrame:
     sp3_df = sp3_df.drop(offline_sats, level=1, errors="ignore")
-    # TODO should the following be removed?!:
-    sp3_df.attrs["HEADER"].HEAD.ORB_TYPE = "INT"  # Allow the file to be read again by read_sp3 - File ORB_TYPE changes
+
     if len(offline_sats) > 0:
         # Update the internal representation of the SP3 header to match the change
         remove_svs_from_header(sp3_df, offline_sats.values)

From 0f0402286509a314e4fd5821bd98635a4897ebdc Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Mon, 13 Jan 2025 00:55:44 +0000
Subject: [PATCH 09/12] NPI-3683 within read_sp3() change order of dataframe
 restructuring, and various check and clean-up functions, to streamline code
 and separate concerns. The dataframe should now be more streamlined by the
 time checks are invoked, meaning the checks have fewer edge cases to
 consider.
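For context, this reorder means the consistency checks and clean-up helpers only ever see the consolidated DataFrame. A minimal sketch of how a caller drives them (the path is a placeholder; the keyword arguments are those used by read_sp3() elsewhere in this series):

    from gnssanalysis.gn_io import sp3

    sp3_df = sp3.read_sp3(
        "example.sp3",  # placeholder path
        pOnly=False,
        drop_offline_sats=True,  # offline SV removal now also updates the in-memory header
        nodata_to_nan=True,  # convert SP3 POS / CLK nodata sentinel values to NaN
        check_header_vs_filename_vs_content_discrepancies=True,
        continue_on_discrepancies=True,  # log a warning rather than raising
        skip_filename_in_discrepancy_check=True,  # placeholder path is not a real IGS-style filename
    )

Internally these steps now run only after the interlaced P and V rows have been merged into columns, so none of the checks need to handle EV / EP rows or the PV_FLAG index level.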
---
 gnssanalysis/gn_io/sp3.py | 102 ++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 47 deletions(-)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 2ac0d13..8ee7199 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -547,7 +547,6 @@ def read_sp3(
     header = content[:header_end]
     content = content[header_end:]
     parsed_header = parse_sp3_header(header)
-    header_sv_count = parsed_header.SV_INFO.count()  # count() of non-NA/null, vs shape: dims of whole structure
     fline_b = header.find(b"%f") + 2  # TODO add to header parser
     fline = header[fline_b : fline_b + 24].strip().split(b" ")
     base_xyzc = _np.asarray([float(fline[0])] * 3 + [float(fline[1])])  # exponent base
@@ -555,55 +554,18 @@ def read_sp3(
     sp3_df = _pd.concat([_process_sp3_block(date, data) for date, data in zip(date_lines, data_blocks)])
     sp3_df = _reformat_df(sp3_df)
 
-    # Are we running discrepancy checks?
-    if check_header_vs_filename_vs_content_discrepancies:
-        # SV count discrepancy check
-
-        # Check that count of SVs from header matches number of unique SVs we read in.
-        # To determine the unique SVs in the content we:
-        # 1. Index into the work-in-progress sp3 dataframe, filtering out anything where the PV flag is E (these entries
-        #    contain EP or EV data, not the name of an SV).
-        # 2. Get the index from that, filter down to level 1 (which has the SVs).
-        # 3. Make these SVs unique (as when you have P and V data, you will get two copies of every SV).
-        # 4. Get the size of this index, to determine the total number of unique SVs in the SP3 content.
-        df_sv_count = (
-            sp3_df.loc[sp3_df.index.get_level_values("PV_FLAG") != "E"].index.get_level_values(1).unique().size
-        )
-        if header_sv_count != df_sv_count:
-            if not continue_on_discrepancies:
-                raise ValueError(
-                    f"Number of SVs in SP3 header ({header_sv_count}) did not match file contents ({df_sv_count})!"
-                )
-            logger.warning(
-                f"Number of SVs in SP3 header ({header_sv_count}) did not match file contents ({df_sv_count})!"
-            )
-
-        # Epoch count discrepancy check
-        # Are we including the filename in the check?
-        path_bytes_to_pass = None if skip_filename_in_discrepancy_check else sp3_path_or_bytes
-        check_epoch_counts_for_discrepancies(
-            draft_sp3_df=sp3_df,
-            parsed_sp3_header=parsed_header,
-            sp3_path_or_bytes=path_bytes_to_pass,
-            continue_on_error=continue_on_discrepancies,
-        )
-
-    if drop_offline_sats:
-        sp3_df = remove_offline_sats(sp3_df)
-    if nodata_to_nan:
-        # Convert 0.000000 (which indicates nodata in the SP3 POS column) to NaN
-        sp3_pos_nodata_to_nan(sp3_df)
-        # Convert 999999* (which indicates nodata in the SP3 CLK column) to NaN
-        sp3_clock_nodata_to_nan(sp3_df)
 
     # P/V/EP/EV flag handling is currently incomplete. The current implementation truncates to the first letter,
     # so can't parse nor differentiate between EP and EV!
     if "E" in sp3_df.index.get_level_values("PV_FLAG").unique():
         if not continue_on_ep_ev_encountered:
             raise NotImplementedError("EP and EV flag rows are currently not supported")
-        logger.warning("EP / EV flag rows encountered. These are not yet supported, and will be ignored!")
+        logger.warning("EP / EV flag rows encountered. These are not yet supported. Dropping them from DataFrame...")
+    # Filter out EV / EP records, before we trip ourselves up with them. Technically this is redundant as in the
+    # next section we extract P and V records, then drop the PV_FLAG level.
+    sp3_df = sp3_df.loc[sp3_df.index.get_level_values("PV_FLAG") != "E"]
 
-    # Check very top of the header to see if this SP3 is Position only , or also contains Velocities
+    # Extract and merge Position and Velocity data. I.e. take the interlaced P and V rows, and restructure them into columns
+    # Check very top of the header to see if this SP3 is Position only, or also contains Velocities.
     if pOnly or parsed_header.HEAD.loc["PV_FLAG"] == "P":
         sp3_df = sp3_df.loc[sp3_df.index.get_level_values("PV_FLAG") == "P"]
         sp3_df.index = sp3_df.index.droplevel("PV_FLAG")
@@ -624,8 +586,56 @@ def read_sp3(
         position_df = position_df.drop(axis=1, columns="FLAGS")
         velocity_df.columns = SP3_VELOCITY_COLUMNS
 
+        # NOTE from the docs: pandas.concat copies attrs only if all input datasets have the same attrs.
+        # For simplicity, we write the header to .attrs in the next section.
         sp3_df = _pd.concat([position_df, velocity_df], axis=1)
+        # Position and Velocity (if present) now have their own columns, and are indexed on SV (rather than
+        # being interlaced as in source data).
+
+    # As the main transformations are done, write header data to dataframe attributes now,
+    # so it can travel with the dataframe from here on. NOTE: Pandas docs say .attrs is an experimental feature.
+    sp3_df.attrs["HEADER"] = parsed_header
+    sp3_df.attrs["path"] = sp3_path_or_bytes if type(sp3_path_or_bytes) in (str, Path) else ""
+
+    # DataFrame has been consolidated. Run remaining consistency checks, conversions, and removal steps. Doing this
+    # earlier would risk things like trying to pull pos values from an EV / EP row before it had been filtered out.
+
+    if drop_offline_sats:
+        sp3_df = remove_offline_sats(sp3_df)
+    if nodata_to_nan:
+        # Convert 0.000000 (which indicates nodata in the SP3 POS column) to NaN
+        sp3_pos_nodata_to_nan(sp3_df)
+        # Convert 999999* (which indicates nodata in the SP3 CLK column) to NaN
+        sp3_clock_nodata_to_nan(sp3_df)
+
+    # Are we running discrepancy checks?
+    if check_header_vs_filename_vs_content_discrepancies:
+        # SV count discrepancy check
+
+        content_sv_count = get_unique_svs(sp3_df).size
+        # count() gives total of non-NA/null (vs shape, which gets dims of whole structure):
+        header_sv_count = sp3_df.attrs["HEADER"].SV_INFO.count()
+        if header_sv_count != content_sv_count:
+            if not continue_on_discrepancies:
+                raise ValueError(
+                    f"Number of SVs in SP3 header ({header_sv_count}) did not match file contents ({content_sv_count})!"
+                )
+            logger.warning(
+                f"Number of SVs in SP3 header ({header_sv_count}) did not match file contents ({content_sv_count})!"
+            )
+
+        # Epoch count discrepancy check
+
+        # Include filename in check?
+        path_bytes_to_pass = None if skip_filename_in_discrepancy_check else sp3_path_or_bytes
+        check_epoch_counts_for_discrepancies(
+            draft_sp3_df=sp3_df,
+            parsed_sp3_header=parsed_header,
+            sp3_path_or_bytes=path_bytes_to_pass,
+            continue_on_error=continue_on_discrepancies,
+        )
+
     # Check for duplicate epochs, dedupe and log warning
     if sp3_df.index.has_duplicates:  # a literally free check
         # This typically runs in sub ms time. Marks all but first instance as duped:
@@ -638,9 +648,7 @@ def read_sp3(
         )
         # Now dedupe them, keeping the first of any clashes:
         sp3_df = sp3_df[~sp3_df.index.duplicated(keep="first")]
-    # Write header data to dataframe attributes:
-    sp3_df.attrs["HEADER"] = parsed_header
-    sp3_df.attrs["path"] = sp3_path_or_bytes if type(sp3_path_or_bytes) in (str, Path) else ""
+
     return sp3_df

From ff187f9661e5efe83f4eb41ff639f4de9bc42d2e Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Mon, 13 Jan 2025 02:16:27 +0000
Subject: [PATCH 10/12] NPI-3683 update filter_by_svs() to update internal
 header to reflect new list of SVs. Filtering logic reworked to maintain a
 unified set of SVs to keep, converting this into a set to drop just before
 applying. This should avoid the potential to end up with filter_by_name and
 filter_by_count selecting different SVs and ending up with an empty output.
 test_file_creation_util.py updated to allow epoch count to remain unchanged,
 and to use new validation logic when reading files.

---
 gnssanalysis/gn_io/sp3.py               | 42 ++++++++++++++++---------
 gnssanalysis/test_file_creation_util.py | 32 ++++++++++++-------
 2 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 8ee7199..8939760 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -278,6 +278,7 @@ def filter_by_svs(
     Utility function to trim an SP3 DataFrame down, intended for creating small sample SP3 files for unit testing
     (but could be used for other purposes). Can filter to a specific number of SVs, to specific SV names, and to a
     specific constellation.
+    Note: updates (internal representation of) SP3 header, to reflect new SV count.

     These filters can be used together (though filter by name and filter by sat letter i.e. constellation, does not
     make sense).
@@ -296,21 +297,22 @@ def filter_by_svs(
     total_svs = len(all_sv_names)
     logger.info(f"Total SVs: {total_svs}")
 
-    # Drop SVs which don't match given names
+    # Start with all SVs
+    keep_set: set[str] = set(all_sv_names)
+
+    # Disqualify SVs unless they match the given names
     if filter_by_name:
-        # Make set of every SV name to drop (exclude everything besides what we want to keep)
-        exclusion_list: list[str] = list(set(all_sv_names) - set(filter_by_name))
-        sp3_df = sp3_df.drop(exclusion_list, level=1)
+        keep_set.intersection(filter_by_name)
 
-    # Drop SVs which don't match a given constellation letter (i.e. 'G', 'E', 'R', 'C')
+    # Disqualify SVs unless they match a given constellation letter (i.e. 'G', 'E', 'R', 'C')
     if filter_to_sat_letter:
         if len(filter_to_sat_letter) != 1:
             raise ValueError(
-                "Name of sat constellation to filter to, must be a single char. E.g. you cannot enter 'GR'"
+                "Name of sat constellation to filter to, must be a single char. E.g. you cannot enter 'GE'"
             )
-        # Make set of every SV name to drop (exclude everything besides what we want to keep)
-        other_constellation_sats = [sv for sv in all_sv_names if not filter_to_sat_letter.upper() in sv]
-        sp3_df = sp3_df.drop(other_constellation_sats, level=1)
+        # Make set of every SV name in the constellation we're keeping
+        constellation_sats_to_keep = [sv for sv in all_sv_names if filter_to_sat_letter.upper() in sv]
+        keep_set.intersection(constellation_sats_to_keep)
 
     # Drop SVs beyond n (i.e. keep only the first n SVs)
     if filter_by_count:
            raise ValueError(
                f"Cannot filter to max of {filter_by_count} sats, as there are only {total_svs} sats total!"
            )
-        # Exclusion list built by taking all sats *beyond* the amount we want to keep.
-        exclusion_list = all_sv_names[filter_by_count:]
-        sp3_df = sp3_df.drop(exclusion_list, level=1)
+
+        # Convert the working set of SVs to keep into a sorted list, so we can index in sorted order and trim
+        # to e.g. G01, G02, not the first two arbitrary members of the (unordered) set.
+        keep_list = list(keep_set)
+        keep_list.sort()
+        keep_list = keep_list[:filter_by_count]  # Trim to first n sats of sorted list
+
+        keep_set = set(keep_list)
+
+    discard_set = set(all_sv_names).difference(keep_set)
+    # TODO: ideally filter with .loc, e.g. using sp3_df.index.get_level_values(1).isin(keep_set), rather than drop()
+    sp3_df = sp3_df.drop(list(discard_set), level=1)
+
+    # Update internal header to match
+    remove_svs_from_header(sp3_df, discard_set)
 
     return sp3_df
 
@@ -439,7 +453,7 @@ def check_epoch_counts_for_discrepancies(
                 f"{content_unique_epoch_count} (unique) epochs in the content (duplicate epoch check comes later)."
             )
         logger.warning(
-            f"Header says there should be {header_epoch_count} epochs, however there are "
+            f"WARNING: Header says there should be {header_epoch_count} epochs, however there are "
             f"{content_unique_epoch_count} (unique) epochs in the content (duplicate epoch check comes later)."
         )
 
@@ -483,7 +497,7 @@ def check_epoch_counts_for_discrepancies(
             f"there should be {filename_derived_epoch_count} (or {filename_derived_epoch_count-1} at minimum)."
         )
         logger.warning(
-            f"Header says there should be {header_epoch_count} epochs, however filename '{sp3_filename}' implies "
+            f"WARNING: Header says there should be {header_epoch_count} epochs, however filename '{sp3_filename}' implies "
             f"there should be {filename_derived_epoch_count} (or {filename_derived_epoch_count-1} at minimum)."
         )
     # All good, validation passed.

diff --git a/gnssanalysis/test_file_creation_util.py b/gnssanalysis/test_file_creation_util.py
index e7744db..a51d57d 100644
--- a/gnssanalysis/test_file_creation_util.py
+++ b/gnssanalysis/test_file_creation_util.py
@@ -18,7 +18,7 @@
 trim_to_sat_letter: Optional[str] = None  # "E"
 
 # How many epochs to include in the trimmed file (offset from start)
-trim_to_num_epochs: int = 3
+trim_to_num_epochs: Optional[int] = None  # 3
 
 drop_offline_sats: bool = False
 
@@ -28,21 +28,29 @@
 filename = src_path.rsplit("/")[-1]
 print(f"Filename is: {filename}")
 
+# Determine sample rate (needed for trimming)
 # Raw data would be: determine_sp3_name_props() - that retrieves in seconds. But we want to be more generally applicable, so not just SP3 here ideally.
 sample_rate: timedelta = convert_nominal_span(determine_properties_from_filename(filename)["sampling_rate"])
 print(f"sample_rate is: {sample_rate}")
 
 # Load
-print("Loading SP3 into DataFrame...")
-sp3_df = read_sp3(src_path)
+print("Loading SP3 into DataFrame (Pos data only, strict mode, warn only)...")
+sp3_df = read_sp3(
+    src_path,
+    check_header_vs_filename_vs_content_discrepancies=True,
+    continue_on_discrepancies=True,
+)
+print("Read done.")
 
 # Trim to first x epochs
-print(f"Trimming to first {trim_to_num_epochs} epochs")
-sp3_df = trim_to_first_n_epochs(sp3_df=sp3_df, epoch_count=trim_to_num_epochs, sp3_filename=filename)
+if trim_to_num_epochs:
+    print(f"Trimming to first {trim_to_num_epochs} epochs")
+    sp3_df = trim_to_first_n_epochs(sp3_df=sp3_df, epoch_count=trim_to_num_epochs, sp3_filename=filename)
 
 # Filter to chosen SVs or number of SVs...
 print(
-    f"Applying SV filters (max count: {trim_to_sv_count}, limit to names: {trim_to_sv_names}, limit to constellation: {trim_to_sat_letter})..."
+    "Applying SV filters (max count: "
+    f"{trim_to_sv_count}, limit to names: {trim_to_sv_names}, limit to constellation: {trim_to_sat_letter})..."
 )
 sp3_df = filter_by_svs(
     sp3_df, filter_by_count=trim_to_sv_count, filter_by_name=trim_to_sv_names, filter_to_sat_letter=trim_to_sat_letter
 )
 
 # Drop offline sats if requested
 if drop_offline_sats:
-    print(f"Dropping offline sats...")
+    print("Dropping offline sats (and updating header accordingly)...")
     sp3_df = remove_offline_sats(sp3_df)
 
 # Write out
 print(
     "Writing out new SP3 file... "
-    'CAUTION: at the time of writing the header is based on stale metadata in .attrs["HEADER"], not the contents '
-    "of the dataframe. It will need to be manually updated."
+    'CAUTION: please check output header for consistency. It is based on metadata in .attrs["HEADER"], not the '
+    "contents of the dataframe, and may not have been updated for all changes."
 )
 write_sp3(sp3_df, dest_path)
 
 # Test if we can successfully read that file...
-print("Testing re-read of the output file...")
-re_read = read_sp3(dest_path)
+print("Testing re-read of the output file (strict mode, warn only)...")
+re_read = read_sp3(
+    dest_path, pOnly=False, check_header_vs_filename_vs_content_discrepancies=True, continue_on_discrepancies=True
+)

From 4e2d49112f519d1ecc9f59a35eeb55bd7789e682 Mon Sep 17 00:00:00 2001
From: Nathan <95725385+treefern@users.noreply.github.com>
Date: Mon, 13 Jan 2025 02:23:54 +0000
Subject: [PATCH 11/12] NPI-3683 bug fix - update in place vs return new
 result

---
 gnssanalysis/gn_io/sp3.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py
index 8939760..4846d27 100644
--- a/gnssanalysis/gn_io/sp3.py
+++ b/gnssanalysis/gn_io/sp3.py
@@ -302,7 +302,7 @@ def filter_by_svs(
     # Disqualify SVs unless they match the given names
     if filter_by_name:
-        keep_set.intersection(filter_by_name)
+        keep_set = keep_set.intersection(filter_by_name)
 
     # Disqualify SVs unless they match a given constellation letter (i.e. 'G', 'E', 'R', 'C')
     if filter_to_sat_letter:
         if len(filter_to_sat_letter) != 1:
             raise ValueError(
                 "Name of sat constellation to filter to, must be a single char. E.g. you cannot enter 'GE'"
             )
         # Make set of every SV name in the constellation we're keeping
-        constellation_sats_to_keep = [sv for sv in all_sv_names if filter_to_sat_letter.upper() in sv]
-        keep_set.intersection(constellation_sats_to_keep)
+        constellation_sats_to_keep = set([sv for sv in all_sv_names if filter_to_sat_letter.upper() in sv])
+        keep_set = keep_set.intersection(constellation_sats_to_keep)
 
     # Drop SVs beyond n (i.e.
keep only the first n SVs) if filter_by_count: From 955b41af2327da8de40a4747d59b88f6e55e2a22 Mon Sep 17 00:00:00 2001 From: Nathan <95725385+treefern@users.noreply.github.com> Date: Tue, 14 Jan 2025 11:04:40 +0000 Subject: [PATCH 12/12] NPI-3683 add return type hint prompted by PR feedback --- gnssanalysis/gn_io/sp3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnssanalysis/gn_io/sp3.py b/gnssanalysis/gn_io/sp3.py index 4846d27..6f81db2 100644 --- a/gnssanalysis/gn_io/sp3.py +++ b/gnssanalysis/gn_io/sp3.py @@ -196,7 +196,7 @@ def sp3_clock_nodata_to_nan(sp3_df: _pd.DataFrame) -> None: sp3_df.loc[nan_mask, ("EST", "CLK")] = _np.nan -def remove_svs_from_header(sp3_df: _pd.DataFrame, sats_to_remove: set[str]): +def remove_svs_from_header(sp3_df: _pd.DataFrame, sats_to_remove: set[str]) -> None: """ Utility function to update the internal representation of an SP3 header, when SVs are removed from the SP3 DataFrame. This is useful e.g. when removing offline satellites.
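As a closing illustration of the unified keep-set filtering that patches 10 and 11 converge on, here is a standalone sketch (made-up SV names, not data from this series):

    # Mirror of the filter_by_svs() keep-set logic, with illustrative inputs:
    all_sv_names = ["E01", "E02", "G01", "G02", "G03"]
    keep_set = set(all_sv_names)
    keep_set = keep_set.intersection({"E01", "G01", "G02", "G03"})  # filter_by_name
    keep_set = keep_set.intersection(sv for sv in all_sv_names if "G" in sv)  # filter_to_sat_letter="G"
    keep_list = sorted(keep_set)[:2]  # filter_by_count=2: sort first, so we keep G01, G02 - not arbitrary set members
    discard_set = set(all_sv_names).difference(keep_list)
    print(sorted(discard_set))  # ['E01', 'E02', 'G03'] - these would be dropped from both the DataFrame and the header

Because every filter narrows the same keep set, filter_by_name and filter_by_count can no longer select disjoint SVs and silently produce an empty result.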