Skip to content

Commit

Permalink
feat: update to noisepy-io v0.2.0 (#327)
Browse files Browse the repository at this point in the history
* enforce single sampling rate for one station

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update tutorial

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update tutorial

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update io ver

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update tests for single_freq

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update get_started

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

* update monitoring

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>

---------

Signed-off-by: Yiyu Ni <niyiyu@uw.edu>
  • Loading branch information
niyiyu authored Nov 6, 2024
1 parent 2b7627d commit b8297fc
Show file tree
Hide file tree
Showing 13 changed files with 197 additions and 166 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ tutorials/ncedc_data/
tutorials/cli/tmpdata
tutorials/pnw_data
tutorials/composite_data
tutorials/monitoring/CCF_ASDF/
tutorials/monitoring/Monitoring_output.csv
tutorials/monitoring/monito_config.yml

# Jupyterbook
_build/
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Taxonomy of the NoisePy variables.
* ``step`` is the window that get skipped when sliding windows in seconds
* ``smooth_N`` number of points for smoothing the time or frequency domain discrete arrays.
* ``maxlag`` maximum length in seconds saved in files in each side of the correlation (save on storage)
* ``substack,substack_len`` boolean, window length over which to substack the correlation (to save storage or do monitoring), it has to be a multiple of ``cc_len``.
* ``substack, substack_windows`` boolean, number of windows over which to substack the correlation (to save storage or do monitoring).
* ``time_chunk, nchunk`` refers to the time unit that defines a single job. For instance, ``cc_len`` is the correlation length (e.g., 1 hour, 30 min), and the overall duration of the experiment is the total length (1 month, 1 year, ...). The time chunk could be 1 day: the code would loop through each cc_len window in a for loop. But each day will be sent as a thread.

## Acknowledgements
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dependencies = [
"PyYAML==6.0",
"pydantic-yaml==1.0",
"psutil>=5.9.5,<6.0.0",
"noisepy-seis-io>=0.1.16",
"noisepy-seis-io>=0.2.0",
"scipy==1.12.0"
]

Expand Down
25 changes: 20 additions & 5 deletions src/noisepy/seis/correlate.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,20 +502,35 @@ def _filter_channel_data(
logging.warning(f"No data available with sampling rate >= {sampling_rate}")
return []
if single_freq:
closest_freq = min(
frequencies,
key=lambda f: max(f - sampling_rate, 0),
)
logger.info(f"Picked {closest_freq} as the closest sampling rate to {sampling_rate}. ")
closest_freq = _get_closest_freq(frequencies, sampling_rate)
logger.info(f"Picked {closest_freq} as the closest sampling_rate to {sampling_rate}. ")
filtered_tuples = list(filter(lambda tup: tup[1].sampling_rate == closest_freq, tuples))
logger.info(f"Filtered to {len(filtered_tuples)}/{len(tuples)} channels with sampling rate == {closest_freq}")
else:
filtered_tuples = list(filter(lambda tup: tup[1].sampling_rate >= sampling_rate, tuples))
# for each station, pick the closest >= to sampling_rate
tmp = list(
map(
lambda s: [t for t in filtered_tuples if t[0].station == s],
set([t[0].station for t in filtered_tuples]),
)
)
filtered_tuples = sum(list(map(lambda t: _filt_single_station(t, sampling_rate), tmp)), [])
logger.info(f"Filtered to {len(filtered_tuples)}/{len(tuples)} channels with sampling rate >= {sampling_rate}")

return filtered_tuples


def _get_closest_freq(frequencies, sampling_rate: int):
return min(frequencies, key=lambda f: max(f - sampling_rate, 0))


def _filt_single_station(tuples: List[Tuple[Channel, ChannelData]], sampling_rate: int):
    """
    For the (channel, data) pairs of a single station, keep only the pairs
    whose sampling rate is the closest available rate to ``sampling_rate``
    (per ``_get_closest_freq``); all other rates are dropped.
    """
    available_rates = {data.sampling_rate for _, data in tuples}
    best_rate = _get_closest_freq(available_rates, sampling_rate)
    return [pair for pair in tuples if pair[1].sampling_rate == best_rate]


def check_memory(params: ConfigParameters, nsta: int) -> int:
# maximum memory allowed
# TODO: Is this needed? Should it be configurable?
Expand Down
33 changes: 22 additions & 11 deletions tests/test_cross_correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,32 @@
def test_read_channels():
CLOSEST_FREQ = 60
sampling_rate = 40
freqs = [10, 39, CLOSEST_FREQ, 100]
ch_data = []
for f in freqs:
ch_data1 = []
N = 5
for f in [10, 39, CLOSEST_FREQ, 100]:
cd = ChannelData.empty()
cd.sampling_rate = f
ch_data.append(cd)
N = 5
tuples = [(Channel("foo", Station("CI", "bar")), cd) for cd in ch_data] * N
ch_data1.append(cd)

cd = ChannelData.empty()
cd.sampling_rate = 100
ch_data2 = [cd]

tuples = [(Channel("foo", Station("CI", "FOO")), cd) for cd in ch_data1] * N
tuples += [(Channel("bar", Station("CI", "BAR")), cd) for cd in ch_data2] * N

# we should pick the closest frequency that is >= to the target freq, 60 in this case
# we should pick the closest frequency that is >= to the target sampling_rate
# 60 Hz in this case, for both stations
# CI.FOO returns 60 Hz
# CI.BAR returns nothing
filtered = _filter_channel_data(tuples, sampling_rate, single_freq=True)
assert N == len(filtered)
assert [t[1].sampling_rate for t in filtered] == [CLOSEST_FREQ] * N

# we should get all data at >= 40 Hz (60 and 100)
# we should pick the closest frequency that is >= to the target sampling_rate
# but might be different for each station
# CI.FOO returns 60 Hz
# CI.BAR returns 100 Hz
filtered = _filter_channel_data(tuples, sampling_rate, single_freq=False)
assert N * 2 == len(filtered)
assert all([t[1].sampling_rate >= sampling_rate for t in filtered])
Expand Down Expand Up @@ -79,11 +90,11 @@ def get_inventory(self, timespan: DateTimeRange, station: Station) -> obspy.Inve
@pytest.mark.parametrize("rm_resp", [RmResp.NO, RmResp.INV]) # add tests for SPECTRUM, RESP and POLES_ZEROS
@pytest.mark.parametrize("cc_method", [CCMethod.XCORR, CCMethod.COHERENCY, CCMethod.DECONV])
@pytest.mark.parametrize("substack", [True, False])
@pytest.mark.parametrize("substack_len", [1, 2])
@pytest.mark.parametrize("substack_windows", [1, 2])
@pytest.mark.parametrize("inc_hours", [0, 24])
@pytest.mark.parametrize("dpath", ["./data/cc", "./data/acc"])
def test_cross_correlation(
rm_resp: RmResp, cc_method: CCMethod, substack: bool, substack_len: int, inc_hours: int, dpath: str
rm_resp: RmResp, cc_method: CCMethod, substack: bool, substack_windows: int, inc_hours: int, dpath: str
):
config = ConfigParameters()
config.sampling_rate = 1.0
Expand All @@ -92,7 +103,7 @@ def test_cross_correlation(
config.inc_hours = inc_hours
if substack:
config.substack = substack
config.substack_len = substack_len * config.cc_len
config.substack_windows = substack_windows
path = os.path.join(os.path.dirname(__file__), dpath)

raw_store = SCEDCS3DataStore(path, MockCatalogMock())
Expand Down
2 changes: 1 addition & 1 deletion tutorials/cloud/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ stationxml: false
step: 450.0
storage_options: {}
substack: false
substack_len: 1800
substack_windows: 1
time_norm: no
xcorr_only: true
2 changes: 1 addition & 1 deletion tutorials/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cc_method: xcorr
smooth_N: 10
smoothspect_N: 10
substack: true
substack_len: 3600
substack_windows: 1
maxlag: 200
inc_hours: 12
max_over_std: 10
Expand Down
34 changes: 17 additions & 17 deletions tutorials/get_started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"from noisepy.seis.io import plotting_modules\n",
"from noisepy.seis.io.asdfstore import ASDFRawDataStore, ASDFCCStore, ASDFStackStore\n",
"from noisepy.seis.io.datatypes import CCMethod, ConfigParameters, FreqNorm, RmResp, TimeNorm\n",
"from dateutil.parser import isoparse\n",
"from datetime import datetime, timezone\n",
"from datetimerange import DateTimeRange\n",
"import os\n",
"import shutil\n",
Expand Down Expand Up @@ -95,8 +95,8 @@
"source": [
"config = ConfigParameters() # default config parameters which can be customized\n",
"config.inc_hours = 12\n",
"config.sampling_rate= 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.cc_len= 3600 # (float) basic unit of data length for fft (sec)\n",
"config.sampling_rate = 20 # (int) Sampling rate in Hz of desired processing (it can be different than the data sampling rate)\n",
"config.cc_len = 3600 # (float) basic unit of data length for fft (sec)\n",
" # criteria for data selection\n",
"config.ncomp = 3 # 1 or 3 component data (needed to decide whether do rotation)\n",
"\n",
Expand All @@ -111,32 +111,32 @@
"config.lomax = -115 # max longitude\n",
"\n",
"# pre-processing parameters\n",
"config.step= 1800.0 # (float) overlapping between each cc_len (sec)\n",
"config.stationxml= False # station.XML file used to remove instrument response for SAC/miniseed data\n",
"config.rm_resp= RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"config.step = 1800.0 # (float) overlapping between each cc_len (sec)\n",
"config.stationxml = False # station.XML file used to remove instrument response for SAC/miniseed data\n",
"config.rm_resp = RmResp.INV # select 'no' to not remove response and use 'inv' if you use the stationXML,'spectrum',\n",
"config.freqmin = 0.05\n",
"config.freqmax = 2.0\n",
"config.max_over_std = 10 # threshold to remove window of bad signals: set it to 10*9 if prefer not to remove them\n",
"\n",
"# TEMPORAL and SPECTRAL NORMALISATION\n",
"config.freq_norm= FreqNorm.RMA # choose between \"rma\" for a soft whitenning or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.freq_norm = FreqNorm.RMA # choose between \"rma\" for a soft whitenning or \"no\" for no whitening. Pure whitening is not implemented correctly at this point.\n",
"config.smoothspect_N = 10 # moving window length to smooth spectrum amplitude (points)\n",
" # here, choose smoothspect_N for the case of a strict whitening (e.g., phase_only)\n",
"\n",
"config.time_norm = TimeNorm.NO # 'no' for no normalization, or 'rma', 'one_bit' for normalization in time domain,\n",
" # TODO: change time_norm option from \"no\" to \"None\"\n",
"config.smooth_N= 10 # moving window length for time domain normalization if selected (points)\n",
"config.smooth_N = 10 # moving window length for time domain normalization if selected (points)\n",
"\n",
"config.cc_method= CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
"config.cc_method = CCMethod.XCORR # 'xcorr' for pure cross correlation OR 'deconv' for deconvolution;\n",
" # FOR \"COHERENCY\" PLEASE set freq_norm to \"rma\", time_norm to \"no\" and cc_method to \"xcorr\"\n",
"\n",
"# OUTPUTS:\n",
"config.substack = True # True = smaller stacks within the time chunk. False: it will stack over inc_hours\n",
"config.substack_len = config.cc_len # how long to stack over (for monitoring purpose): need to be multiples of cc_len\n",
" # if substack=True, substack_len=2*cc_len, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_len=cc_len means that you keep ALL of the correlations\n",
"config.substack_windows = 1 # how long to stack over (for monitoring purpose)\n",
" # if substack=True, substack_windows=2, then you pre-stack every 2 correlation windows.\n",
" # for instance: substack=True, substack_windows=1 means that you keep ALL of the correlations\n",
"\n",
"config.maxlag= 200 # lags of cross-correlation to save (sec)\n",
"config.maxlag = 200 # lags of cross-correlation to save (sec)\n",
"config.substack = True"
]
},
Expand Down Expand Up @@ -173,9 +173,9 @@
"source": [
"config.networks = [\"*\"]\n",
"config.stations = [\"A*\"]\n",
"config.channels = [\"BHE\",\"BHN\",\"BHZ\"]\n",
"config.start_date = isoparse(\"2019-02-01T00:00:00Z\")\n",
"config.end_date = isoparse(\"2019-02-02T00:00:00Z\")\n",
"config.channels = [\"BH?\"]\n",
"config.start_date = datetime(2019, 2, 1, tzinfo=timezone.utc)\n",
"config.end_date = datetime(2019, 2, 2, tzinfo=timezone.utc)\n",
"timerange = DateTimeRange(config.start_date, config.end_date)\n",
"\n",
"# Download data locally. Enters raw data path, channel types, stations, config, and fdsn server.\n",
Expand Down Expand Up @@ -218,7 +218,7 @@
"file = os.path.join(raw_data_path, \"2019_02_01_00_00_00T2019_02_01_12_00_00.h5\")\n",
"raw_store = ASDFRawDataStore(raw_data_path) # Store for reading raw data\n",
"timespans = raw_store.get_timespans()\n",
"plotting_modules.plot_waveform(raw_store, timespans[0], 'CI','ADO',0.01,0.4) # this function takes for input: filename, network, station, freqmin, freqmax for a bandpass filter"
"plotting_modules.plot_waveform(raw_store, timespans[0], 'CI','ADO', 0.01, 0.4) # this function takes for input: filename, network, station, freqmin, freqmax for a bandpass filter"
]
},
{
Expand Down
Loading

0 comments on commit b8297fc

Please sign in to comment.