Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Feature/gridded data #133

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
fb6e47a
first changes to add an acoustic chunk so data can be gridded
cparcerisas Sep 3, 2024
b886d1a
added the option of having gridded data. Fixes #127 and #79
cparcerisas Sep 3, 2024
05221d9
changed the millidecades tests so they are also gridded
cparcerisas Sep 3, 2024
80f97e6
changed the test for the dataset to not be gridded as otherwise calib…
cparcerisas Sep 3, 2024
8452167
changed function _octaves_levels in acoustic_file to call octaves lev…
cparcerisas Sep 3, 2024
52e51ce
first approach to save daily files in evolution_frequency and apply_m…
cparcerisas Sep 4, 2024
4802f20
removed yield function and only made save_daily an option when a path…
cparcerisas Sep 4, 2024
03422f6
first changes to add an acoustic chunk so data can be gridded
cparcerisas Sep 3, 2024
b2b7a96
added the option of having gridded data. Fixes #127 and #79
cparcerisas Sep 3, 2024
800ce25
changed the millidecades tests so they are also gridded
cparcerisas Sep 3, 2024
9f11118
changed the test for the dataset to not be gridded as otherwise calib…
cparcerisas Sep 3, 2024
e3e3341
changed function _octaves_levels in acoustic_file to call octaves lev…
cparcerisas Sep 3, 2024
c7296a7
Merge branch 'feature/gridded_data' of https://github.com/lifewatch/p…
cparcerisas Sep 13, 2024
ae71739
Merge branch 'main' into feature/gridded_data
cparcerisas Sep 13, 2024
f542d28
Merge pull request #139 from lifewatch/feature/save_daily_files
cparcerisas Sep 13, 2024
7db44f8
fixed error when plotting a spectrogram from a chunk
cparcerisas Nov 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
629 changes: 629 additions & 0 deletions pypam/acoustic_chunk.py

Large diffs are not rendered by default.

195 changes: 78 additions & 117 deletions pypam/acoustic_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
__status__ = "Development"

import datetime
import operator
import os
import pathlib
import zipfile

Expand All @@ -16,12 +14,11 @@
import seaborn as sns
import soundfile as sf
import xarray
from tqdm.auto import tqdm

from pypam import plots
from pypam import signal as sig
from pypam import utils
from pypam import units as output_units
from pypam import acoustic_chunk

pd.plotting.register_matplotlib_converters()
plt.rcParams.update({'pcolor.shading': 'auto'})
Expand Down Expand Up @@ -52,32 +49,31 @@ class AcuFile:
Set to True to subtract the dc noise (root mean squared value
"""

def __init__(self, sfile, hydrophone, p_ref, timezone='UTC', channel=0, calibration=None, dc_subtract=False):
def __init__(self, sfile, hydrophone, p_ref, sfile_next=None, gridded=True, timezone='UTC', channel=0,
calibration=None, dc_subtract=False, chunk_id_start=0):
# Save hydrophone model
self.hydrophone = hydrophone

# Get the date from the name
if type(sfile) == str:
file_name = os.path.split(sfile)[-1]
elif issubclass(sfile.__class__, pathlib.Path):
file_name = sfile.name
elif issubclass(sfile.__class__, zipfile.ZipExtFile):
file_name = sfile.name
else:
raise Exception('The filename has to be either a Path object or a string')

try:
self.date = hydrophone.get_name_datetime(file_name)
except ValueError:
self.date = datetime.datetime.now()
print('Filename %s does not match the %s file structure. Setting time to now...' %
(file_name, self.hydrophone.name))
file_name = utils.parse_file_name(sfile)
self.date = utils.get_file_datetime(file_name, hydrophone)

# Signal
self.file_path = sfile
self.file = sf.SoundFile(self.file_path, 'r')
self.fs = self.file.samplerate

# Check if next file is continuous
self.date_end = self.date + datetime.timedelta(seconds=self.total_time())
self.file_path_next = None
if sfile_next is not None:
next_file_name = utils.parse_file_name(sfile_next)
next_date = utils.get_file_datetime(next_file_name, self.hydrophone)

if (next_date - self.date_end) < datetime.timedelta(seconds=1):
self.file_path_next = sfile_next
self.file_next = sf.SoundFile(self.file_path_next, 'r')

# Reference pressure in upa
self.p_ref = p_ref

Expand All @@ -94,19 +90,29 @@ def __init__(self, sfile, hydrophone, p_ref, timezone='UTC', channel=0, calibrat
self.wav = None
self.time = None

# Set a starting frame for the file
if calibration is None:
self._start_frame = 0
elif calibration < 0:
self._start_frame = self.hydrophone.calibrate(self.file_path)
if self._start_frame is None:
self._start_frame = 0
else:
self._start_frame = int(calibration * self.fs)
if gridded and calibration is not None:
raise AttributeError('The options gridded and calibration are not compatible')
self.calibration = calibration
self.gridded = gridded

if gridded:
self._start_frame = int((pd.to_datetime(self.date).ceil('min') - self.date).total_seconds() * self.fs)
else:
# Set a starting frame for the file
if calibration is None:
self._start_frame = 0
elif calibration < 0:
self._start_frame = self.hydrophone.calibrate(self.file_path)
if self._start_frame is None:
self._start_frame = 0
else:
self._start_frame = int(calibration * self.fs)

self.dc_subtract = dc_subtract

# Start to count chunk id's
self.chunk_id_start = chunk_id_start

def __getattr__(self, name):
"""
Specific methods to make it easier to access attributes
Expand Down Expand Up @@ -142,26 +148,41 @@ def _bins(self, binsize=None, bin_overlap=0):
else:
blocksize = self.samples(binsize)
noverlap = int(bin_overlap * blocksize)
n_blocks = self._n_blocks(blocksize, noverlap=noverlap)
if self.file_path_next is not None:
is_there_next = True
else:
is_there_next = False
n_blocks = self._n_blocks(blocksize, noverlap=noverlap, add_last=is_there_next)
time_array, _, _ = self._time_array(binsize, bin_overlap=bin_overlap)
for i, block in tqdm(enumerate(sf.blocks(self.file_path, blocksize=blocksize, start=self._start_frame,
overlap=bin_overlap, always_2d=True, fill_value=0.0)),
total=n_blocks, leave=False, position=0):
# Select the desired channel
block = block[:, self.channel]
chunk_id = self.chunk_id_start
file_end = self.file_path
for i in np.arange(n_blocks):
time_bin = time_array[i]
# Read the signal and prepare it for analysis
signal_upa = self.wav2upa(wav=block)
signal = sig.Signal(signal=signal_upa, fs=self.fs, channel=self.channel)
if self.dc_subtract:
signal.remove_dc()
start_sample = i * blocksize + self._start_frame
end_sample = start_sample + len(signal_upa)
yield i, time_bin, signal, start_sample, end_sample
start_sample = i * (blocksize - noverlap) + self._start_frame
if i == n_blocks - 1:
if is_there_next:
end_sample = blocksize - (self.file.frames - start_sample)
file_end = self.file_path_next
else:
end_sample = self.file.frames
else:
end_sample = start_sample + blocksize
chunk = acoustic_chunk.AcuChunk(sfile_start=self.file_path, sfile_end=file_end,
start_frame=start_sample, end_frame=end_sample, hydrophone=self.hydrophone,
p_ref=self.p_ref, chunk_id=chunk_id, chunk_file_id=i, time_bin=time_bin)

chunk_id += 1

yield i, chunk

self.file.seek(0)

def _n_blocks(self, blocksize, noverlap):
return int(np.floor(self.file.frames - self._start_frame) / (blocksize - noverlap))
def _n_blocks(self, blocksize, noverlap, add_last):
if add_last:
n_blocks = int(np.ceil((self.file.frames - self._start_frame) / (blocksize - noverlap)))
else:
n_blocks = int(np.floor((self.file.frames - self._start_frame) / (blocksize - noverlap)))
return n_blocks

def samples(self, bintime):
"""
Expand Down Expand Up @@ -319,7 +340,7 @@ def _time_array(self, binsize=None, bin_overlap=0):
blocks_samples = np.arange(start=self._start_frame, stop=self.file.frames - 1, step=blocksize)
end_samples = blocks_samples + blocksize
incr = pd.to_timedelta(blocks_samples / self.fs, unit='seconds')
self.time = self.date + datetime.timedelta(seconds=self._start_frame / self.fs) + incr
self.time = self.date + incr
if self.timezone != 'UTC':
self.time = pd.to_datetime(self.time).tz_localize(self.timezone).tz_convert('UTC').tz_convert(None)
return self.time, blocks_samples.astype(int), end_samples.astype(int)
Expand Down Expand Up @@ -433,6 +454,8 @@ def _get_metadata_attrs(self):
d = d.__dict__[sub_k]
if isinstance(d, pathlib.Path):
d = str(d)
if isinstance(d, bool):
d = int(d)
if d is None:
d = 0
metadata_attrs[k.replace('.', '_')] = d
Expand Down Expand Up @@ -478,44 +501,13 @@ def _apply_multiple(self, method_list, binsize=None, band_list=None, bin_overlap
sorted_bands = [band] + sorted_bands
else:
sorted_bands = sorted_bands + [band]
log = True
if 'db' in kwargs.keys():
if not kwargs['db']:
log = False
# Define an empty dataset
ds = xarray.Dataset()
for i, time_bin, signal, start_sample, end_sample in self._bins(binsize, bin_overlap=bin_overlap):
ds_bands = xarray.Dataset()
for j, band in enumerate(sorted_bands):
signal.set_band(band, downsample=downsample)
methods_output = xarray.Dataset()
for method_name in method_list:
f = operator.methodcaller(method_name, **kwargs)
try:
output = f(signal)
except Exception as e:
print('There was an error in band %s, feature %s. Setting to None. '
'Error: %s' % (band, method_name, e))
output = None

units_attrs = output_units.get_units_attrs(method_name=method_name, log=log,
p_ref=self.p_ref, **kwargs)
methods_output[method_name] = xarray.DataArray([[output]], coords={'id': [i],
'datetime': ('id', [time_bin]),
'start_sample': ('id',
[start_sample]),
'end_sample': (
'id', [end_sample]),
'band': [j],
'low_freq': ('band', [band[0]]),
'high_freq': (
'band', [band[1]])},
dims=['id', 'band'],
attrs=units_attrs)
if j == 0:
ds_bands = methods_output
else:
ds_bands = xarray.concat((ds_bands, methods_output), 'band')
for i, chunk in self._bins(binsize, bin_overlap=bin_overlap):
ds_bands = chunk.apply_multiple(method_list=method_list, band_list=None, **kwargs)
if i == 0:
ds = ds_bands
else:
Expand Down Expand Up @@ -686,15 +678,8 @@ def _octaves_levels(self, fraction=1, binsize=None, bin_overlap=0, db=True, band
# Create an empty dataset
da = xarray.DataArray()
units_attrs = output_units.get_units_attrs(method_name='octave_levels', p_ref=self.p_ref, log=db)
for i, time_bin, signal, start_sample, end_sample in self._bins(binsize, bin_overlap=bin_overlap):
signal.set_band(band, downsample=downsample)
fbands, levels = signal.octave_levels(db, fraction)
da_levels = xarray.DataArray(data=[levels],
coords={'id': [i], 'start_sample': ('id', [start_sample]),
'end_sample': ('id', [end_sample]), 'datetime': ('id', [time_bin]),
'frequency': fbands},
dims=['id', 'frequency']
)
for i, chunk in self._bins(binsize, bin_overlap=bin_overlap):
da_levels = chunk.octaves_levels(fraction=fraction, db=db, band=band)
if i == 0:
da = da_levels
else:
Expand Down Expand Up @@ -784,15 +769,8 @@ def spectrogram(self, binsize=None, bin_overlap=0, nfft=512, fft_overlap=0.5,
band = [None, self.fs / 2]

da = xarray.DataArray()
for i, time_bin, signal, start_sample, end_sample in self._bins(binsize, bin_overlap=bin_overlap):
signal.set_band(band, downsample=downsample)
freq, t, sxx = signal.spectrogram(nfft=nfft, overlap=fft_overlap, scaling=scaling, db=db)
da_sxx = xarray.DataArray([sxx], coords={'id': [i],
'start_sample': ('id', [start_sample]),
'end_sample': ('id', [end_sample]),
'datetime': ('id', [time_bin]),
'frequency': freq, 'time': t},
dims=['id', 'frequency', 'time'])
for i, chunk in self._bins(binsize, bin_overlap=bin_overlap):
da_sxx = chunk.spectrogram(nfft=nfft, fft_overlap=fft_overlap, scaling=scaling, db=db)
if i == 0:
da = da_sxx
else:
Expand Down Expand Up @@ -838,26 +816,9 @@ def _spectrum(self, scaling='density', binsize=None, bin_overlap=0, nfft=512, ff

spectrum_str = 'band_' + scaling
ds = xarray.DataArray()
for i, time_bin, signal, start_sample, end_sample in self._bins(binsize, bin_overlap=bin_overlap):
signal.set_band(band, downsample=downsample)
fbands, spectra, percentiles_val = signal.spectrum(scaling=scaling, nfft=nfft, db=db,
percentiles=percentiles, overlap=fft_overlap)

spectra_da = xarray.DataArray([spectra],
coords={'id': [i],
'datetime': ('id', [time_bin]),
'frequency': fbands,
'start_sample': ('id',
[start_sample]),
'end_sample': ('id', [end_sample]),
},
dims=['id', 'frequency'])
percentiles_da = xarray.DataArray([percentiles_val],
coords={'id': [i], 'datetime': ('id', [time_bin]),
'percentiles': percentiles},
dims=['id', 'percentiles'])

ds_bin = xarray.Dataset({spectrum_str: spectra_da, 'value_percentiles': percentiles_da})
for i, chunk in self._bins(binsize, bin_overlap=bin_overlap):
ds_bin = chunk.spectrum(scaling=scaling, nfft=nfft, fft_overlap=fft_overlap, db=db,
percentiles=percentiles, band=band)
if i == 0:
ds = ds_bin
else:
Expand Down
Loading
Loading