From 8f752d00a024ed0c80accfb3f5fb7fffe92084e2 Mon Sep 17 00:00:00 2001
From: kienle <kienle@passcal.nmt.edu>
Date: Thu, 16 Jan 2025 12:33:07 -0700
Subject: [PATCH] Revert "Revert "Read all data points in SOH channels""

This reverts commit e5ce8e4ba5d68aaa9a1248e8f73914362a8c5c65.
---
 .../model/mseed_data/mseed_reader.py          |  48 ++++++--
 .../model/mseed_data/record_reader.py         | 114 ++++++++++++++++--
 tests/model/mseed_data/test_record_reader.py  |  78 ++++++++++++
 3 files changed, 220 insertions(+), 20 deletions(-)
 create mode 100644 tests/model/mseed_data/test_record_reader.py

diff --git a/sohstationviewer/model/mseed_data/mseed_reader.py b/sohstationviewer/model/mseed_data/mseed_reader.py
index ac66513a4..45a3243ea 100644
--- a/sohstationviewer/model/mseed_data/mseed_reader.py
+++ b/sohstationviewer/model/mseed_data/mseed_reader.py
@@ -1,6 +1,8 @@
 from numbers import Real
 from typing import BinaryIO, Optional, Dict, Union, List, Tuple
 from pathlib import Path
+
+import numpy
 from obspy import UTCDateTime
 
 from sohstationviewer.model.mseed_data.record_reader import RecordReader
@@ -30,6 +32,9 @@ def move_to_next_record(file, current_record_start: int,
     file.seek(record_size, 1)
 
 
+LogData = Dict[str, Union[List[str], Dict[str, List[str]]]]
+
+
 class MSeedReader:
     def __init__(self, file_path: Path,
                  read_start: float = UTCDateTime(0).timestamp,
@@ -42,8 +47,7 @@ class MSeedReader:
                  soh_data: Dict = {},
                  mass_pos_data: Dict = {},
                  waveform_data: Dict = {},
-                 log_data: Dict[str, Union[List[str],
-                                           Dict[str, List[str]]]] = {},
+                 log_data: LogData = {},
                  gap_minimum: Optional[float] = None
                  ) -> None:
         """
@@ -147,8 +151,8 @@ class MSeedReader:
 
     def append_data(self, data_dict: dict,
                     record: RecordReader,
-                    data_points: Tuple[Real, Real],
-                    times: Tuple[Real, Real]) -> None:
+                    data_points: List[Real],
+                    times: List[Real]) -> None:
         """
         Append some data points to the given data_dict
 
@@ -167,8 +171,8 @@ class MSeedReader:
         self.add_chan_data(station, meta, data_points, times)
 
     def _add_new_trace(self, channel: Dict, metadata: RecordMetadata,
-                       data_points: Tuple[Real, Real],
-                       times: Tuple[Real, Real]) -> None:
+                       data_points: List[Real],
+                       times: List[Real]) -> None:
         """
         Start a new trace to channel['tracesInfo'] with data_point as
             the first data value and metadata's start_time as first time value
@@ -208,14 +212,14 @@ class MSeedReader:
             channel['tracesInfo'][-1]['data'] = channel['tracesInfo'][-1][
                                                     'data'][:-1]
             channel['tracesInfo'][-1]['times'] = channel['tracesInfo'][-1][
-                                                    'times'][:-1]
+                                                     'times'][:-1]
 
         channel['tracesInfo'][-1]['data'].extend(data_points)
         channel['tracesInfo'][-1]['times'].extend(times)
 
     def add_chan_data(self, station: dict, metadata: RecordMetadata,
-                      data_points: Tuple[Real, Real],
-                      times: Tuple[Real, Real]) -> None:
+                      data_points: List[Real],
+                      times: List[Real]) -> None:
         """
         Add new channel to the passed station and append data_point to the
         channel if there's no gap/overlap or start a new trace of data
@@ -315,10 +319,28 @@ class MSeedReader:
                     continue
                 else:
                     break
-            first_data_point = record.get_two_data_points()
-            times = (record.record_metadata.start_time,
-                     record.record_metadata.end_time)
-            self.append_data(data_dict, record, first_data_point, times)
+            if data_dict is self.waveform_data:
+                data_points = list(record.get_two_data_points())
+                times = [record.record_metadata.start_time,
+                         record.record_metadata.end_time]
+            else:
+                try:
+                    data_points = record.get_data_points()
+                    sample_interval = 1 / record.record_metadata.sample_rate
+                    times = numpy.arange(
+                            record.record_metadata.start_time,
+                            record.record_metadata.end_time,
+                            sample_interval).tolist()
+                    # The previous calculation will always miss one time point
+                    # at the end. We can adjust the stop argument to capture
+                    # that point in one calculation, but the code for that is a
+                    # lot harder to parse. It is a lot more straightforward to
+                    # just add the final time point at the end.
+                    times.append(times[-1] + sample_interval)  # noqa
+                except ZeroDivisionError:
+                    data_points = None
+                    times = None
+            self.append_data(data_dict, record, data_points, times)
             self.append_log(record)
 
             move_to_next_record(self.file, current_record_start, record)
diff --git a/sohstationviewer/model/mseed_data/record_reader.py b/sohstationviewer/model/mseed_data/record_reader.py
index 9105bc7dc..654f0a9dc 100644
--- a/sohstationviewer/model/mseed_data/record_reader.py
+++ b/sohstationviewer/model/mseed_data/record_reader.py
@@ -1,7 +1,11 @@
+import sys
+import warnings
 from numbers import Real
 from typing import BinaryIO, Optional, List, Tuple
 
+import numpy
 from obspy import UTCDateTime
+from obspy.io.mseed.util import _unpack_steim_1, _unpack_steim_2
 
 from sohstationviewer.model.mseed_data.decode_mseed import (
     decode_ieee_float, decode_ieee_double, decode_steim, decode_int16,
@@ -40,6 +44,37 @@ class RecordReader:
         self.ascii_text: str = ''
         self.read_header()
 
+    def get_libmseed_swapflag(self):
+        """
+        Get the appropriate byte-order swap flag to pass to obspy's wrappers
+        of libmseed's functions. A swap flag of 0 means big-endian, while a
+        swap flag of 1 means little-endian.
+
+        In C, the language libmseed was written in, the swap flag is pretty
+        straightforward and depends entirely on the endianness of the record
+        being read. However, the abstractions Python builds on top of C make it
+        so that we have to take into consider the running's system endianness
+        when determining the appropriate swap flag. In particular, a
+        little-endian system means we have to invert the endianness of the
+        current record to get the final swap flag.
+
+        :return: 0 or 1, depending on the data set and the running system.
+        """
+        if sys.byteorder == 'little':
+            if self.data_unpacker.byte_order_char in ['!', '>']:
+                swapflag = 1
+            else:
+                swapflag = 0
+        elif sys.byteorder == 'big':
+            warnings.warn('SOHViewer has not been tested on big-endian '
+                          'hardware. Please be advised that the plots shown '
+                          'might not reflect the actual data.')
+            if self.data_unpacker.byte_order_char in ['!', '>', '@', '=']:
+                swapflag = 0
+            else:
+                swapflag = 1
+        return swapflag
+
     def read_header(self) -> None:
         """
         Read the header of the current data record. The header includes the
@@ -242,18 +277,21 @@ class RecordReader:
         except UnicodeDecodeError:
             pass
 
-    def get_two_data_points(self) -> Optional[Tuple[Real, Real]]:
+    def get_encoding_format(self):
         """
-        Get two data points from the current data record.
-        :return: two data point from the current data record; the chosen points
-            depend on the encoding type stored in blockette 1000.
+        Get the encoding format of the record.
+        :return: the encoding format of the record
         """
-        record_start = self.file.tell()
-
         encoding_format = self.blockette_1000.encoding_format
         encoding_format = self.header_unpacker.unpack('b', encoding_format)[0]
         encoding_format = EncodingFormat(encoding_format)
+        return encoding_format
 
+    def get_raw_record_data(self):
+        """
+        Get the data stored in the record as a string of raw bytes.
+        :return: the data bytes stored in the record
+        """
         data_start = self.header_unpacker.unpack(
             'H', self.fixed_header.data_offset
         )[0]
@@ -261,8 +299,20 @@ class RecordReader:
             'B', self.blockette_1000.record_length
         )[0]
         record_size = 2 ** record_size_exponent
-
         record_data = self.file.read(record_size)[data_start:]
+        return record_data
+
+    def get_two_data_points(self) -> Optional[Tuple[Real, Real]]:
+        """
+        Get two data points from the current data record.
+        :return: two data points from the current data record; the chosen
+            points depend on the encoding type stored in blockette 1000.
+        """
+        record_start = self.file.tell()
+
+        encoding_format = self.get_encoding_format()
+
+        record_data = self.get_raw_record_data()
         if encoding_format == EncodingFormat.ASCII:
             self.decode_ascii_data(record_data)
             data_points = None
@@ -291,3 +341,53 @@ class RecordReader:
         # if needed.
         self.file.seek(record_start)
         return data_points
+
+    def get_data_points(self) -> numpy.ndarray:
+        """
+        Get all the data points in the current record by decoding the raw data.
+        :return: a numpy array containing all the data points stored in the
+            current record
+        """
+        try:
+            record_start = self.file.tell()
+
+            encoding_format = self.get_encoding_format()
+
+            record_data = self.get_raw_record_data()
+            record_sample_count = self.header_unpacker.unpack(
+                'H', self.fixed_header.sample_count
+            )[0]
+
+            if encoding_format == EncodingFormat.ASCII:
+                self.decode_ascii_data(record_data)
+                data_points = None
+            elif encoding_format in [EncodingFormat.STEIM_1,
+                                     EncodingFormat.STEIM_2]:
+                if encoding_format == EncodingFormat.STEIM_1:
+                    unpack_steim = _unpack_steim_1
+                else:
+                    unpack_steim = _unpack_steim_2
+                compressed_dtype = numpy.dtype(numpy.uint8)
+                data = numpy.frombuffer(record_data, compressed_dtype)
+                swapflag = self.get_libmseed_swapflag()
+                data_points = unpack_steim(data, record_sample_count, swapflag,
+                                           0)
+            else:
+                # We currently support only a subset of the data types
+                # supported by obspy.
+                encoding_to_dtype = {
+                    EncodingFormat.INT_16_BIT: 'i2',
+                    EncodingFormat.INT_32_BIT: 'i4',
+                    EncodingFormat.IEEE_FLOAT_32_BIT: 'f4',
+                    EncodingFormat.IEEE_FLOAT_64_BIT: 'f8',
+                }
+                dtype = numpy.dtype(self.data_unpacker.byte_order_char +
+                                    encoding_to_dtype[encoding_format])
+
+                data_points = numpy.frombuffer(record_data, dtype,
+                                               record_sample_count)
+            return data_points
+        finally:
+            # Ensure that we seek back to the start of the record. This needs
+            # to happen so that we can call this method again if needed.
+            self.file.seek(record_start)
diff --git a/tests/model/mseed_data/test_record_reader.py b/tests/model/mseed_data/test_record_reader.py
new file mode 100644
index 000000000..90c780a0a
--- /dev/null
+++ b/tests/model/mseed_data/test_record_reader.py
@@ -0,0 +1,78 @@
+import tempfile
+from unittest import TestCase
+
+import numpy
+import obspy
+from numpy.testing import assert_array_equal
+from obspy import Trace, Stream
+
+from sohstationviewer.model.mseed_data.record_reader import RecordReader
+
+
+class TestRecordReader(TestCase):
+    temp_data_file = tempfile.NamedTemporaryFile(delete=False)
+    temp_ascii_file = tempfile.NamedTemporaryFile(delete=False)
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        data_file = ('tests/test_data/Q330-sample/day_vols_AX08/'
+                     'AX08.XA..VKI.2021.186')
+        ascii_file = ('tests/test_data/Q330-sample/day_vols_AX08/'
+                      'AX08.XA..LOG.2021.186')
+        # Rewrite the data to make testing easier.
+        data_stream: Stream = obspy.read(data_file)
+        data_trace: Trace = data_stream[0]
+        # We only want enough data that the record has exactly one hour of
+        # data for easier testing. The number of points hardcoded below is
+        # calculated based on the sample rate of the data.
+        data_trace.data = numpy.arange(0, 361, dtype=data_trace.data.dtype)
+        data_trace.stats['starttime'] -= data_trace.stats['starttime']
+        data_trace.write(cls.temp_data_file.name, format='MSEED')
+
+        ascii_stream: Stream = obspy.read(ascii_file)
+        ascii_trace: Trace = ascii_stream[0]
+        ascii_trace.data = numpy.array([b'\n'] * len(ascii_trace))
+        ascii_trace.stats['starttime'] -= ascii_trace.stats['starttime']
+        ascii_trace.write(cls.temp_ascii_file.name, format='MSEED')
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        cls.temp_data_file.close()
+        cls.temp_ascii_file.close()
+
+    def test_get_two_data_points_data_record(self):
+        with open(self.temp_data_file.name, 'rb') as data_file:
+            expected_data_point = (0, 360)
+
+            record = RecordReader(data_file)
+            actual_data_points = record.get_two_data_points()
+
+            self.assertEqual(expected_data_point, actual_data_points)
+            self.assertEqual('', record.ascii_text)
+
+    def test_get_two_data_points_ascii_record(self):
+        with open(self.temp_ascii_file.name, 'rb') as data_file:
+
+            record = RecordReader(data_file)
+            actual_data_points = record.get_two_data_points()
+
+            self.assertIsNone(actual_data_points)
+            self.assertEqual('\n' * 3950, record.ascii_text)
+
+    def test_get_data_points_data_record(self):
+        with open(self.temp_data_file.name, 'rb') as data_file:
+            expected_data_point = numpy.arange(0, 361, dtype=numpy.int32)
+
+            record = RecordReader(data_file)
+            actual_data_points = record.get_data_points()
+
+            assert_array_equal(expected_data_point, actual_data_points)
+            self.assertEqual('', record.ascii_text)
+
+    def test_get_data_points_ascii_record(self):
+        with open(self.temp_ascii_file.name, 'rb') as data_file:
+            record = RecordReader(data_file)
+            actual_data_points = record.get_data_points()
+
+            self.assertIsNone(actual_data_points)
+            self.assertEqual('\n' * 3950, record.ascii_text)
-- 
GitLab