diff --git a/lemi2seed/lemi_metadata.py b/lemi2seed/lemi_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f154ca9b919fa33b11a318b0343beca3d37b1c --- /dev/null +++ b/lemi2seed/lemi_metadata.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +""" +Routines for parsing LEMI field spreadsheets(i.e. install and demobilization +sheets) and validating metadata; +Templates for the spreadsheets are provided to the PIs and their formatting is +hard coded. +The metadata for a complete MT dataset was structured into the following +categories: Survey, Station, Run, Electric, Magnetic, and Auxiliary + +Maeva Pourpoint - IRIS/PASSCAL +""" +from __future__ import annotations + +import logging +import logging.config +import re + +from dataclasses import dataclass, field +from obspy import UTCDateTime +from pathlib import Path +from typing import Optional, Set + +from lemi2seed.utils import check_email_formatting, is_valid_uri, str2list + +# Read logging config file +log_file_path = Path(__file__).parent.joinpath('logging.conf') +logging.config.fileConfig(log_file_path) +# Create logger +logger = logging.getLogger(__name__) + + +@dataclass +class BaseMetadata: + """ + This class handles the metadata attributes shared at all levels: survey; + station; run; electric; magnetic; auxiliary. + """ + # List of metadata fields that are required for archiving but "missing" + # because not provided by the user. + metadata_missing: Set[str] = field(default_factory=set) + # List of metadata fields that are invalid + metadata_invalid: Set[str] = field(default_factory=set) + + +@dataclass +class BaseSurvey(BaseMetadata): + """ + This class handles the metadata attributes shared at the survey and station + levels. 
+ """ + restricted_status: Optional[str] = field(default=None, + metadata={'xml_id': ['restricted_status'], + 'required': True, + 'gui': True}) + time_period_end: Optional[UTCDateTime] = field(default=None, + metadata={'xml_id': ['end_date'], + 'required': True, + 'gui': True}) + time_period_start: Optional[UTCDateTime] = field(default=None, + metadata={'xml_id': ['start_date'], + 'required': True, + 'gui': True}) + + def validate_time_period_end(self, metadata_input: UTCDateTime, data_input: UTCDateTime) -> bool: + if isinstance(metadata_input, UTCDateTime) and metadata_input >= data_input: + return True + else: + logger.error("End date of {} should be in UTC and greater than " + "the data acquisition end time." + .format(self.__class__.__name__.split('_')[0])) + return False + + def validate_time_period_start(self, metadata_input: UTCDateTime, data_input: UTCDateTime) -> bool: + if isinstance(metadata_input, UTCDateTime) and metadata_input <= data_input: + return True + else: + logger.error("Start date of {} should be in UTC and lower than " + "the data acquisition start time." 
+ .format(self.__class__.__name__.split('_')[0])) + return False + + +@dataclass +class Survey(BaseSurvey): + """This class handles all the metadata attributes at the survey level.""" + acquired_by_author: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.acquired_by.author'], + 'required': False, + 'gui': True}) + acquired_by_comments: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.acquired_by.comments'], + 'required': False, + 'gui': True}) + archive_network: Optional[str] = field(default=None, + metadata={'required': True, + 'gui': True}) + citation_dataset_doi: Optional[str] = field(default=None, + metadata={'xml_id': ['identifiers'], + 'required': True, + 'gui': True}) + comments: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.comments'], + 'required': False, + 'gui': True}) + country: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.country'], + 'required': False, + 'gui': True}) + geographic_name: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.geographic_name'], + 'required': True, + 'gui': True}) + name: Optional[str] = field(default=None, + metadata={'xml_id': ['description'], + 'required': True, + 'gui': True}) + project: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.project'], + 'required': True, + 'gui': True}) + project_lead_author: Optional[str] = field(default=None, + metadata={'xml_id': ['operators.contacts.names'], + 'required': False, + 'gui': True}) + project_lead_email: Optional[str] = field(default=None, + metadata={'xml_id': ['operators.contacts.emails'], + 'required': False, + 'gui': True}) + project_lead_organization: Optional[str] = field(default=None, + metadata={'xml_id': ['operators.agency'], + 'required': False, + 'gui': True}) + summary: Optional[str] = field(default=None, + metadata={'xml_id': ['comments', 'mt.survey.summary'], + 
'required': False, + 'gui': True}) + + @staticmethod + def validate_archive_network(metadata_input: str) -> bool: + try: + valid = re.match(r"^\w{2}$", metadata_input) + except TypeError: + logger.error("The network code should be a string.") + return False + else: + if valid is None: + logger.error("The network code should be two alphanumeric " + "character long.") + return bool(valid) + + @staticmethod + def validate_citation_dataset_doi(metadata_input: str) -> bool: + # Proper formatting for DOI? + try: + dois = str2list(metadata_input) + except AttributeError: + logger.error("The DOI number(s) should be a string.") + return False + else: + if not all([is_valid_uri(doi) for doi in dois]): + logger.error("Invalid DOI(s). The DOI number(s) provided by " + "the archive should be strings formatted as " + "follows: 'scheme: path'.") + return False + return True + + @staticmethod + def validate_project_lead_email(metadata_input: str) -> bool: + if metadata_input is not None: + try: + emails = str2list(metadata_input) + except AttributeError: + logger.error("The project lead email(s) should be a string.") + return False + return all([check_email_formatting(email) for email in emails]) + return True diff --git a/tests/test_lemi_metadata.py b/tests/test_lemi_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..11d9bff061aad958aa1a066046961c058d5a7594 --- /dev/null +++ b/tests/test_lemi_metadata.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Tests for `lemi_metadata` module.""" + +import logging +import unittest + +from obspy import UTCDateTime +from pathlib import Path + +from lemi2seed.lemi_data import LemiData +from lemi2seed.lemi_metadata import BaseSurvey, Survey, log_file_path + +OUTPUT_MSEED = Path(__file__).parent.joinpath('MSEED') +TEST_DIR = Path(__file__).parent.joinpath('test_data') +SCR_DIR = "lemi2seed.lemi_metadata" + +logging.config.fileConfig(log_file_path) +logger = logging.getLogger(SCR_DIR) + + 
class TestBaseSurvey(unittest.TestCase):
    """Test suite for BaseSurvey data class."""

    def setUp(self):
        """Set up test fixtures."""
        lemi_data = LemiData(TEST_DIR.joinpath('DATA0110'), OUTPUT_MSEED)
        lemi_data.prep_data()
        self.data_stats = lemi_data.stats

    def test_validate_time_period_start_not_utc(self):
        """Test basic functionality of validate_time_period_start."""
        data_input = self.data_stats['time_period_start']
        metadata_input = 2021.167
        bs = BaseSurvey()
        self.assertFalse(bs.validate_time_period_start(metadata_input, data_input))

    def test_validate_time_period_start_greater_than_acquisition_start(self):
        """Test basic functionality of validate_time_period_start."""
        data_input = self.data_stats['time_period_start']
        metadata_input = UTCDateTime('2020-10-01T00:00:00.000000Z')
        bs = BaseSurvey()
        self.assertFalse(bs.validate_time_period_start(metadata_input, data_input))

    def test_validate_time_period_start_valid(self):
        """Test basic functionality of validate_time_period_start."""
        data_input = self.data_stats['time_period_start']
        metadata_input = UTCDateTime('2020-09-30T00:00:00.000000Z')
        bs = BaseSurvey()
        self.assertTrue(bs.validate_time_period_start(metadata_input, data_input))

    def test_validate_time_period_end_not_utc(self):
        """Test basic functionality of validate_time_period_end."""
        data_input = self.data_stats['time_period_end']
        metadata_input = 2021.167
        bs = BaseSurvey()
        self.assertFalse(bs.validate_time_period_end(metadata_input, data_input))

    def test_validate_time_period_end_lower_than_acquisition_end(self):
        """Test basic functionality of validate_time_period_end."""
        data_input = self.data_stats['time_period_end']
        metadata_input = UTCDateTime('2020-09-30T00:00:00.000000Z')
        bs = BaseSurvey()
        self.assertFalse(bs.validate_time_period_end(metadata_input, data_input))

    def test_validate_time_period_end_valid(self):
        """Test basic functionality of validate_time_period_end."""
        data_input = self.data_stats['time_period_end']
        metadata_input = UTCDateTime('2020-10-02T00:00:00.000000Z')
        bs = BaseSurvey()
        self.assertTrue(bs.validate_time_period_end(metadata_input, data_input))


class TestSurvey(unittest.TestCase):
    """Test suite for Survey data class."""

    def test_validate_archive_network_undefined(self):
        """Test basic functionality of validate_archive_network."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_archive_network(None)
        msg = "The network code should be a string."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_archive_network_erroneous_type(self):
        """Test basic functionality of validate_archive_network."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_archive_network(12)
        msg = "The network code should be a string."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_archive_network_invalid(self):
        """Test basic functionality of validate_archive_network."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_archive_network('EMX')
        msg = "The network code should be two alphanumeric character long."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_archive_network_valid(self):
        """Test basic functionality of validate_archive_network."""
        self.assertTrue(Survey.validate_archive_network('EM'))

    def test_validate_citation_dataset_doi_undefined(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_citation_dataset_doi(None)
        msg = "The DOI number(s) should be a string."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_citation_dataset_doi_erroneous_type(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_citation_dataset_doi(10.7914)
        msg = "The DOI number(s) should be a string."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_citation_dataset_doi_invalid_doi(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_citation_dataset_doi('10.7914/SN/EM')
        msg = ("Invalid DOI(s). The DOI number(s) provided by the archive "
               "should be strings formatted as follows: 'scheme: path'.")
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_citation_dataset_doi_invalid_dois(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        dois = '10.7914/SN/EM, DOI:10.3421/SN/EG'
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_citation_dataset_doi(dois)
        msg = ("Invalid DOI(s). The DOI number(s) provided by the archive "
               "should be strings formatted as follows: 'scheme: path'.")
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_citation_dataset_doi_valid_doi(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        doi = 'DOI:10.7914/SN/EM'
        self.assertTrue(Survey.validate_citation_dataset_doi(doi))

    def test_validate_citation_dataset_doi_valid_dois(self):
        """Test basic functionality of validate_citation_dataset_doi."""
        dois = 'DOI:10.7914/SN/EM, DOI:10.3421/SN/EG'
        self.assertTrue(Survey.validate_citation_dataset_doi(dois))

    def test_validate_project_lead_email_undefined(self):
        """Test basic functionality of validate_project_lead_email."""
        self.assertTrue(Survey.validate_project_lead_email(None))

    def test_validate_project_lead_email_erroneous_type(self):
        """Test basic functionality of validate_project_lead_email."""
        with self.assertLogs(logger, level='ERROR') as cmd:
            Survey.validate_project_lead_email(12)
        msg = "The project lead email(s) should be a string."
        self.assertEqual(cmd.output, [":".join(['ERROR', SCR_DIR, msg])])

    def test_validate_project_lead_email_invalid_email(self):
        """Test basic functionality of validate_project_lead_email."""
        email = 'mpasscal.edu'
        self.assertFalse(Survey.validate_project_lead_email(email))

    def test_validate_project_lead_email_invalid_emails(self):
        """Test basic functionality of validate_project_lead_email."""
        emails = 'mpasscal.edu, d@passcal.edu'
        self.assertFalse(Survey.validate_project_lead_email(emails))

    def test_validate_project_lead_email_valid_email(self):
        """Test basic functionality of validate_project_lead_email."""
        email = 'm@passcal.edu'
        self.assertTrue(Survey.validate_project_lead_email(email))

    def test_validate_project_lead_email_valid_emails(self):
        """Test basic functionality of validate_project_lead_email."""
        emails = 'm@passcal.edu, d@passcal.edu'
        self.assertTrue(Survey.validate_project_lead_email(emails))