# -*- coding: utf-8 -*-
"""
Routines for parsing LEMI field spreadsheets (i.e. install and demobilization
sheets) and validating metadata.
Templates for the spreadsheets are provided to the PIs and their formatting is
hard coded.
The metadata for a complete MT dataset was structured into the following
categories: Survey, Station, Run, Electric, Magnetic, and Auxiliary

Maeva Pourpoint - IRIS/PASSCAL
"""
from __future__ import annotations

import logging
import logging.config
import re

from dataclasses import dataclass, field
from obspy import UTCDateTime
from pathlib import Path
from typing import Optional, Set

from lemi2seed.utils import check_email_formatting, is_valid_uri, str2list

# Read logging config file
log_file_path = Path(__file__).parent.joinpath('logging.conf')
logging.config.fileConfig(log_file_path)
# Create logger
logger = logging.getLogger(__name__)


@dataclass
class BaseMetadata:
    """Base dataclass holding the sets of missing and invalid field names."""

    # Set of metadata fields that are required for archiving but "missing"
    # because not provided by the user.
    metadata_missing: Set[str] = field(default_factory=set)
    # Set of metadata fields that are invalid
    metadata_invalid: Set[str] = field(default_factory=set)


@dataclass
class BaseSurvey(BaseMetadata):
    """Survey fields tied to the time period, with their validators.

    Every field defaults to ``None`` and carries a ``metadata`` mapping with
    the keys ``xml_id``, ``required`` and ``gui``.
    NOTE(review): those keys are presumably consumed by the XML writer and
    the GUI elsewhere in the package - confirm against the callers.
    """

    restricted_status: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['restricted_status'], 'required': True, 'gui': True},
    )
    time_period_end: Optional[UTCDateTime] = field(
        default=None,
        metadata={'xml_id': ['end_date'], 'required': True, 'gui': True},
    )
    time_period_start: Optional[UTCDateTime] = field(
        default=None,
        metadata={'xml_id': ['start_date'], 'required': True, 'gui': True},
    )

    def validate_time_period_end(self, metadata_input: UTCDateTime, data_input: UTCDateTime) -> bool:
        """Check the end date against the data acquisition end time.

        :param metadata_input: end date provided in the metadata.
        :param data_input: end time of the data acquisition.
        :return: True when ``metadata_input`` is a ``UTCDateTime`` that is
            on or after ``data_input``; otherwise an error is logged and
            False is returned.
        """
        # The isinstance test comes first so that the comparison is only
        # attempted on a UTCDateTime.
        if isinstance(metadata_input, UTCDateTime) and metadata_input >= data_input:
            return True
        logger.error("End date of {} should be in UTC and greater than "
                     "the data acquisition end time."
                     .format(self.__class__.__name__.split('_')[0]))
        return False

    def validate_time_period_start(self, metadata_input: UTCDateTime, data_input: UTCDateTime) -> bool:
        """Check the start date against the data acquisition start time.

        :param metadata_input: start date provided in the metadata.
        :param data_input: start time of the data acquisition.
        :return: True when ``metadata_input`` is a ``UTCDateTime`` that is
            on or before ``data_input``; otherwise an error is logged and
            False is returned.
        """
        if isinstance(metadata_input, UTCDateTime) and metadata_input <= data_input:
            return True
        logger.error("Start date of {} should be in UTC and lower than "
                     "the data acquisition start time."
                     .format(self.__class__.__name__.split('_')[0]))
        return False


@dataclass
class Survey(BaseSurvey):
    """Survey-level metadata fields and their validators.

    Every field defaults to ``None``. Note that ``archive_network`` is the
    only field whose ``metadata`` mapping has no ``xml_id`` entry.
    """

    acquired_by_author: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.acquired_by.author'],
                  'required': False,
                  'gui': True},
    )
    acquired_by_comments: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.acquired_by.comments'],
                  'required': False,
                  'gui': True},
    )
    archive_network: Optional[str] = field(
        default=None,
        metadata={'required': True, 'gui': True},
    )
    citation_dataset_doi: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['identifiers'], 'required': True, 'gui': True},
    )
    comments: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.comments'],
                  'required': False,
                  'gui': True},
    )
    country: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.country'],
                  'required': False,
                  'gui': True},
    )
    geographic_name: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.geographic_name'],
                  'required': True,
                  'gui': True},
    )
    name: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['description'], 'required': True, 'gui': True},
    )
    project: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.project'],
                  'required': True,
                  'gui': True},
    )
    project_lead_author: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['operators.contacts.names'],
                  'required': False,
                  'gui': True},
    )
    project_lead_email: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['operators.contacts.emails'],
                  'required': False,
                  'gui': True},
    )
    project_lead_organization: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['operators.agency'], 'required': False, 'gui': True},
    )
    summary: Optional[str] = field(
        default=None,
        metadata={'xml_id': ['comments', 'mt.survey.summary'],
                  'required': False,
                  'gui': True},
    )

    @staticmethod
    def validate_archive_network(metadata_input: str) -> bool:
        """Check that the network code is exactly two alphanumeric characters.

        :param metadata_input: network code to validate.
        :return: True when the code is two ASCII letters and/or digits;
            otherwise an error is logged and False is returned.
        """
        try:
            # fullmatch with an explicit character class: the previous
            # "^\w{2}$" also accepted underscores, non-ASCII word characters
            # and a trailing newline.
            valid = re.fullmatch(r"[A-Za-z0-9]{2}", metadata_input)
        except TypeError:
            # Raised by the re module when the input is not a string.
            logger.error("The network code should be a string.")
            return False
        if valid is None:
            logger.error("The network code should be two alphanumeric "
                         "character long.")
        return bool(valid)

    @staticmethod
    def validate_citation_dataset_doi(metadata_input: str) -> bool:
        """Check the formatting of the DOI(s).

        The input is converted with ``str2list`` and every resulting entry
        is checked with ``is_valid_uri``.

        :param metadata_input: DOI(s) to validate.
        :return: True when every entry passes ``is_valid_uri``; otherwise an
            error is logged and False is returned.
        """
        try:
            dois = str2list(metadata_input)
        except AttributeError:
            # str2list raised AttributeError: reported as a non-string input.
            logger.error("The DOI number(s) should be a string.")
            return False
        # A list (not a generator) is built so that every entry is checked.
        if not all([is_valid_uri(doi) for doi in dois]):
            logger.error("Invalid DOI(s). The DOI number(s) provided by "
                         "the archive should be strings formatted as "
                         "follows: 'scheme: path'.")
            return False
        return True

    @staticmethod
    def validate_project_lead_email(metadata_input: str) -> bool:
        """Check the formatting of the project lead email(s).

        The input is converted with ``str2list`` and every resulting entry
        is checked with ``check_email_formatting``.

        :param metadata_input: email(s) to validate, or None.
        :return: True when the input is None or when every entry passes
            ``check_email_formatting``; False otherwise.
        """
        # The field is optional: nothing to validate when it was left empty.
        if metadata_input is None:
            return True
        try:
            emails = str2list(metadata_input)
        except AttributeError:
            # str2list raised AttributeError: reported as a non-string input.
            logger.error("The project lead email(s) should be a string.")
            return False
        # A list (not a generator) is built so that every entry is checked.
        return all([check_email_formatting(email) for email in emails])