# Source code for tds2stac.analysers.recognizer

# SPDX-FileCopyrightText: 2023 Karlsruher Institut für Technologie
#
# SPDX-License-Identifier: CC0-1.0

from typing import Union

import requests
from lxml import etree

from .. import logger, utils
from ..statics import constants


class Recognizer(object):
    """
    A class for recognizing nine different and possible scenarios in
    management of TDS datasets. Each scenario is explained below.

    First scenario:
        Just `catalogRef` tags are located directly under the dataset
        element tag.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/regclim/raster/global/era5/sfc/single/catalog.xml` (nested)
    Second scenario:
        `catalogRef` tags are not under a dataset element tag and come
        directly below the `catalog`.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/catalogues/sensor_catalog_ext.xml` (nested)
    Third scenario:
        One single `dataset` tag is located next to `catalogRef` tags.
        All are under a `dataset` tag.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/regclim/raster/global/chirps/catalog.xml` (nested)
    Fourth scenario:
        An empty dataset.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/catalogues/bio_geo_chem_catalog_ext.xml`
        or `https://thredds.atmohub.kit.edu/thredds/catalog/snowfogs/catalog.xml`
    Fifth scenario:
        There is no `catalogRef` tag and all are `dataset` tags. All of
        them are under a `dataset` tag.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/climate/raster/global/chelsa/v1.2/catalog.html`
    Sixth scenario:
        A single dataset.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/regclim/raster/global/era5/sfc/single/catalog.xml?dataset=regclim/raster/global/era5/sfc/single/era5_sfc_20210101.nc`
    Seventh scenario:
        An aggregated dataset.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/catalogues/swabian_moses_2021.xml?dataset=swabian_moses_aggregation`
    Eighth scenario:
        A combination of `catalogRef` and `dataset` tags that is not
        under a `dataset` tag. It's similar to the second scenario but
        with datasets.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/catalogues/transfer.xml`
    Ninth scenario:
        A bunch of single `dataset` tags next to `catalogRef` tags. It's
        similar to the third scenario but with more datasets.
        `https://thredds.imk-ifu.kit.edu/thredds/catalog/regclim/raster/global/hydrogfd/v3.0/catalog.xml` (nested)

    Args:
        main_catalog_url: TDS Catalog url to start harvesting
        nested_check: An option for checking nested datasets in TDS
            (True or False)
        logger_properties: A dictionary for logger properties.
        requests_properties: A dictionary for requests properties.
    """

    main_catalog_url: str
    """
    TDS Catalog url to start harvesting (*)
    """
    nested_check: bool
    """
    An option for checking nested datasets in TDS (True or False) (optional)
    """
    logger_properties: dict
    """
    A dictionary for logger properties. For more information see
    :class:`~tds2stac.logger.Logger`
    """
    requests_properties: dict
    """
    To obtain additional information on this topic, refer to the
    :class:`~tds2stac.TDS2STAC.requests_properties`. The default value
    is an empty dictionary.
    """

    def __init__(
        self,
        main_catalog_url: str,
        nested_check: bool = False,
        logger_properties: Union[dict, None] = None,
        requests_properties: Union[dict, None] = None,
    ):
        self.main_catalog_url = main_catalog_url
        # Work on a private copy: "logger_level" and "logger_msg" are
        # written below, and neither the caller's dict nor a default
        # shared between instances should be modified by that.
        self.logger_properties = (
            {} if logger_properties is None else dict(logger_properties)
        )
        self.requests_properties = (
            {} if requests_properties is None else requests_properties
        )
        self.all_dirs: list = []
        self.all_href: list = []
        self.all_dirs_extensions: list = []
        self.nested_check = nested_check
        self.nested_num: int = 0
        self.nested_num_temp: int = 0
        self.status: Union[str, None] = None

        # Using 'xml_processing' we get the XML contents of catalog URL.
        xml_url_catalog, id_catalog, xml = utils.xml_processing(
            main_catalog_url, self.requests_properties
        )
        self.id_catalog = id_catalog
        self.xml_url_catalog = xml_url_catalog
        self.xml_url_catalog_temp = xml_url_catalog
        self.xml = xml

        self.recognition_function(self.xml_url_catalog, self.xml)

        # Report the recognized scenario and depth through the logger.
        self.logger_properties["logger_level"] = "INFO"
        self.logger_properties[
            "logger_msg"
        ] = f"{self.status, self.nested_num}"
        logger.Logger(self.logger_properties)

    def recognition_function(self, url: str, xml_content):
        """
        A function for recognizing number of scenarios in TDS.

        The decision is based on which tags appear directly below the
        catalog root (``catalogRef`` and/or ``dataset``) combined with
        how many ``dataset[@urlPath]`` and ``catalogRef`` elements exist
        anywhere in the document.

        Args:
            url: URL of the catalog that `xml_content` was read from.
            xml_content: Raw XML of the catalog.

        Returns:
            The scenario name (e.g. ``"First Scenario"``). For the
            First, Second, Third, Eighth and Ninth scenarios with
            ``nested_check`` enabled, a tuple
            ``(scenario name, nested_num)`` is returned instead. A plain
            message string is returned when the XML cannot be parsed or
            the dataset is neither single nor aggregated, and ``None``
            when no scenario matches.
        """
        try:
            tree = etree.XML(xml_content)
        except Exception:
            return (
                "The Catalog is not reachable. Check the Catalg URL in the TDS"
            )

        # Textual representation of the root's direct children; the
        # scenario families are told apart by the tag names that occur
        # in it.
        top_level = str(list(tree))
        has_catalog_ref = "catalogRef" in top_level
        has_dataset = "dataset" in top_level

        datasets_with_url = tree.findall(
            ".//{%s}dataset[@urlPath]" % constants.unidata
        )
        catalog_refs = tree.findall(".//{%s}catalogRef" % constants.unidata)

        if has_dataset and not has_catalog_ref:
            # First and third to seventh (plus ninth) cases are here.
            if catalog_refs:
                if not datasets_with_url:
                    scenario = "First Scenario"
                elif len(datasets_with_url) == 1:
                    scenario = "Third Scenario"
                else:
                    scenario = "Ninth Scenario"
                return self._resolve_scenario(scenario, url, walk_hrefs=False)

            if not datasets_with_url:
                self.status = "Fourth Scenario"
                return self.status

            # Fifth to seventh cases: datasets only, no catalogRef.
            if len(tree.findall(".//{%s}dataset" % constants.unidata)) > 1:
                self.status = "Fifth Scenario"
                return self.status
            return self._single_or_aggregated(tree, url)

        if has_catalog_ref:
            # Second case (catalogRef only) or eighth case (catalogRef
            # together with dataset) directly below the catalog.
            scenario = "Eighth Scenario" if has_dataset else "Second Scenario"
            return self._resolve_scenario(scenario, url, walk_hrefs=True)

        # Neither tag below the root: fourth case when nothing is found
        # deeper in the document either.
        if not datasets_with_url and not catalog_refs:
            self.status = "Fourth Scenario"
            return self.status
        return None

    def _resolve_scenario(self, scenario: str, url: str, walk_hrefs: bool):
        """Set the status for a nestable scenario, computing depth if asked."""
        if self.nested_check is not True:
            self.status = scenario
            return self.status

        self.nested_num = 0
        self.nested_num_temp = 0
        if walk_hrefs:
            self.nested_checker_exceptions(url)
            self.nested_num = self._depth_from_hrefs()
        else:
            self.nested_checker(url)
            self.nested_num = self._depth_from_dirs()
        self.status = scenario
        return self.status, self.nested_num

    def _depth_from_dirs(self) -> int:
        """Return one more than the largest "/" count among `all_dirs`."""
        return max((d.count("/") for d in self.all_dirs), default=0) + 1

    def _depth_from_hrefs(self) -> int:
        """Return one more than the number of catalogs linking to a recorded one.

        Every recorded catalog whose link list contains the URL of another
        recorded catalog is collected (once) in `all_dirs_extensions`.
        """
        for recorded_url, _ in self.all_href:
            for parent_url, links in self.all_href:
                if (
                    recorded_url in links
                    and parent_url not in self.all_dirs_extensions
                ):
                    self.all_dirs_extensions.append(parent_url)
        return len(self.all_dirs_extensions) + 1

    def _single_or_aggregated(self, tree, url: str):
        """Tell a single dataset (sixth) from an aggregated one (seventh)."""
        not_single_or_aggregated = (
            "The dataset is not even Single or Aggregated dataset"
        )
        dataset = tree.find("{%s}dataset" % constants.unidata)
        if dataset is None:
            # No dataset directly below the root to inspect.
            return not_single_or_aggregated
        metadata = dataset.find("{%s}metadata" % constants.unidata)

        # The service name is either on the dataset or in its metadata.
        service_tag = dataset.find("{%s}serviceName" % constants.unidata)
        if service_tag is None:
            if metadata is None:
                return not_single_or_aggregated
            service_tag = metadata.find("{%s}serviceName" % constants.unidata)

        if service_tag is None:
            # Use services found in the file. FMRC aggs do this.
            services = tree.findall(
                ".//{%s}service[@serviceType='Compound']" % constants.unidata
            )
        else:
            # Use specific named services.
            services = tree.findall(
                ".//{%s}service[@name='%s']"
                % (constants.unidata, service_tag.text)
            )

        url_path = dataset.get("urlPath")
        for service in services:
            # In TDS version 4 and 5 'Compound' is spelled differently.
            if service.get("serviceType") not in ("Compound", "compound"):
                continue
            for sub_service in service.findall(
                "{%s}service" % constants.unidata
            ):
                service_url = utils.references_urls(
                    url, sub_service.get("base")
                )
                if url_path is not None:
                    service_url = service_url + url_path
                if sub_service.get("name") == "http":
                    # NOTE(review): this request carries no timeout and
                    # does not use `self.requests_properties` - confirm
                    # whether it should.
                    response = requests.head(service_url)
                    # A response without Content-Length is classified
                    # as an aggregated dataset.
                    if "Content-Length" not in response.headers:
                        self.status = "Seventh Scenario"
                    else:
                        self.status = "Sixth Scenario"
                    return self.status
        return None

    def nested_checker(self, url: str):
        """
        A function for returning the depth of nested datasets in TDS for
        scenarios 1, 3, and 9.

        Follows every `catalogRef` below `url` recursively and records,
        in `all_dirs`, the directory (relative to the main catalog) of
        each catalog that references further catalogs.

        Args:
            url: URL of the catalog to inspect.

        Returns:
            A message string when the XML cannot be parsed, otherwise
            ``None``.
        """
        _, _, xml = utils.xml_processing(url, self.requests_properties)
        try:
            tree = etree.XML(xml)
        except Exception:
            return (
                "The Catalog is not reachable. Check the Catalg URL in the TDS"
            )

        root_dir = self.xml_url_catalog.replace("catalog.xml", "")
        relative_dir = url.replace("catalog.xml", "").replace(root_dir, "")
        for child in tree.findall(".//{%s}catalogRef" % constants.unidata):
            self.nested_checker(
                utils.references_urls(
                    url, child.get("{%s}href" % constants.w3)
                )
            )
            if relative_dir not in self.all_dirs:
                self.all_dirs.append(relative_dir)

    def nested_checker_exceptions(self, url: str):
        """
        A function for returning the depth of nested datasets in TDS for
        scenarios 2 and 8.

        Follows every `catalogRef` below `url` recursively and records,
        in `all_href`, a ``(url, [referenced catalog urls])`` pair for
        each catalog that references further catalogs.

        Args:
            url: URL of the catalog to inspect.

        Returns:
            A message string when the XML cannot be parsed, otherwise
            ``None``.
        """
        _, _, xml = utils.xml_processing(url, self.requests_properties)
        try:
            tree = etree.XML(xml)
        except Exception:
            return (
                "The Catalog is not reachable. Check the Catalg URL in the TDS"
            )

        children = tree.findall(".//{%s}catalogRef" % constants.unidata)
        links = [
            utils.references_urls(url, c.get("{%s}href" % constants.w3))
            for c in children
        ]
        entry = (url, links)
        for child_url in links:
            self.nested_checker_exceptions(child_url)
            if entry not in self.all_href:
                self.all_href.append(entry)

    def __str__(self):
        """Return the ``(status, nested_num)`` pair formatted as text."""
        return f"{self.status, self.nested_num}"