Source code for conmo.metrics.metric

from abc import ABC, abstractmethod
from os import path
from typing import Iterable

import pandas as pd

from conmo.conf import File, Index, Label


[docs]class Metric(ABC):

[docs]    @abstractmethod
    def calculate(self, idx: int, algorithms: Iterable[str], last_preprocess_dir: str, algorithms_dir: str, metrics_dir: str) -> None:
        """
        Calculates specific metric for each of the algorithms' results.

        Parameters
        ----------
        idx: str
            Index of the metric in the Experiment. Userful in case you want to calculate several metrics.
        algoritmss: Iterable[str]
            List of names of the selected algorithms.
        last_preprocess_dir:
            Name of the directory where the ground truth is located
        algorithms_dir:
            Name of the directory where the results of the algorithms executions are stored.
        metrics_dir:
            Name of th edirectory where the results will be stored.
        """
        pass

[docs]    def problem_label(self, truth: pd.DataFrame) -> str:
        """
        Determinates the nature of the problem by identifying the column's name of the labels.

        Parameters
        ----------
        truth: Pandas Dataframe
            Labels file of the dataset.
        Returns
        -------
        str
            Returns the column for the metric.

        Raises
        ------
        RuntimeError
            If the labels of the ground truth are invalid for the problem.
        """
        if Label.ANOMALY in truth.columns:
            return Label.ANOMALY
        elif Label.RUL in truth.columns:
            return Label.RUL
        elif Label.BATTERIES_DEG_TYPES == truth.columns.values.tolist(): # It's a problem related with batteries degradation
            return Label.BATTERIES_DEG_TYPES
        else:
            raise RuntimeError("Invalid labels for ground truth.")

[docs]    def labels_per_sequence(self, labels: pd.DataFrame) -> bool:
        """
        Use only with time series datasets.
        Checks if the labels file of the chosen dataset has an index format with sequences only or sequences and time.
        *This method in future updates will be changed to a specific class for time series.*

        Parameters
        ----------
        labels: Pandas Dataframe
            Labels file of the dataset.
        
        Returns
        -------
        bool
            True if the labels contains 1 level of index with sequence or False if the labels file contains 2 leves with sequence
            and time.

        Raises
        ------
        RuntimeError
            If the number of index levels is invalid.
        """
        # First level is FOLD
        if labels.index.nlevels == 2 and labels.index.names[1] == Index.SEQUENCE:
            return True
        elif labels.index.nlevels == 3 and labels.index.names[1] == Index.SEQUENCE and labels.index.names[2] == Index.TIME:
            return False
        else:
            raise RuntimeError("Invalid number of levels for labels.")

[docs]    def show_start_message(self):
        """
        Simple method to print on the terminal the name of the used metric.
        """
        print("\n+++ Metric {} +++".format(self.__class__.__name__))

[docs]    def load_truth(self, last_preprocess_dir: str):
        """
        Load labels from the last preprocess directory.

        Parameters
        ----------
        last_preprocess_dir: str
            Last diretory where the labels dataframe was stored.

        Returns
        -------
        Pandas Dataframe
            Dataframe cantainig the labels.
        """
        return pd.read_parquet(path.join(last_preprocess_dir, File.LABELS))

[docs]    def load_results(self, algorithm: str, algorithms_dir: str) -> pd.DataFrame:
        """
        Load results for a specific algorthm.

        Parameters
        ----------
        algoritm: str
            Name of the selected algorithm.
        algorithms_dir: str
            Name of the directory where the results of the algorithms executions are stored.

        Returns
        -------
        Pandas Dataframe
            Dataframe cantainig the results (predictions).
        """
        return pd.read_parquet(path.join(algorithms_dir, "{}.gz".format(algorithm)))

[docs]    def save_output(self, metric: pd.DataFrame, idx: int, metrics_dir: str) -> None:
        """
        Save metric's output to disk.

        Parameters
        ----------
        metric: Pandas Dataframe
            Dataframe containing the metric's results. 
        idx: int
            Index of the metric in the Experiment. Userful in case you want to calculate several metrics.
        metrics_dir: str
            Name of the directory where the results will be stored.
        """
        name = "{:02}_{}".format(idx, self.__class__.__name__)
        print(metric)
        metric.to_parquet(path.join(metrics_dir, "{}.gz".format(
            name)), compression="gzip", index=True)