Source code for conmo.splitters.splitter

from abc import ABC, abstractmethod
from os import path

import pandas as pd

from conmo.conf import File, Index


[docs]class Splitter(ABC):

[docs]    @abstractmethod
    def split(self, in_dir: str, out_dir: str) -> None:
        """
        Performs the split to both data and labels of the dataset.

        Parameters
        ----------
        in_dir: str
            Input directory of the before step.
        out_dir: str
            Output directory where te split data will be stored.
        """
        pass

[docs]    def show_start_message(self):
        """
        Simple method to print on the terminal the name of the selected splitter.
        """
        print("\n+++ Splitter {} +++".format(self.__class__.__name__))

[docs]    def already_splitted(self, df: pd.DataFrame) -> bool:
        """
        Checks if the dataset was already splitted.

        Parameters
        ----------
        df: Pandas Dataframe
            Input dataset.

        Returns
        -------
        bool
            True in case the dataset was already splitted, False otherwise.

        Raises
        ------
        RuntimeError
            If the dataset isn't splitted and doesn't follow Conmo's format.

        """
        nindex = df.index.names
        # Soft comparison to allow both [SEQUENCE,TIME] and only [TIME] indexes for DATA and LABELS dataframes
        if (len(nindex) == 3 or len(nindex) == 4) and nindex[0] == Index.FOLD and nindex[1] == Index.SET and nindex[2] == Index.SEQUENCE:
            return True
        elif (len(nindex) == 1 or len(nindex) == 2) and nindex[0] == Index.SEQUENCE:
            return False
        else:
            print(nindex)
            raise RuntimeError(
                "Input DataFrame does not contain a valid index configuration.")

[docs]    def load_input(self, in_dir: str) -> (pd.DataFrame, pd.DataFrame):
        """
        Read parquet data and labels files of the chosen dataset.
        
        Parameters
        ----------
        in_dir: str
            Input directory where the files are located.

        Returns
        -------
        data: Pandas Dataframe
            Loaded data file.
        labels: Pandas Dataframe
            Loaded labels file.
        
        Raises
        ------
        If data and labels have different sequences values.
        """
        # Load input dataframes
        data = pd.read_parquet(path.join(in_dir, File.DATA))
        labels = pd.read_parquet(path.join(in_dir, File.LABELS))

        # Check both DATA and LABELS have the same sequences indexes
        if not data.index.get_level_values(Index.SEQUENCE).unique().equals(labels.index.get_level_values(Index.SEQUENCE).unique()):
            raise RuntimeError(
                "Data and Labels files have different sequences values. Both must have the same values")

        return data, labels

[docs]    def save_output(self, out_dir: str, data: pd.DataFrame, labels: pd.DataFrame) -> None:
        """
        Save splitted dataset to parquet format.

        Parameters
        ----------
        out_dir: str
            Output directory where the results will be saved.
        data: Pandas Dataframe
            Splitted data.
        labels: Pandas Dataframe
            Splitted labels.
        """
        # Save output dataframes
        data.to_parquet(path.join(out_dir, File.DATA),
                        compression="gzip", index=True)
        labels.to_parquet(path.join(out_dir, File.LABELS),
                          compression="gzip", index=True)