# Source code for conmo.preprocesses.preprocess

from abc import ABC, abstractmethod
from os import path
from typing import Iterable, Tuple, Union

import pandas as pd

from conmo.conf import File, Index


class Preprocess(ABC):
    """
    Abstract base class for a Preprocess.

    This class is an abstract class from which other subclasses inherit and must not be instantiated directly.
    Subclasses implement :meth:`apply`; the remaining methods are shared helpers for
    printing a start banner and for reading/writing the data and labels parquet files.
    """

    @abstractmethod
    def apply(self, in_dir: str, out_dir: str) -> None:
        """
        Applies the preprocess to the given dataset.

        Parameters
        ----------
        in_dir: str
            Input directory where the files are located. Usually, this is the output directory of the splitter step.
        out_dir: str
            Output directory where the files will be saved.
        """
        pass

    def show_start_message(self) -> None:
        """
        Simple method to print on the terminal the name of the selected preprocess.

        The printed name is the name of the concrete class of the instance.
        """
        print("\n+++ Preprocess {} +++".format(self.__class__.__name__))

    def load_input(self, in_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Read the parquet data and labels files of the chosen dataset from the input directory.

        The file names are taken from ``File.DATA`` and ``File.LABELS``.

        Parameters
        ----------
        in_dir: str
            Input directory where the files are located.

        Returns
        -------
        data: Pandas Dataframe
            Loaded data file.
        labels: Pandas Dataframe
            Loaded labels file.
        """
        data = pd.read_parquet(path.join(in_dir, File.DATA))
        labels = pd.read_parquet(path.join(in_dir, File.LABELS))
        return data, labels

    def save_output(self, out_dir: str, data: pd.DataFrame, labels: pd.DataFrame) -> None:
        """
        Save preprocessed dataset to parquet format.

        Both dataframes are written with gzip compression and with their index included,
        using the file names ``File.DATA`` and ``File.LABELS``.

        Parameters
        ----------
        out_dir: str
            Output directory where the results will be saved.
        data: Pandas Dataframe
            Preprocessed data.
        labels: Pandas Dataframe
            Preprocessed labels.
        """
        data.to_parquet(path.join(out_dir, File.DATA),
                        compression="gzip", index=True)
        labels.to_parquet(path.join(out_dir, File.LABELS),
                          compression="gzip", index=True)


class ExtendedPreprocess(Preprocess):
    """
    Specific class to implement preprocessing which consists of applying certain transformations on some columns of the dataset.
    The preprocesses that inherit from this class take to_data, to_labels and test_set in the constructor to indicate the columns
    on which to apply the DATA and LABELS preprocessing respectively, and if the TEST ones are included or not.

    For to_data and to_labels: ``False`` skips the transformation, ``True`` selects every column
    of the dataframe, and any other value is passed on as the selection of columns.
    """

    def __init__(self, to_data: Union[bool, Iterable[str]], to_labels: Union[bool, Iterable[str]], test_set: bool) -> None:
        """
        Store the column selections and the test-set flag.

        Parameters
        ----------
        to_data: Union[bool, Iterable[str]]
            Column selection for the DATA dataframe.
        to_labels: Union[bool, Iterable[str]]
            Column selection for the LABELS dataframe.
        test_set: bool
            Flag stored on the instance; the methods defined in this class do not read it.
        """
        self.to_data = to_data
        self.to_labels = to_labels
        self.test_set = test_set

    @abstractmethod
    def transform(self, df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
        """
        Performs the preprocess over the dataframe with the given columns.

        Parameters
        ----------
        df: Pandas Dataframe
            Dataframe containing the data or the labels of the dataset.
        columns: Iterable[str]
            List of columns that will be used in the preprocess. Also the columns of the final dataframe.

        Returns
        -------
        Pandas Dataframe:
            Dataframe preprocessed.
        """
        pass

    def apply(self, in_dir: str, out_dir: str) -> None:
        """
        Load the dataset, transform the selected data and labels columns and save the result.

        Parameters
        ----------
        in_dir: str
            Input directory where the files are located.
        out_dir: str
            Output directory where the files will be saved.
        """
        self.show_start_message()
        data, labels = self.load_input(in_dir)

        # Identity checks are used on purpose: comparing an array-like selection
        # (e.g. a pandas Index) with ``!= False`` is evaluated element-wise and its
        # truth value is ambiguous, which raises a ValueError inside the ``if``.

        # DATA
        if self.to_data is not False:
            data = self.transform(
                data, self.extract_columns(data, self.to_data))

        # LABELS
        if self.to_labels is not False:
            labels = self.transform(
                labels, self.extract_columns(labels, self.to_labels))

        self.save_output(out_dir, data, labels)

    def extract_columns(self, df: pd.DataFrame, columns: Union[bool, Iterable[str]]) -> Iterable[str]:
        """
        Resolves a column selection into the columns to be used.

        Parameters
        ----------
            df: Pandas Dataframe
                Dataframe containing the data.
            columns: Union[bool, Iterable[str]]
                ``True`` to select every column of the dataframe, or the list of columns.

        Returns
        -------
        columns: Iterable[str]
            ``df.columns`` when ``columns`` is ``True``; otherwise ``columns`` is returned unchanged.
        """
        # ``is True`` instead of ``== True`` so that array-like selections are not
        # compared element-wise (see the note in apply()).
        if columns is True:
            return df.columns
        return columns