# Source code for conmo.preprocesses.sklearn_preprocess

from typing import Iterable, Union

import pandas as pd
from sklearn.preprocessing import (Binarizer, FunctionTransformer,
                                   KBinsDiscretizer, KernelCenterer,
                                   LabelBinarizer, LabelEncoder, MaxAbsScaler,
                                   MinMaxScaler, MultiLabelBinarizer,
                                   Normalizer, OneHotEncoder, OrdinalEncoder,
                                   PolynomialFeatures, PowerTransformer,
                                   QuantileTransformer, RobustScaler,
                                   StandardScaler)

from conmo.conf import Index
from conmo.preprocesses.preprocess import ExtendedPreprocess


class SklearnPreprocess(ExtendedPreprocess):
    """
    Class used to wrap an existing preprocess from the Scikit-Learn library.
    It also allows this preprocess to be applied to certain columns of the dataset.
    """

    def __init__(
        self,
        to_data: Union[bool, Iterable[str]],
        to_labels: Union[bool, Iterable[str]],
        test_set: bool,
        preprocess: Union[
            Binarizer, FunctionTransformer, KBinsDiscretizer, KernelCenterer,
            LabelBinarizer, LabelEncoder, MultiLabelBinarizer, MaxAbsScaler,
            MinMaxScaler, Normalizer, OneHotEncoder, OrdinalEncoder,
            PolynomialFeatures, PowerTransformer, QuantileTransformer,
            RobustScaler, StandardScaler,
        ],
    ) -> None:
        """
        Store the Scikit-Learn preprocess and forward the rest to the base class.

        Parameters
        ----------
        to_data:
            Forwarded unchanged to ``ExtendedPreprocess.__init__``.
            NOTE(review): presumably selects the data columns to process
            (``True``/``False`` or an explicit list) -- confirm in the base class.
        to_labels:
            Forwarded unchanged to ``ExtendedPreprocess.__init__``.
            NOTE(review): presumably the label-column counterpart of
            ``to_data`` -- confirm in the base class.
        test_set:
            Forwarded unchanged to ``ExtendedPreprocess.__init__``.
            ``transform`` later reads ``self.test_set`` to decide whether the
            TEST rows are transformed too; that attribute is not assigned in
            this class, so it is presumably set by the base class.
        preprocess:
            Scikit-Learn preprocessing object. It must provide
            ``fit_transform`` and ``transform``, which are the only methods
            called on it here.
        """
        super().__init__(to_data, to_labels, test_set)
        self.preprocess = preprocess

    def transform(self, df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
        """
        Apply the wrapped preprocess to ``columns`` of ``df``, fold by fold.

        For every distinct value of the ``Index.FOLD`` index level, the
        preprocess is fitted on that fold's TRAIN rows and used to transform
        them. When ``self.test_set`` is truthy, the same freshly fitted
        preprocess then transforms the fold's TEST rows (without refitting).

        Parameters
        ----------
        df:
            DataFrame to process. It is modified in place.
            NOTE(review): the slicing assumes a MultiIndex whose first two
            levels are the fold and the set (train/test) -- confirm against
            the callers.
        columns:
            Columns of ``df`` the preprocess is applied to.

        Returns
        -------
        pd.DataFrame
            The same ``df`` object that was passed in, after the in-place
            assignment.
        """
        # Fit with TRAIN data and transform both TRAIN and TEST, fold by fold
        for fold in df.index.get_level_values(Index.FOLD).unique():
            # Train: fit and transform
            train_slice = pd.IndexSlice[(fold, Index.SET_TRAIN), columns]
            df.loc[train_slice] = self.preprocess.fit_transform(
                df.loc[train_slice].values)

            # Test: transform only, reusing the parameters fitted on TRAIN so
            # that no information from the test rows leaks into the fit.
            if self.test_set:
                test_slice = pd.IndexSlice[(fold, Index.SET_TEST), columns]
                df.loc[test_slice] = self.preprocess.transform(
                    df.loc[test_slice].values)

        return df