from abc import ABC, abstractmethod
from os import path
from typing import Iterable, Union
import pandas as pd
from conmo.conf import File, Index
class Preprocess(ABC):
    """
    Abstract base class for every preprocessing step.

    Concrete preprocessing steps inherit from this class and implement
    :meth:`apply`. This class must not be instantiated directly.
    """

    @abstractmethod
    def apply(self, in_dir: str, out_dir: str) -> None:
        """
        Apply the preprocess to the given dataset.

        Parameters
        ----------
        in_dir: str
            Input directory where the files are located. Usually, this is the
            output directory of the splitter step.
        out_dir: str
            Output directory where the files will be saved.
        """

    def show_start_message(self) -> None:
        """
        Print a banner on the terminal with the class name of the running preprocess.
        """
        preprocess_name = self.__class__.__name__
        banner = "\n+++ Preprocess {} +++".format(preprocess_name)
        print(banner)

    def save_output(self, out_dir: str, data: pd.DataFrame, labels: pd.DataFrame) -> None:
        """
        Write the preprocessed data and labels as gzip-compressed parquet files.

        Parameters
        ----------
        out_dir: str
            Output directory where the results will be saved.
        data: Pandas Dataframe
            Preprocessed data, written to the file named by ``File.DATA``.
        labels: Pandas Dataframe
            Preprocessed labels, written to the file named by ``File.LABELS``.
        """
        # Data first, then labels; the index of each frame is kept in its file.
        outputs = ((data, File.DATA), (labels, File.LABELS))
        for frame, filename in outputs:
            target = path.join(out_dir, filename)
            frame.to_parquet(target, compression="gzip", index=True)
[docs]class ExtendedPreprocess(Preprocess):
"""
Base class for preprocessing steps that apply a transformation to selected columns of the dataset.
Preprocessing classes that inherit from this class take to_data, to_labels and test_set in their constructor: to_data and to_labels
indicate the columns of DATA and LABELS, respectively, on which to apply the preprocessing, and test_set indicates whether the TEST set is included.
"""
[docs] def __init__(self, to_data: Union[bool, Iterable[str]], to_labels: Union[bool, Iterable[str]], test_set: bool) -> None:
self.to_data = to_data
self.to_labels = to_labels
self.test_set = test_set
def apply(self, in_dir: str, out_dir: str) -> None:
    """
    Load the dataset, transform the selected data and label columns, and save the result.

    The data (resp. labels) dataframe is transformed unless ``self.to_data``
    (resp. ``self.to_labels``) is ``False``. The columns handed to
    ``self.transform`` are resolved with ``self.extract_columns``.
    ``self.load_input`` and ``self.transform`` are not defined in this method
    and are expected to be provided elsewhere in the class hierarchy.

    Parameters
    ----------
    in_dir: str
        Input directory passed to ``self.load_input``.
    out_dir: str
        Output directory passed to ``self.save_output``.
    """
    self.show_start_message()
    data, labels = self.load_input(in_dir)

    # Identity checks on purpose: ``selection != False`` is evaluated
    # element-wise when the selection is a pandas Index or a NumPy array, and
    # using that result as an ``if`` condition raises ValueError (ambiguous
    # truth value).

    # DATA
    if self.to_data is not False:
        data_columns = self.extract_columns(data, self.to_data)
        data = self.transform(data, data_columns)

    # LABELS
    if self.to_labels is not False:
        label_columns = self.extract_columns(labels, self.to_labels)
        labels = self.transform(labels, label_columns)

    self.save_output(out_dir, data, labels)
def extract_columns(self, df: pd.DataFrame, columns: Union[bool, Iterable[str]]) -> Iterable[str]:
    """
    Resolve a column selection into the column names to process.

    Parameters
    ----------
    df: Pandas Dataframe
        Dataframe containing the data.
    columns: Union[bool, Iterable[str]]
        ``True`` to select every column of ``df``, or an iterable with the
        names of the columns to select.

    Returns
    -------
    columns: Iterable[str]
        ``df.columns`` when ``columns`` is ``True``; otherwise ``columns``
        itself, returned unchanged.
    """
    # Identity check on purpose: ``columns == True`` is evaluated element-wise
    # for array-like selections (pandas Index, NumPy array), and using that
    # result as an ``if`` condition raises ValueError (ambiguous truth value).
    if columns is True:
        return df.columns
    return columns