# Source code for conmo.datasets.batteries_degradation

import os
import shutil
from os import path
from typing import Iterable

import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.interpolate import PchipInterpolator as pchip

from conmo.conf import File, Index
from conmo.datasets.dataset import LocalDataset


class BatteriesDataset(LocalDataset):
    """
    Dataset obtained from measurements of certain types of degradation of three types of batteries.

    Since it belongs to the local datasets, to launch any experiment with it, it must be stored on
    disk (under the directory given as ``path``) with the following directory structure:

    - DTW-Li-ion-Diagnosis
        - data : Data and labels for the three types of batteries
                    are stored here (``x_train_<chemistry>.npy`` and
                    ``y_train_<chemistry>.npy``).
        - mat:
            - LFP:
                - diagnosis:
                    - V.mat
                - test:
                    - V_references.mat
                    - x_test_0.mat
                    - x_test_1.mat
                    - x_test_2.mat
                    - x_test_3.mat
                    - y_test.mat
            - NCA:
                - diagnosis
                - test
            - NMC:
                - The same as NCA and LFP
            - Q.mat
    """

    # Chemistries for which data is parsed.
    CHEMISTRY_LIST = ['LFP', 'NCA', 'NMC']
    # Number of test sets per chemistry (x_test_0.mat ... x_test_3.mat).
    NUM_TEST_SETS = 4
    # Voltage window of the selected chemistry; overwritten per instance in __init__.
    MIN_V = 0
    MAX_V = 0
    # The parsed curves have SIZE - 1 features.
    SIZE = 128
    # Voltage step used to interpolate the capacity curve.
    UI_STEP = 0.0005
    # Voltage window studied for each chemistry.
    MIN_V_LFP = 3.20
    MAX_V_LFP = 3.50
    MIN_V_NCA = 3.20
    MAX_V_NCA = 4.23
    MIN_V_NMC = 3.44
    MAX_V_NMC = 4.28

    def __init__(self, path: str, chemistry: str, test_set: int) -> None:
        """
        Select which chemistry and which test set will feed the pipeline.

        Parameters
        ----------
        path: str
            Root directory of the raw dataset (the one containing 'data' and 'mat').
        chemistry: str
            One of the values of CHEMISTRY_LIST.
        test_set: int
            Index of the test set, from 0 to NUM_TEST_SETS - 1.

        Raises
        ------
        RuntimeError
            If the chemistry or the test set is not a valid one.
        """
        super().__init__(path)
        if chemistry not in self.CHEMISTRY_LIST:
            raise RuntimeError("Invalid selected chemistry")
        if test_set not in range(self.NUM_TEST_SETS):
            raise RuntimeError("Invalid selected testing data")
        self.path = path
        self.test_set = test_set
        self.chemistry = chemistry
        self.MIN_V, self.MAX_V, _ = self.get_minmaxV(chemistry)

    def _parsed_file(self, chemistry: str, test_idx: int, file_name) -> str:
        """
        Build the path, inside the dataset directory, of a parsed file
        for a given chemistry and test set.
        """
        return path.join(self.dataset_dir,
                         "{}-{:02}_{}".format(chemistry, test_idx, file_name))

    def _voltage_window(self, material):
        """
        Return the (min_v, max_v) voltage window of a chemistry, or None if it is unknown.
        """
        windows = {
            'LFP': (self.MIN_V_LFP, self.MAX_V_LFP),
            'NCA': (self.MIN_V_NCA, self.MAX_V_NCA),
            'NMC': (self.MIN_V_NMC, self.MAX_V_NMC),
        }
        # Non-string values can never name a chemistry (and may be unhashable).
        if not isinstance(material, str):
            return None
        return windows.get(material)

    def dataset_files(self) -> Iterable:
        """
        List every parsed file generated by load(): a data file and a labels file
        for each chemistry and test set.

        Returns
        -------
        list with the paths of the parsed files.
        """
        files = []
        for chemistry in self.CHEMISTRY_LIST:
            for test_idx in range(self.NUM_TEST_SETS):
                files.append(self._parsed_file(chemistry, test_idx, File.DATA))
                files.append(self._parsed_file(chemistry, test_idx, File.LABELS))
        return files

    def load(self) -> None:
        """
        Parse dataset train/test data to match Conmo's standard.

        For every chemistry and every test set, a data file and a labels file are
        written to the dataset directory. In both of them the train samples are stored
        as sequence 1 and the test samples as sequence 2.
        """
        path_data = path.join(self.path, 'data')
        path_mat = path.join(self.path, 'mat')

        n_features = self.SIZE - 1
        feature_columns = ["feature_{:03}".format(i) for i in range(n_features)]
        label_columns = ['LLI', 'LAMPE', 'LAMNE']
        concat_keys = [1, 2]
        concat_names = [Index.SEQUENCE, Index.TIME]

        # The capacity file is shared by all the chemistries, so it is read only once
        Q = sio.loadmat(path.join(path_mat, 'Q.mat'))['Qnorm'].flatten()

        for chemistry in self.CHEMISTRY_LIST:
            # Read TRAIN DATA and generate dataframe
            train_data_np = np.load(
                path.join(path_data, 'x_train_' + chemistry + '.npy'))
            train_data = pd.DataFrame(train_data_np, columns=feature_columns)
            # Reset index for starting from 1
            train_data.index += 1

            # Test data is normalised with the range of the train data,
            # which is the same for every test set of this chemistry
            train_min = np.min(train_data_np)
            train_max = np.max(train_data_np)

            # Read TRAIN LABELS and generate dataframe
            train_labels_np = np.load(
                path.join(path_data, 'y_train_' + chemistry + '.npy'))
            train_labels = pd.DataFrame(train_labels_np, columns=label_columns)
            # Reset index for starting from 1
            train_labels.index += 1

            # Load TEST LABELS (the same over all types of test data)
            test_labels_np = sio.loadmat(
                path.join(path_mat, chemistry, 'test', 'y_test.mat'))['y_test']

            # Presumably the raw labels are percentages and are rescaled to fractions
            # to match the train labels -- TODO confirm
            test_labels_np = test_labels_np / 100
            # Reshape labels from (num_samples, cycles, sample_size) to (num_samples*cycles, degradation_modes)
            test_labels_np = test_labels_np.reshape(-1,
                                                    test_labels_np.shape[2])
            test_labels = pd.DataFrame(
                test_labels_np, columns=label_columns + ['capacity_loss'])

            # Delete last feature (capacity_loss) unused in this problem
            test_labels.drop('capacity_loss', axis=1, inplace=True)

            # Reset index for starting from 1
            test_labels.index += 1

            # Generate LABELS dataframe (it does not depend on the type of test data)
            labels = pd.concat([train_labels, test_labels],
                               keys=concat_keys, names=concat_names)
            labels.sort_index(inplace=True)

            # Iterate over different types of test data degradation
            for idx in range(self.NUM_TEST_SETS):
                # Read TEST DATA and generate dataframe
                test_data_np = sio.loadmat(
                    path.join(path_mat, chemistry, 'test', 'x_test_{}.mat'.format(idx)))['x_test'].T
                # (n_samples, seq_len)
                test_data_np = test_data_np.reshape(-1, test_data_np.shape[2])
                test_data_np = self.convert_to_input_data(
                    test_data_np, Q, n_features, chemistry)

                test_data_np = self.normalise_data(
                    test_data_np, train_min, train_max)

                # Convert to Pandas dataframe
                test_data = pd.DataFrame(test_data_np, columns=feature_columns)
                # Reset index for starting from 1
                test_data.index += 1

                # Generate DATA dataframe
                data = pd.concat([train_data, test_data],
                                 keys=concat_keys, names=concat_names)
                data.sort_index(inplace=True)

                # Save parsed dataframes to disk
                data.to_parquet(self._parsed_file(chemistry, idx, File.DATA),
                                compression="gzip", index=True)
                labels.to_parquet(self._parsed_file(chemistry, idx, File.LABELS),
                                  compression="gzip", index=True)

    def feed_pipeline(self, out_dir: str) -> None:
        """
        Copy the data and labels files of the selected chemistry and test set
        to the pipeline step folder.

        Parameters
        ----------
        out_dir:
            Directory where the selected files are copied to.
        """
        for file_name in (File.DATA, File.LABELS):
            shutil.copy(self._parsed_file(self.chemistry, self.test_set, file_name),
                        path.join(out_dir, file_name))

    def sklearn_predefined_split(self) -> Iterable[int]:
        """
        Generates array of indexes of same length as sequences to be used with 'PredefinedSplit'

        There is one entry per sequence stored by load(): -1 for the train sequence
        (never used as test by 'PredefinedSplit') and 0 for the test sequence (test fold 0).

        Returns
        -------
        array, list with the index for each sequence of the dataset.
        """
        return [-1, 0]

    def convert_to_input_data(self, ui_new: list, Q: list, size: int, material: str) -> np.ndarray:
        """
        Converts the voltage values of the real cells to the input data for the neural network

        Parameters
        ----------
        ui_new: array
            Voltage values of the cell at each cycle in percentage.
        Q: array
            Capacity percentages from 0 to 100 from the simulated dataset.
        size: int
            The length of the curves.
        material: str
            Chemistry of the cell. It selects the voltage window of the IC curves. If it is
            not one of the known chemistries, the window of the chemistry selected when
            creating the dataset is used.

        Returns
        -------
        x_test: array
            The input data for the neural network.
        """
        # The window must be the one of the chemistry being converted, which is not
        # necessarily the one selected for this instance (load() converts all of them)
        window = self._voltage_window(material)
        if window is None:
            window = (self.MIN_V, self.MAX_V)
        min_v, max_v = window

        samples = []
        for voltage_curve in ui_new:
            # convert to IC
            ui_sample, dqi_sample = self.IC(
                voltage_curve, Q, self.UI_STEP, min_v, max_v)
            # reduce size
            samples.append(self.reduce_size(ui_sample, dqi_sample, size))
        return np.array(samples)

    def IC(self, u: np.ndarray, q: np.ndarray, ui_step: float = 0.0005, minV: float = 3.2, maxV: float = 3.5) -> (np.ndarray, np.ndarray):
        """
        Get the ICA data for a given voltage curve

        Parameters
        ----------
        u: numpy array
            Voltage curve. It is used as the x-coordinates of 'np.interp',
            which expects them to be increasing.
        q: numpy array
            Capacity curve.
        ui_step: float
            Step of interpolation.
        minV: float
            Minimum voltage of the IC curve.
        maxV: float
            Maximum voltage of the IC curve.

        Returns
        -------
        ui, dqi: numpy arrays
            Interpolated voltage and derivative of capacity
        """
        # voltages values for which capacity is interpolated
        ui = np.arange(minV, maxV, ui_step)
        qi = np.interp(ui, u, q)
        # The difference has one element less, so the first voltage is dropped
        return ui[1:], np.diff(qi)

    def reduce_size(self, ui: np.ndarray, dqi: np.ndarray, size: int) -> np.ndarray:
        """
        Reduces the length of the IC data to a given size

        Parameters
        ----------
        ui: numpy array
            Voltage curve.
        dqi: numpy array
            Derivative of capacity (IC).
        size: int
            Size at which to reduce the IC data.

        Returns
        -------
        numpy array
            Reduced IC.
        """
        curve = pchip(ui, dqi)
        # Evaluate the interpolator on 'size' evenly spaced voltages over the same range
        ui_reduced = np.linspace(np.min(ui), np.max(ui), size)
        return curve(ui_reduced)

    def normalise_data(self, data: np.ndarray, min_val: float, max_val: float, low: int = 0, high: int = 1) -> np.ndarray:
        """
        Normalises the data to the range [low, high]

        Parameters
        ----------
        data: numpy array
            Data to normalise.
        min_val: float
            Value of the data mapped to 'low'.
        max_val: float
            Value of the data mapped to 'high'.
        low: float
            Minimum value of the range.
        high: float
            Maximum value of the range.

        Returns
        -------
        normalised_data: numpy array
            normalised data
        """
        normalised_data = (data - min_val)/(max_val - min_val)
        normalised_data = (high - low)*normalised_data + low
        return normalised_data

    def get_minmaxV(self, material: str) -> (float, float, str):
        """
        Returns the range voltage in which to study the IC curves

        Parameters
        ----------
        material: str
            Chemistry to study.

        Returns
        -------
        min_v, max_v, path: floats, str
            Min and max voltage values and path where data is located.
            If the chemistry is not found, an error is printed and -1 is returned instead.
        """
        window = self._voltage_window(material)
        if window is None:
            print("ERROR: Chemistry not found")
            return -1
        min_v, max_v = window
        return min_v, max_v, path.join(self.path, 'mat', material, 'diagnosis')