import os
import shutil
from os import path
from typing import Iterable
import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.interpolate import PchipInterpolator as pchip
from conmo.conf import File, Index
from conmo.datasets.dataset import LocalDataset
[docs]class BatteriesDataset(LocalDataset):
"""
This is a dataset obtained from measurements of certain types of degradation of three types of batteries.
Since it is a local dataset, it must be stored on disk with the following directory structure before
any experiment can be launched with it:

- DTW-Li-ion-Diagnosis
    - data: Data and labels for the three types of batteries are stored here
      (x_train_<chemistry>.npy and y_train_<chemistry>.npy).
    - mat:
        - LFP:
            - diagnosis:
                - V.mat
            - test:
                - V_references.mat
                - x_test_0.mat
                - x_test_1.mat
                - x_test_2.mat
                - x_test_3.mat
                - y_test.mat
        - NCA:
            - diagnosis
            - test
        - NMC:
            - The same as NCA and LFP
        - Q.mat
"""
# Chemistries for which parsed data/labels files are generated
CHEMISTRY_LIST = ['LFP', 'NCA', 'NMC']

# Per-chemistry voltage window handed out by get_minmaxV (presumably volts - confirm)
MIN_V_LFP = 3.20
MAX_V_LFP = 3.50
MIN_V_NCA = 3.20
MAX_V_NCA = 4.23
MIN_V_NMC = 3.44
MAX_V_NMC = 4.28

# Placeholders: __init__ overrides both on the instance with the selected chemistry's window
MIN_V = 0
MAX_V = 0

# load() requests SIZE - 1 points for every test curve
SIZE = 128
# NOTE(review): equals the default ui_step of IC; not referenced in the visible code - confirm usage
UI_STEP = 0.0005
def __init__(self, path: str, chemistry: str, test_set: int) -> None:
    """
    Select which chemistry and which test set this dataset instance serves.

    Parameters
    ----------
    path:
        Root directory of the dataset on disk; forwarded to the parent initialiser and kept on the instance.
    chemistry:
        One of the entries of ``CHEMISTRY_LIST``.
    test_set:
        Number of the test set to use, from 0 to 3.

    Raises
    ------
    RuntimeError
        If the chemistry or the test set is not one of the supported values.
    """
    super().__init__(path)

    # Validate the selection before storing anything on the instance
    chemistry_is_known = chemistry in self.CHEMISTRY_LIST
    if not chemistry_is_known:
        raise RuntimeError("Invalid selected chemistry")
    test_set_is_known = test_set in range(4)
    if not test_set_is_known:
        raise RuntimeError("Invalid selected testing data")

    self.path, self.test_set, self.chemistry = path, test_set, chemistry

    # Only the voltage limits are kept; the third returned element is ignored
    voltage_window = self.get_minmaxV(chemistry)
    self.MIN_V, self.MAX_V, _ = voltage_window
def dataset_files(self) -> Iterable:
    """
    List the parsed files that make up this dataset.

    Returns
    -------
    list
        Paths inside ``self.dataset_dir``: one data file followed by one labels file
        for every combination of chemistry and test set (0 to 3).
    """
    expected = []
    for chem in self.CHEMISTRY_LIST:
        for test_number in range(4):
            # Data first, then labels, for each (chemistry, test set) pair
            for file_kind in (File.DATA, File.LABELS):
                file_name = "{}-{:02}_{}".format(chem, test_number, file_kind)
                expected.append(path.join(self.dataset_dir, file_name))
    return expected
def load(self) -> None:
    """
    Parse dataset train/test data to match Conmo's standard.

    For every chemistry in ``CHEMISTRY_LIST`` the training arrays are read from
    ``<path>/data`` and the test arrays from ``<path>/mat/<chemistry>/test``. One data
    file and one labels file are then written to ``self.dataset_dir`` for each of the
    four test sets. In both files the outer index level (``Index.SEQUENCE``) is 1 for
    the training rows and 2 for the test rows, and the inner level (``Index.TIME``)
    counts rows starting from 1.
    """
    path_data = path.join(self.path, 'data')
    path_mat = path.join(self.path, 'mat')

    # Same length that is requested from convert_to_input_data below, so the
    # column names can never drift away from the generated curves
    n_features = self.SIZE - 1
    feature_columns = ["feature_{:03}".format(i) for i in range(n_features)]
    degradation_modes = ['LLI', 'LAMPE', 'LAMNE']
    index_names = [Index.SEQUENCE, Index.TIME]

    # The capacity file is shared by all chemistries, so it is read only once
    Q = sio.loadmat(path.join(path_mat, 'Q.mat'))['Qnorm'].flatten()

    for chemistry in self.CHEMISTRY_LIST:
        path_test = path.join(path_mat, chemistry, 'test')

        # Read TRAIN DATA and generate dataframe (index starting from 1)
        train_data_np = np.load(
            path.join(path_data, 'x_train_' + chemistry + '.npy'))
        train_data = pd.DataFrame(train_data_np, columns=feature_columns)
        train_data.index += 1
        # Test curves are scaled with the range of the training data; this does
        # not depend on the test set, so it is computed once per chemistry
        train_min = np.min(train_data_np)
        train_max = np.max(train_data_np)

        # Read TRAIN LABELS and generate dataframe (index starting from 1)
        train_labels_np = np.load(
            path.join(path_data, 'y_train_' + chemistry + '.npy'))
        train_labels = pd.DataFrame(train_labels_np, columns=degradation_modes)
        train_labels.index += 1

        # Load TEST LABELS (the same over all types of test data)
        test_labels_np = sio.loadmat(
            path.join(path_test, 'y_test.mat'))['y_test']
        # presumably stored as percentages and converted to fractions - confirm
        test_labels_np = test_labels_np / 100
        # Collapse the two leading axes so that every row is one sample
        test_labels_np = test_labels_np.reshape(-1, test_labels_np.shape[2])
        test_labels = pd.DataFrame(
            test_labels_np, columns=degradation_modes + ['capacity_loss'])
        # Delete last feature (capacity_loss), unused in this problem
        test_labels = test_labels.drop('capacity_loss', axis=1)
        test_labels.index += 1

        # Generate LABELS dataframe; identical for the four test sets
        labels = pd.concat([train_labels, test_labels],
                           keys=[1, 2], names=index_names)
        labels.sort_index(inplace=True)

        # Iterate over different types of test data degradation
        for idx in range(4):
            # Read TEST DATA
            test_data_np = sio.loadmat(
                path.join(path_test, 'x_test_{}.mat'.format(idx)))['x_test'].T
            # (n_samples, seq_len)
            test_data_np = test_data_np.reshape(-1, test_data_np.shape[2])
            # NOTE(review): convert_to_input_data is not defined in the visible
            # part of this class - presumably defined further down or inherited
            test_data_np = self.convert_to_input_data(
                test_data_np, Q, n_features, chemistry)
            test_data_np = self.normalise_data(
                test_data_np, train_min, train_max)

            # Convert to Pandas dataframe (index starting from 1)
            test_data = pd.DataFrame(test_data_np, columns=feature_columns)
            test_data.index += 1

            # Generate DATA dataframe
            data = pd.concat([train_data, test_data],
                             keys=[1, 2], names=index_names)
            data.sort_index(inplace=True)

            # Save parsed dataframes to disk
            data.to_parquet(path.join(self.dataset_dir, "{}-{:02}_{}".format(
                chemistry, idx, File.DATA)), compression="gzip", index=True)
            labels.to_parquet(path.join(self.dataset_dir, "{}-{:02}_{}".format(
                chemistry, idx, File.LABELS)), compression="gzip", index=True)
def feed_pipeline(self, out_dir: str) -> None:
    """
    Copy the parsed files of the selected chemistry and test set to a pipeline step folder.

    Parameters
    ----------
    out_dir:
        Destination directory. The data file is copied there under the name
        ``File.DATA`` and the labels file under the name ``File.LABELS``.
    """
    for file_kind in (File.DATA, File.LABELS):
        parsed_name = "{}-{:02}_{}".format(
            self.chemistry, self.test_set, file_kind)
        source = path.join(self.dataset_dir, parsed_name)
        destination = path.join(out_dir, file_kind)
        shutil.copy(source, destination)
def sklearn_predefined_split(self) -> Iterable[int]:
    """
    Build the fold assignment to be used with scikit-learn's 'PredefinedSplit'.

    Returns
    -------
    list
        One entry per sequence of the dataset. Following the 'PredefinedSplit'
        convention, -1 keeps a sequence out of every test fold and 0 places it
        in test fold 0.
    """
    fold_per_sequence = [-1, 0]
    return fold_per_sequence
def IC(self, u: np.ndarray, q: np.ndarray, ui_step: float = 0.0005, minV: float = 3.2, maxV: float = 3.5) -> "tuple[np.ndarray, np.ndarray]":
    '''
    Get the ICA data for a given voltage curve

    The capacity is interpolated linearly on a regular voltage grid and the
    difference between consecutive interpolated capacities is returned (the
    differences are not divided by ``ui_step``).

    Parameters
    ----------
    u: numpy array
        Voltage curve. May be given in increasing or in decreasing order.
    q: numpy array
        Capacity curve, paired element by element with ``u``.
    ui_step: float
        Step of interpolation.
    minV: float
        Minimum voltage of the IC curve.
    maxV: float
        Maximum voltage of the IC curve (excluded from the grid).

    Returns
    -------
    ui, dqi: numpy arrays
        Interpolated voltage (without its first point, so both arrays have the
        same length) and difference of the interpolated capacity.
    '''
    u = np.asarray(u)
    q = np.asarray(q)
    # np.interp does not check its sample points and silently returns nonsense
    # unless they are increasing, so a curve given from high to low voltage is
    # flipped (together with its capacities) before interpolating
    if u.ndim == 1 and u.size > 1 and u[0] > u[-1]:
        u, q = u[::-1], q[::-1]

    # voltages values for which capacity is interpolated
    ui = np.arange(minV, maxV, ui_step)
    qi = np.interp(ui, u, q)
    return ui[1:], np.diff(qi)
def reduce_size(self, ui: np.ndarray, dqi: np.ndarray, size: int) -> np.ndarray:
    '''
    Reduces the length of the IC data to a given size

    A PCHIP interpolator is fitted to the curve and evaluated on ``size``
    evenly spaced voltages spanning the whole range of ``ui``.

    Parameters
    ----------
    ui: numpy array
        Voltage curve.
    dqi: numpy array
        Derivative of capacity (IC).
    size: int
        Size at which to reduce the IC data.

    Returns
    -------
    numpy array
        Reduced IC.
    '''
    curve = pchip(ui, dqi)
    # NumPy reductions instead of the builtin min/max, which would walk the
    # array element by element in Python
    ui_reduced = np.linspace(np.min(ui), np.max(ui), size)
    return curve(ui_reduced)
def normalise_data(self, data: np.ndarray, min_val: float, max_val: float, low: int = 0, high: int = 1) -> np.ndarray:
    '''
    Normalises the data to the range [low, high]

    Values equal to ``min_val`` map to ``low`` and values equal to ``max_val``
    map to ``high``; nothing is clipped, so data outside [min_val, max_val]
    lands outside [low, high].

    Parameters
    ----------
    data: numpy array
        Data to normalise.
    min_val: float
        Minimum value of data.
    max_val: float
        Maximum value of data.
    low: float
        Minimum value of the range.
    high: float
        Maximum value of the range.

    Returns
    -------
    normalised_data: numpy array
        Normalised data, with the same shape as ``data``.
    '''
    value_range = max_val - min_val
    # A zero range would turn the whole output into NaN/inf; treat it as 1 so
    # that constant data simply maps to ``low``
    if np.ndim(value_range) == 0:
        if value_range == 0:
            value_range = 1
    else:
        value_range = np.where(value_range == 0, 1, value_range)

    normalised_data = (data - min_val) / value_range
    normalised_data = (high - low) * normalised_data + low
    return normalised_data
def get_minmaxV(self, material: str) -> "tuple[float, float, str]":
    '''
    Returns the range voltage in which to study the IC curves

    Parameters
    ----------
    material: str
        Chemistry to study: "LFP", "NCA" or "NMC".

    Returns
    -------
    min_v, max_v, path: float, float, str
        Min and max voltage values and path where the diagnosis data of the
        chemistry is located.

    Raises
    ------
    RuntimeError
        If the chemistry is not one of the supported ones. Previously -1 was
        returned in that case, which broke callers unpacking the result.
    '''
    voltage_windows = {
        "LFP": (self.MIN_V_LFP, self.MAX_V_LFP),
        "NCA": (self.MIN_V_NCA, self.MAX_V_NCA),
        "NMC": (self.MIN_V_NMC, self.MAX_V_NMC),
    }
    try:
        min_v, max_v = voltage_windows[material]
    except KeyError:
        # Same exception type __init__ uses for an unsupported chemistry
        raise RuntimeError(
            "Chemistry not found: {!r}".format(material)) from None

    tmp_path = path.join(self.path, 'mat', material, 'diagnosis')
    return min_v, max_v, tmp_path