Source code for conmo.datasets.server_machine_dataset

import os
import shutil
from os import path
from typing import Iterable

import numpy as np
import pandas as pd

from conmo.conf import File, Index, Label
from conmo.datasets.dataset import RemoteDataset


[docs]class ServerMachineDataset(RemoteDataset):
    URL = 'https://github.com/NetManAIOps/OmniAnomaly/archive/refs/heads/master.zip'
    FILE_FORMAT = 'zip'
    CHECKSUM = 'ebef7f89e73cdc9e926bdbf8bb591447'
    CHECKSUM_FORMAT = 'md5'
    VARIABLES = ['cpu_r', 'load_1', 'load_5', 'load_15', 'mem_shmem', 'mem_u', 'mem_u_e', 'total_mem', 'disk_q', 'disk_r', 'disk_rb', 'disk_svc', 'disk_u', 'disk_w', 'disk_wa', 'disk_wb', 'si', 'so', 'eth1_fi', 'eth1_fo', 'eth1_pi', 'eth1_po',
                 'tcp_tw', 'tcp_use', 'active_opens', 'curr_estab', 'in_errs', 'in_segs', 'listen_overflows', 'out_rsts', 'out_segs', 'passive_opens', 'retransegs', 'tcp_timeouts', 'udp_in_dg', 'udp_out_dg', 'udp_rcv_buf_errs', 'udp_snd_buf_errs']
    LABEL = Label.ANOMALY
    SUBDATASETS = ['1-01', '1-02', '1-03', '1-04', '1-05', '1-06', '1-07', '1-08', '2-01', '2-02', '2-03', '2-04', '2-05', '2-06', '2-07', '2-08' , '2-09', '3-01', '3-02', '3-03', '3-04', '3-05', '3-06', '3-07', '3-08', '3-09', '3-10', '3-11']
    SEQUENCE_COLUMN = 'machine'
    TIME_COLUMN = 'time'

[docs]    def __init__(self, subdataset: str) -> None:
        super().__init__(self.URL, self.FILE_FORMAT, self.CHECKSUM, self.CHECKSUM_FORMAT)
        if subdataset not in self.SUBDATASETS:
            raise RuntimeError("Invalid selected subdataset")
        self.subdataset = subdataset

[docs]    def dataset_files(self) -> Iterable:
        files = []
        for key in self.SUBDATASETS:
            files.append(path.join(self.dataset_dir,
                         "{}_{}".format(key, File.DATA)))
            files.append(path.join(self.dataset_dir,
                         "{}_{}".format(key, File.LABELS)))
        return files

[docs]    def parse_to_package(self, raw_dir: str) -> None:
        # Warning: For SMD dataset, it's downloaded from the entire Github repository so the dataset is located in a subfolder
        raw_dir = path.join(raw_dir, 'OmniAnomaly-master',
                            'ServerMachineDataset')

        for subdataset in os.listdir(path.join(raw_dir, 'train')):
            # Read TRAIN and TEST DATA and generate dataframe
            train = pd.read_csv(path.join(raw_dir, 'train', subdataset),
                                sep=',', header=None, names=self.VARIABLES)
            # Reset index for starting from 1 (TIME)
            train.index += 1

            test = pd.read_csv(path.join(raw_dir, 'test', subdataset),
                               sep=',', header=None, names=self.VARIABLES)
            # Reset index for starting from 1 (TIME)
            test.index += 1

            # Generate DATA dataframe
            data = pd.concat([train, test], keys=[1, 2], names=[
                Index.SEQUENCE, Index.TIME])
            data.sort_index(inplace=True)

            # Generate LABELS dataframe and fill it with train anomaly labels set to 0
            labels = pd.read_csv(path.join(raw_dir, 'test_label', subdataset),
                                 sep=',', header=None, names=[self.LABEL])

            # Reset index for starting from 1 (TIME)
            labels.index += 1

            # Reindex labels with test/train index and fill it with train anomaly labels set to 0
            labels = pd.concat([labels], keys=[1], names=[
                Index.SEQUENCE, Index.TIME])
            labels = labels.reindex(data.index, fill_value=0)
            labels.sort_index(inplace=True)

            # Extract file data (for identifying subdatasets)
            group_index = subdataset.split('-')[1]
            index = subdataset.split('-')[2].split('.')[0]

            # Save dataframes
            data.to_parquet(path.join(self.dataset_dir, "{:01}-{:02}_{}".format(
                int(group_index), int(index), File.DATA)), compression="gzip", index=True)
            labels.to_parquet(path.join(self.dataset_dir, "{:01}-{:02}_{}".format(
                int(group_index), int(index), File.LABELS)), compression="gzip", index=True)

[docs]    def feed_pipeline(self, out_dir: str) -> None:
        shutil.copy(path.join(self.dataset_dir, "{}_{}".format(
            self.subdataset, File.DATA)), path.join(out_dir, File.DATA))
        shutil.copy(path.join(self.dataset_dir, "{}_{}".format(
            self.subdataset, File.LABELS)), path.join(out_dir, File.LABELS))

[docs]    def sklearn_predefined_split(self) -> Iterable[int]:
        """
        Generates array of indexes of same length as sequences to be used with 'PredefinedSplit'
        SMD dataset has only 2 sequences: one for train and another for test.

        Returns
        -------
        array
            List with the index for each sequence of the dataset.
        """
        return [-1, 0]