# Source code for rarity.data_loader.data_loader

# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
import pandas as pd

from rarity.data_loader import BaseLoader
from rarity.utils.common_functions import is_regression, is_classification


class CSVDataLoader(BaseLoader):
    '''
    Dataloader that compiles all input files in csv format.

    The csv files are not cached: every ``get_*`` call reads the relevant file(s) again with ``pd.read_csv``.

    Args:
        xFeatures_file (str):
            Path to csv file that contains all xfeatures used for model development/training.

            `Example of csv file storing xfeatures :`

            +-----------+-----------+
            | feature_0 | feature_1 |
            +===========+===========+
            | 21        | B         |
            +-----------+-----------+
            | 36        | A         |
            +-----------+-----------+


        yTrue_file (str):
            Path to csv file that contains all actual values (regression) / true labels (classification)

            `Example of csv file storing yTrue values for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 78634  |
            +--------+
            | 98273  |
            +--------+
            | 2780   |
            +--------+

            ``Binary Classification`` :

            +--------+
            | churn  |
            +========+
            | 1      |
            +--------+
            | 0      |
            +--------+
            | 1      |
            +--------+

            ``Multiclass Classification`` :

            +--------+
            | size   |
            +========+
            | big    |
            +--------+
            | medium |
            +--------+
            | small  |
            +--------+

        yPred_file_ls (:obj:`List[str]`):
            List consists of csv file paths that contain prediction values / probabilities generated by specific model type.
            One csv file for 1 model prediction outputs. Defaults to an empty list.

            `Example of csv file storing yPred values / labels for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 83683  |
            +--------+
            | 67293  |
            +--------+

            ``Binary Classification`` :

            +--------+--------+
            | 0      | 1      |
            +========+========+
            | 0.0675 | 0.9325 |
            +--------+--------+
            | 0.6237 | 0.3767 |
            +--------+--------+

            ``Multiclass Classification`` :

            +--------+--------+--------+
            | big    | medium | small  |
            +========+========+========+
            | 0.7772 | 0.1140 | 0.1088 |
            +--------+--------+--------+
            | 0.0014 | 0.8169 | 0.1817 |
            +--------+--------+--------+

        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred, in the same order as
            ``yPred_file_ls``. Defaults to an empty list.

        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type. Supported analysis types : ``Regression``, ``Binary Classification``, ``Multiclass Classification``

    Raises:
        AssertionError:
            If ``analysis_type`` is missing or unsupported, if the number of yPred files differs from the number of
            model names, if a multiclass yPred file has 2 or fewer columns, or if a binary yTrue file does not
            contain exactly 2 unique values.
    '''
    def __init__(
        self,
        xFeatures_file: str,
        yTrue_file: str,
        yPred_file_ls: List[str] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # Build fresh lists per instance; a literal ``[]`` default would be one object shared by every call.
        yPred_file_ls = [] if yPred_file_ls is None else yPred_file_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls
        # NOTE(review): self.xFeatures / self.yTrue / self.yPreds / self.models are presumably populated by
        # BaseLoader.__init__ (not visible in this file) - confirm against the base class.
        super().__init__(xFeatures_file, yTrue_file, yPred_file_ls, model_names_ls, analysis_type)

        # Validation raises explicitly instead of using ``assert`` so it still runs under ``python -O``;
        # the exception type stays AssertionError so existing callers keep working.
        unsupported_msg = "Currently supported analysis types: " \
            "['Regression', 'Binary Classification', 'Multiclass Classification']"
        if not isinstance(analysis_type, str):
            # Covers the ``None`` default, which would otherwise fail on ``.lower()`` with an opaque AttributeError.
            raise AssertionError(unsupported_msg)
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'.
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        if len(self.yPreds) != len(self.models):
            raise AssertionError('no. of yPred_files must be equal to no. of model_names in correct order')

        if self.analysis_type not in ['regression', 'binary-classification', 'multiclass-classification']:
            raise AssertionError(unsupported_msg)

        if self.analysis_type == 'multiclass-classification':
            # Only the header row is needed to count the columns of each prediction file.
            cols_ls = [list(pd.read_csv(yPred, nrows=0).columns) for yPred in self.yPreds]
            if not all(len(col) > 2 for col in cols_ls):
                raise AssertionError("Data for yPred_file doesn't seem to be a multiclass prediction. "
                                     "Please ensure there is prediction probabilities for each class in the yPred file.")

        if self.analysis_type == 'binary-classification':
            # The first column of the yTrue file must hold exactly two distinct values.
            if pd.read_csv(self.yTrue).nunique().values[0] != 2:
                raise AssertionError("Data for yTrue_file doesn't seem to be a binary-class "
                                     "prediction. Please ensure yTrue_file consists of array of only 2 unique class")

    def get_features(self):
        '''Read the xFeatures csv file and return it as a dataframe.'''
        return pd.read_csv(self.xFeatures)

    def get_yTrue(self):
        '''Read the yTrue csv file and return it as a dataframe whose first column is renamed to ``yTrue``.'''
        yTrue = pd.read_csv(self.yTrue)
        yTrue.rename(columns={yTrue.columns[0]: 'yTrue'}, inplace=True)
        return yTrue

    def get_yPreds(self):
        '''
        Read every yPred csv file and shape the result according to the analysis type.

        Returns:
            For regression, a single dataframe in which the first column of each prediction file is renamed
            to ``yPred_<model name>`` and the files are concatenated column-wise.
            For classification, a list with one dataframe per model, each extended with a ``model`` column
            and a ``yPred-label`` column holding the name of the column with the highest value in each row.
            ``None`` if the analysis type is neither regression nor classification according to
            ``is_regression`` / ``is_classification``.
        '''
        yPred_ls = [pd.read_csv(yPred) for yPred in self.yPreds]
        if is_regression(self.analysis_type):
            # Rename per file *before* concatenating: prediction files commonly share the same column name
            # (e.g. 'price'), and a label-based rename on the concatenated frame cannot tell them apart.
            # This also covers any number of models, not just the first two.
            renamed_ls = [
                yPred.rename(columns={yPred.columns[0]: f'yPred_{model}'})
                for yPred, model in zip(yPred_ls, self.models)
            ]
            return pd.concat(renamed_ls, axis=1)
        elif is_classification(self.analysis_type):
            for yPred, model in zip(yPred_ls, self.models):
                # Capture the columns read from the file before adding the derived ones.
                prob_cols = list(yPred.columns)
                yPred['model'] = model
                yPred['yPred-label'] = yPred[prob_cols].idxmax(axis=1)
            return yPred_ls

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type (lower-case, spaces replaced by hyphens).'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine xFeatures, yTrue and yPreds column-wise.

        Returns:
            For regression, one dataframe. For classification, a list with one combined dataframe per model.
            ``None`` if the analysis type is neither regression nor classification according to
            ``is_regression`` / ``is_classification``.
        '''
        if is_regression(self.analysis_type):
            return pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)
        elif is_classification(self.analysis_type):
            # Read each csv once and reuse it, rather than re-reading every file for every model.
            features = self.get_features()
            yTrue = self.get_yTrue()
            return [pd.concat([features, yTrue, yPred], axis=1) for yPred in self.get_yPreds()]


class DataframeLoader(BaseLoader):
    '''
    Dataloader that compiles all xFeatures, yTrue, yPreds in dataframe format.

    ``df_yTrue`` and every dataframe in ``df_yPred_ls`` are copied on construction, so the columns this
    loader adds or renames do not leak back into the caller's dataframes.

    Args:
        df_xFeatures (:obj:`~pd.DataFrame`):
            Dataframe that contains all xfeatures used for model development/training.

        df_yTrue (:obj:`~pd.DataFrame`):
            Dataframe that contains all true values / labels.

        df_yPred_ls (:obj:`List[~pd.DataFrame]`):
            List of dataframes that contain all predicted values (regression) / probabilities (classification),
            one dataframe per model. Defaults to an empty list.

        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred, in the same order as
            ``df_yPred_ls``. Defaults to an empty list.

        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type. Supported analysis types : ``Regression``, ``Binary Classification``, ``Multiclass Classification``

    Raises:
        AssertionError:
            If ``analysis_type`` is missing or unsupported, if the number of yPred dataframes differs from the
            number of model names, if a multiclass yPred dataframe has 2 or fewer columns, or if a binary
            ``df_yTrue`` does not contain exactly 2 unique values in its first column.
    '''
    def __init__(
        self,
        df_xFeatures: pd.DataFrame,
        df_yTrue: pd.DataFrame,
        df_yPred_ls: List[pd.DataFrame] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # Build fresh lists per instance; a literal ``[]`` default would be one object shared by every call.
        df_yPred_ls = [] if df_yPred_ls is None else df_yPred_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls
        # NOTE(review): self.xFeatures and self.models are presumably populated by BaseLoader.__init__
        # (not visible in this file) - confirm against the base class.
        super().__init__(df_xFeatures, df_yTrue, df_yPred_ls, model_names_ls, analysis_type)
        # Work on copies because get_yTrue / get_yPreds modify these frames in place.
        self.yTrue = df_yTrue.copy()
        self.yPreds = [df.copy() for df in df_yPred_ls]

        # Validation raises explicitly instead of using ``assert`` so it still runs under ``python -O``;
        # the exception type stays AssertionError so existing callers keep working.
        unsupported_msg = "Currently supported analysis types: " \
            "['Regression', 'Binary Classification', 'Multiclass Classification']"
        if not isinstance(analysis_type, str):
            # Covers the ``None`` default, which would otherwise fail on ``.lower()`` with an opaque AttributeError.
            raise AssertionError(unsupported_msg)
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'.
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        if len(self.yPreds) != len(self.models):
            raise AssertionError('no. of yPred_files must be equal to no. of model_names in correct order')

        if self.analysis_type not in ['regression', 'binary-classification', 'multiclass-classification']:
            raise AssertionError(unsupported_msg)

        if self.analysis_type == 'multiclass-classification':
            cols_ls = [list(yPred.columns) for yPred in self.yPreds]
            if not all(len(col) > 2 for col in cols_ls):
                raise AssertionError("Data for df_yPred_list doesn't seem to be a multiclass prediction. "
                                     "Please ensure there is prediction probabilities for each class in the df of df_yPred_list.")

        if self.analysis_type == 'binary-classification':
            # The first column of df_yTrue must hold exactly two distinct values.
            if self.yTrue.nunique().values[0] != 2:
                raise AssertionError("Data for df_yTrue doesn't seem to be a binary-class prediction. "
                                     "Please ensure df_yTrue consists of data with only 2 unique class")

    def get_features(self):
        '''Return the xFeatures dataframe.'''
        return self.xFeatures

    def get_yTrue(self):
        '''Rename the first column of the stored yTrue dataframe to ``yTrue`` (in place) and return it.'''
        self.yTrue.rename(columns={self.yTrue.columns[0]: 'yTrue'}, inplace=True)
        return self.yTrue

    def get_yPreds(self):
        '''
        Shape the stored prediction dataframes according to the analysis type.

        Returns:
            For regression, a new dataframe in which the first column of each prediction dataframe is renamed
            to ``yPred_<model name>`` and the dataframes are concatenated column-wise.
            For classification, the stored list of prediction dataframes, each extended in place with a
            ``model`` column and a ``yPred-label`` column holding the name of the probability column with
            the highest value in each row.
            ``None`` if the analysis type is neither regression nor classification according to
            ``is_regression`` / ``is_classification``.
        '''
        if is_regression(self.analysis_type):
            # Rename per dataframe *before* concatenating: prediction frames commonly share the same column
            # name (e.g. 'price'), and a label-based rename on the concatenated frame cannot tell them apart.
            # This also covers any number of models, not just the first two.
            renamed_ls = [
                yPred.rename(columns={yPred.columns[0]: f'yPred_{model}'})
                for yPred, model in zip(self.yPreds, self.models)
            ]
            return pd.concat(renamed_ls, axis=1)
        elif is_classification(self.analysis_type):
            # Columns added by this method on a previous call; they must never take part in idxmax.
            derived_cols = ('model', 'yPred-label')
            for yPred, model in zip(self.yPreds, self.models):
                # Use every class-probability column, not only the first two, so multiclass labels are right.
                prob_cols = [col for col in yPred.columns if col not in derived_cols]
                yPred['model'] = model
                yPred['yPred-label'] = yPred[prob_cols].idxmax(axis=1)
            return self.yPreds

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type (lower-case, spaces replaced by hyphens).'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine xFeatures, yTrue and yPreds column-wise.

        Returns:
            For regression, one dataframe. For classification, a list with one combined dataframe per model.
            ``None`` if the analysis type is neither regression nor classification according to
            ``is_regression`` / ``is_classification``.
        '''
        if is_regression(self.analysis_type):
            return pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)
        elif is_classification(self.analysis_type):
            # Prepare yTrue and the prediction frames once instead of once per model.
            yTrue = self.get_yTrue()
            return [pd.concat([self.xFeatures, yTrue, yPred], axis=1) for yPred in self.get_yPreds()]