Source code for rarity.data_loader.data_loader

# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
import pandas as pd

from rarity.data_loader import BaseLoader
from rarity.utils.common_functions import is_regression, is_classification


class CSVDataLoader(BaseLoader):
    '''
    Dataloader that compiles all input files in csv format.

    Args:
        xFeatures_file (str):
            Path to csv file that contains all xfeatures used for model development/training.

            `Example of csv file storing xfeatures :`

            +-----------+-----------+
            | feature_0 | feature_1 |
            +-----------+-----------+
            | 21        | B         |
            +-----------+-----------+
            | 36        | A         |
            +-----------+-----------+

        yTrue_file (str):
            Path to csv file that contains all actual values (regression) / true labels (classification)

            `Example of csv file storing yTrue values for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 78634  |
            +--------+
            | 98273  |
            +--------+
            | 2780   |
            +--------+

            ``Binary Classification`` :

            +--------+
            | churn  |
            +========+
            | 1      |
            +--------+
            | 0      |
            +--------+
            | 1      |
            +--------+

            ``Multiclass Classification`` :

            +--------+
            | size   |
            +========+
            | big    |
            +--------+
            | medium |
            +--------+
            | small  |
            +--------+

        yPred_file_ls (:obj:`List[str]`):
            List consists of csv file paths that contain prediction values / probabilities generated by
            specific model type. One csv file for 1 model prediction outputs

            `Example of csv file storing yPred values / labels for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 83683  |
            +--------+
            | 67293  |
            +--------+

            ``Binary Classification`` :

            +--------+--------+
            | 0      | 1      |
            +========+========+
            | 0.0675 | 0.9325 |
            +--------+--------+
            | 0.6237 | 0.3767 |
            +--------+--------+

            ``Multiclass Classification`` :

            +--------+--------+--------+
            | big    | medium | small  |
            +========+========+========+
            | 0.7772 | 0.1140 | 0.1088 |
            +--------+--------+--------+
            | 0.0014 | 0.8169 | 0.1817 |
            +--------+--------+--------+

        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred
        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type.
            Supported analysis types : ``Regression``, ``Binary Classification``, ``Multiclass Classification``

    Raises:
        AssertionError: If ``analysis_type`` is missing or not one of the supported types, if the number of
            prediction files differs from the number of model names, if a multiclass prediction file has
            2 or fewer columns, or if the first column of the binary yTrue file does not hold exactly
            2 unique values.
    '''
    def __init__(
        self,
        xFeatures_file: str,
        yTrue_file: str,
        yPred_file_ls: List[str] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # ``None`` stands in for an empty list so no list object is shared between instances.
        yPred_file_ls = [] if yPred_file_ls is None else yPred_file_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls
        # NOTE(review): the rest of this class reads self.xFeatures, self.yTrue, self.yPreds and self.models,
        # presumably stored by BaseLoader.__init__ (not visible here) - confirm.
        super().__init__(xFeatures_file, yTrue_file, yPred_file_ls, model_names_ls, analysis_type)

        unsupported_msg = "Currently supported analysis types: " \
            "['Regression', 'Binary Classification', 'Multiclass Classification']"

        # Validation raises AssertionError explicitly (same exception type as the former ``assert``
        # statements) so the checks are not stripped when Python runs with ``-O``.
        if not isinstance(analysis_type, str):
            # A missing analysis type used to fail with an opaque AttributeError on ``.lower()``.
            raise AssertionError(unsupported_msg)
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        if len(self.yPreds) != len(self.models):
            raise AssertionError('no. of yPred_files must be equal to no. of model_names in correct order')
        if self.analysis_type not in ['regression', 'binary-classification', 'multiclass-classification']:
            raise AssertionError(unsupported_msg)

        if self.analysis_type == 'multiclass-classification':
            cols_ls = [list(pd.read_csv(yPred).columns) for yPred in self.yPreds]
            if not all(len(col) > 2 for col in cols_ls):
                raise AssertionError(
                    "Data for yPred_file doesn't seem to be a multiclass prediction. "
                    "Please ensure there is prediction probabilities for each class in the yPred file."
                )
        if self.analysis_type == 'binary-classification':
            if pd.read_csv(self.yTrue).nunique().values[0] != 2:
                raise AssertionError(
                    "Data for yTrue_file doesn't seem to be a binary-class "
                    "prediction. Please ensure yTrue_file consists of array of only 2 unique class"
                )

    def get_features(self):
        '''Read the xFeatures csv file and return it as a dataframe.'''
        return pd.read_csv(self.xFeatures)

    def get_yTrue(self):
        '''Read the yTrue csv file and return it as a dataframe whose first column is renamed to ``yTrue``.'''
        yTrue = pd.read_csv(self.yTrue)
        yTrue.rename(columns={yTrue.columns[0]: 'yTrue'}, inplace=True)
        return yTrue

    def get_yPreds(self):
        '''
        Read every prediction csv file and shape the result according to the analysis type.

        Returns:
            Regression: a single dataframe with all prediction files concatenated column-wise, where the
            i-th column is renamed to ``yPred_<i-th model name>``.

            Classification: a list with one dataframe per prediction file; each keeps its original columns
            and gains a ``model`` column (the model name) and a ``yPred-label`` column (the name of the
            column holding the row-wise maximum).

            ``None`` when the analysis type is neither regression nor classification.
        '''
        yPred_ls = [pd.read_csv(yPred) for yPred in self.yPreds]

        if is_regression(self.analysis_type):
            yPred_df = pd.concat(yPred_ls, axis=1)
            # Rename by position, not by label: prediction files commonly share the same column name
            # (e.g. 'price'), and a label-keyed rename would give every such column the same new name.
            # This also covers any number of models instead of only the first two.
            new_columns = list(yPred_df.columns)
            for position, model in enumerate(self.models[:len(new_columns)]):
                new_columns[position] = f'yPred_{model}'
            yPred_df.columns = new_columns
            return yPred_df

        elif is_classification(self.analysis_type):
            for model, yPred in zip(self.models, yPred_ls):
                # Capture the columns read from the file before the helper columns are appended.
                proba_cols = list(yPred.columns)
                yPred['model'] = model
                yPred['yPred-label'] = yPred[proba_cols].idxmax(axis=1)
            return yPred_ls

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type (lower case, spaces replaced by hyphens).'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine features, yTrue and predictions column-wise.

        Returns:
            Regression: one dataframe holding features, ``yTrue`` and all ``yPred_<model>`` columns.

            Classification: a list with one dataframe per model, each holding features, ``yTrue`` and that
            model's prediction columns.

            ``None`` when the analysis type is neither regression nor classification.
        '''
        if is_regression(self.analysis_type):
            return pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)

        elif is_classification(self.analysis_type):
            # Load each csv file once up front rather than re-reading all of them for every model.
            features = self.get_features()
            yTrue = self.get_yTrue()
            return [pd.concat([features, yTrue, yPred], axis=1) for yPred in self.get_yPreds()]
class DataframeLoader(BaseLoader):
    '''
    Dataloader that compiles all xFeatures, yTrue, yPreds in dataframe format.

    Args:
        df_xFeatures (:obj:`~pd.DataFrame`):
            Dataframe that contains all xfeatures used for model development/training.
        df_yTrue (:obj:`~pd.DataFrame`):
            Dataframe that contains all true values / labels.
        df_yPred_ls (:obj:`List[~pd.DataFrame]`):
            Dataframe that contains all predicted values (regression) / probabilities (classification).
        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred.
        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type.
            Supported analysis types : ``Regression``, ``Binary Classification``, ``Multiclass Classification``

    Raises:
        AssertionError: If ``analysis_type`` is missing or not one of the supported types, if the number of
            prediction dataframes differs from the number of model names, if a multiclass prediction
            dataframe has 2 or fewer columns, or if the first column of the binary ``df_yTrue`` does not hold
            exactly 2 unique values.
    '''
    def __init__(
        self,
        df_xFeatures: pd.DataFrame,
        df_yTrue: pd.DataFrame,
        df_yPred_ls: List[pd.DataFrame] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # ``None`` stands in for an empty list so no list object is shared between instances.
        df_yPred_ls = [] if df_yPred_ls is None else df_yPred_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls
        # NOTE(review): self.xFeatures and self.models are read below but never assigned in this class;
        # presumably BaseLoader.__init__ (not visible here) stores them - confirm.
        super().__init__(df_xFeatures, df_yTrue, df_yPred_ls, model_names_ls, analysis_type)

        # Work on copies: get_yTrue / get_yPreds modify these frames in place and must not touch
        # the caller's dataframes.
        self.yTrue = df_yTrue.copy()
        self.yPreds = [df.copy() for df in df_yPred_ls]

        unsupported_msg = "Currently supported analysis types: " \
            "['Regression', 'Binary Classification', 'Multiclass Classification']"

        # Validation raises AssertionError explicitly (same exception type as the former ``assert``
        # statements) so the checks are not stripped when Python runs with ``-O``.
        if not isinstance(analysis_type, str):
            # A missing analysis type used to fail with an opaque AttributeError on ``.lower()``.
            raise AssertionError(unsupported_msg)
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        if len(self.yPreds) != len(self.models):
            raise AssertionError('no. of yPred_files must be equal to '
                                 'no. of model_names in correct order')
        if self.analysis_type not in ['regression', 'binary-classification', 'multiclass-classification']:
            raise AssertionError(unsupported_msg)

        if self.analysis_type == 'multiclass-classification':
            cols_ls = [list(yPred.columns) for yPred in self.yPreds]
            if not all(len(col) > 2 for col in cols_ls):
                raise AssertionError(
                    "Data for df_yPred_list doesn't seem to be a multiclass prediction. "
                    "Please ensure there is prediction probabilities for each class in the df of df_yPred_list."
                )
        if self.analysis_type == 'binary-classification':
            if self.yTrue.nunique().values[0] != 2:
                raise AssertionError(
                    "Data for df_yTrue doesn't seem to be a binary-class prediction. "
                    "Please ensure df_yTrue consists of data with only 2 unique class"
                )

    def get_features(self):
        '''Return the xFeatures dataframe.'''
        return self.xFeatures

    def get_yTrue(self):
        '''Return the internal yTrue dataframe with its first column renamed (in place) to ``yTrue``.'''
        self.yTrue.rename(columns={self.yTrue.columns[0]: 'yTrue'}, inplace=True)
        return self.yTrue

    def get_yPreds(self):
        '''
        Shape the prediction dataframes according to the analysis type.

        Returns:
            Regression: a single dataframe with all prediction dataframes concatenated column-wise, where
            the i-th column is renamed to ``yPred_<i-th model name>``.

            Classification: the internal list of prediction dataframes (one per model), each updated in
            place with a ``model`` column (the model name) and a ``yPred-label`` column (the name of the
            column holding the row-wise maximum).

            ``None`` when the analysis type is neither regression nor classification.
        '''
        if is_regression(self.analysis_type):
            yPred_df = pd.concat(self.yPreds, axis=1)
            # Rename by position, not by label: prediction frames commonly share the same column name
            # (e.g. 'price'), and a label-keyed rename would give every such column the same new name.
            # This also covers any number of models instead of only the first two.
            new_columns = list(yPred_df.columns)
            for position, model in enumerate(self.models[:len(new_columns)]):
                new_columns[position] = f'yPred_{model}'
            yPred_df.columns = new_columns
            return yPred_df

        elif is_classification(self.analysis_type):
            # The frames are modified in place and this method can run several times (get_all calls it),
            # so the helper columns added by an earlier call are excluded by name. Every remaining column
            # takes part in the arg-max; the previous ``[:2]`` slice ignored all classes after the second
            # for multiclass predictions.
            helper_cols = ('model', 'yPred-label')
            for model, yPred in zip(self.models, self.yPreds):
                proba_cols = [col for col in yPred.columns if col not in helper_cols]
                yPred['model'] = model
                yPred['yPred-label'] = yPred[proba_cols].idxmax(axis=1)
            return self.yPreds

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type (lower case, spaces replaced by hyphens).'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine features, yTrue and predictions column-wise.

        Returns:
            Regression: one dataframe holding features, ``yTrue`` and all ``yPred_<model>`` columns.

            Classification: a list with one dataframe per model, each holding features, ``yTrue`` and that
            model's prediction columns.

            ``None`` when the analysis type is neither regression nor classification.
        '''
        if is_regression(self.analysis_type):
            return pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)

        elif is_classification(self.analysis_type):
            # Prepare the shared pieces once instead of recomputing them for every model.
            features = self.get_features()
            yTrue = self.get_yTrue()
            return [pd.concat([features, yTrue, yPred], axis=1) for yPred in self.get_yPreds()]