# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import pandas as pd
from rarity.data_loader import BaseLoader
from rarity.utils.common_functions import is_regression, is_classification
class CSVDataLoader(BaseLoader):
    '''
    Dataloader that compiles all input files in csv format.

    Args:
        xFeatures_file (str):
            Path to csv file that contains all xfeatures used for model development/training.

            `Example of csv file storing xfeatures :`

            +-----------+-----------+
            | feature_0 | feature_1 |
            +===========+===========+
            | 21        | B         |
            +-----------+-----------+
            | 36        | A         |
            +-----------+-----------+

        yTrue_file (str):
            Path to csv file that contains all actual values (regression) / true labels (classification)

            `Example of csv file storing yTrue values for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 78634  |
            +--------+
            | 98273  |
            +--------+
            | 2780   |
            +--------+

            ``Binary Classification`` :

            +--------+
            | churn  |
            +========+
            | 1      |
            +--------+
            | 0      |
            +--------+
            | 1      |
            +--------+

            ``Multiclass Classification`` :

            +--------+
            | size   |
            +========+
            | big    |
            +--------+
            | medium |
            +--------+
            | small  |
            +--------+

        yPred_file_ls (:obj:`List[str]`):
            List consists of csv file paths that contain prediction values / probabilities generated by
            specific model type. One csv file for 1 model prediction outputs

            `Example of csv file storing yPred values / labels for`

            ``Regression`` :

            +--------+
            | price  |
            +========+
            | 83683  |
            +--------+
            | 67293  |
            +--------+

            ``Binary Classification`` :

            +--------+--------+
            | 0      | 1      |
            +========+========+
            | 0.0675 | 0.9325 |
            +--------+--------+
            | 0.6237 | 0.3763 |
            +--------+--------+

            ``Multiclass Classification`` :

            +--------+--------+--------+
            | big    | medium | small  |
            +========+========+========+
            | 0.7772 | 0.1140 | 0.1088 |
            +--------+--------+--------+
            | 0.0014 | 0.8169 | 0.1817 |
            +--------+--------+--------+

        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred
        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type. Supported analysis types : ``Regression``, ``Binary Classification``,
            ``Multiclass Classification``. Must be provided: the value is lower-cased during construction,
            so the ``None`` default is not usable.

    Raises:
        AssertionError: If the number of yPred files differs from the number of model names, if the
            analysis type is not supported, if a multiclass yPred file has 2 or fewer columns, or if a
            binary yTrue file does not hold exactly 2 unique values in its first column.
    '''

    def __init__(
        self,
        xFeatures_file: str,
        yTrue_file: str,
        yPred_file_ls: List[str] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # Build a fresh list per call instead of sharing one mutable default object across instances.
        yPred_file_ls = [] if yPred_file_ls is None else yPred_file_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls

        super().__init__(xFeatures_file, yTrue_file, yPred_file_ls, model_names_ls, analysis_type)
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'.
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        assert len(self.yPreds) == len(self.models), 'no. of yPred_files must be equal to ' \
            'no. of model_names in correct order'
        assert self.analysis_type in ['regression', 'binary-classification', 'multiclass-classification'], "Currently " \
            "supported analysis types: ['Regression', 'Binary Classification', 'Multiclass Classification']"

        if self.analysis_type == 'multiclass-classification':
            # Only the header row is needed to count columns, so skip parsing the data rows.
            cols_ls = [list(pd.read_csv(yPred, nrows=0).columns) for yPred in self.yPreds]
            assert all(len(col) > 2 for col in cols_ls), "Data for yPred_file doesn't seem to be a multiclass prediction. " \
                "Please ensure there is prediction probabilities for each class in the yPred file."

        if self.analysis_type == 'binary-classification':
            assert pd.read_csv(self.yTrue).nunique().values[0] == 2, "Data for yTrue_file doesn't seem to be a binary-class " \
                "prediction. Please ensure yTrue_file consists of array of only 2 unique class"

    def get_features(self):
        '''Read the xFeatures csv file and return it as a dataframe.'''
        return pd.read_csv(self.xFeatures)

    def get_yTrue(self):
        '''Read the yTrue csv file and return it with its first column renamed to ``yTrue``.'''
        yTrue = pd.read_csv(self.yTrue)
        yTrue.rename(columns={yTrue.columns[0]: 'yTrue'}, inplace=True)
        return yTrue

    def get_yPreds(self):
        '''
        Read every yPred csv file and shape the predictions according to the analysis type.

        Returns:
            For regression, a single dataframe with the yPred files concatenated column-wise, where the
            i-th column is renamed to ``yPred_<model name i>``.
            For classification, a list with one dataframe per model, each extended with a ``model``
            column and a ``yPred-label`` column holding the name of the column with the highest value
            in each row.
        '''
        yPred_ls = [pd.read_csv(yPred) for yPred in self.yPreds]

        if is_regression(self.analysis_type):
            yPred_df = pd.concat(yPred_ls, axis=1)
            # Rename by position, not by label: prediction files commonly share the same header
            # (e.g. 'price'), and a label-keyed rename would give every such column the same name.
            new_cols = list(yPred_df.columns)
            for idx, model_name in enumerate(self.models[:len(new_cols)]):
                new_cols[idx] = f'yPred_{model_name}'
            yPred_df.columns = new_cols
            return yPred_df

        elif is_classification(self.analysis_type):
            for model_name, yPred in zip(self.models, yPred_ls):
                # Pick the label before adding any extra column so only probabilities are compared.
                prob_cols = list(yPred.columns)
                yPred['model'] = model_name
                yPred['yPred-label'] = yPred[prob_cols].idxmax(axis=1)
            return yPred_ls

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type, e.g. ``binary-classification``.'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine xFeatures, yTrue and yPreds column-wise.

        Returns:
            For regression, a single dataframe.
            For classification, a list with one combined dataframe per model.
        '''
        if is_regression(self.analysis_type):
            df = pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)
            return df

        elif is_classification(self.analysis_type):
            # Read each csv file once rather than once per model.
            features = self.get_features()
            yTrue = self.get_yTrue()
            yPred_ls = self.get_yPreds()
            return [pd.concat([features, yTrue, yPred], axis=1) for yPred in yPred_ls]
class DataframeLoader(BaseLoader):
    '''
    Dataloader that compiles all xFeatures, yTrue, yPreds in dataframe format.

    Args:
        df_xFeatures (:obj:`~pd.DataFrame`):
            Dataframe that contains all xfeatures used for model development/training.
        df_yTrue (:obj:`~pd.DataFrame`):
            Dataframe that contains all true values / labels.
        df_yPred_ls (:obj:`List[~pd.DataFrame]`):
            List of dataframes, one per model, that contain the predicted values (regression) /
            probabilities (classification).
        model_names_ls (:obj:`List[str]`):
            List contains model names representing the model used to generate yPred.
        analysis_type (str):
            Analysis type defined by user. Corresponding feature components will be auto-populated based on the
            specified analysis type. Supported analysis types : ``Regression``, ``Binary Classification``,
            ``Multiclass Classification``. Must be provided: the value is lower-cased during construction,
            so the ``None`` default is not usable.

    Raises:
        AssertionError: If the number of yPred dataframes differs from the number of model names, if the
            analysis type is not supported, if a multiclass yPred dataframe has 2 or fewer columns, or if
            the first column of a binary df_yTrue does not hold exactly 2 unique values.
    '''

    # Columns added by ``get_yPreds``; they are never treated as class-probability columns.
    _DERIVED_COLS = ('model', 'yPred-label')

    def __init__(
        self,
        df_xFeatures: pd.DataFrame,
        df_yTrue: pd.DataFrame,
        df_yPred_ls: List[pd.DataFrame] = None,
        model_names_ls: List[str] = None,
        analysis_type: str = None
    ):
        # Build a fresh list per call instead of sharing one mutable default object across instances.
        df_yPred_ls = [] if df_yPred_ls is None else df_yPred_ls
        model_names_ls = [] if model_names_ls is None else model_names_ls

        super().__init__(df_xFeatures, df_yTrue, df_yPred_ls, model_names_ls, analysis_type)
        # Work on copies: the getters below rename / add columns in place.
        self.yTrue = df_yTrue.copy()
        self.yPreds = [df.copy() for df in df_yPred_ls]
        # Normalise e.g. 'Binary Classification' -> 'binary-classification'.
        self.analysis_type = analysis_type.lower().replace(' ', '-')

        assert len(self.yPreds) == len(self.models), 'no. of yPred_files must be equal to ' \
            'no. of model_names in correct order'
        assert self.analysis_type in ['regression', 'binary-classification', 'multiclass-classification'], "Currently " \
            "supported analysis types: ['Regression', 'Binary Classification', 'Multiclass Classification']"

        if self.analysis_type == 'multiclass-classification':
            cols_ls = [list(yPred.columns) for yPred in self.yPreds]
            assert all(len(col) > 2 for col in cols_ls), "Data for df_yPred_list doesn't seem to be a multiclass prediction. " \
                "Please ensure there is prediction probabilities for each class in the df of df_yPred_list."

        if self.analysis_type == 'binary-classification':
            assert self.yTrue.nunique().values[0] == 2, "Data for df_yTrue doesn't seem to be a binary-class prediction. " \
                "Please ensure df_yTrue consists of data with only 2 unique class"

    def get_features(self):
        '''Return the xFeatures dataframe.'''
        return self.xFeatures

    def get_yTrue(self):
        '''Return the stored yTrue dataframe with its first column renamed to ``yTrue``.'''
        self.yTrue.rename(columns={self.yTrue.columns[0]: 'yTrue'}, inplace=True)
        return self.yTrue

    def get_yPreds(self):
        '''
        Shape the stored predictions according to the analysis type.

        Returns:
            For regression, a single dataframe with the yPred dataframes concatenated column-wise, where
            the i-th column is renamed to ``yPred_<model name i>``.
            For classification, the stored list of yPred dataframes, each extended in place with a
            ``model`` column and a ``yPred-label`` column holding the name of the probability column
            with the highest value in each row.
        '''
        if is_regression(self.analysis_type):
            yPred_df = pd.concat(self.yPreds, axis=1)
            # Rename by position, not by label: prediction frames commonly share the same column
            # name (e.g. 'price'), and a label-keyed rename would give every such column the same name.
            new_cols = list(yPred_df.columns)
            for idx, model_name in enumerate(self.models[:len(new_cols)]):
                new_cols[idx] = f'yPred_{model_name}'
            yPred_df.columns = new_cols
            return yPred_df

        elif is_classification(self.analysis_type):
            for model_name, yPred in zip(self.models, self.yPreds):
                yPred['model'] = model_name
                # Use every probability column (not just the first two) so multiclass labels are
                # correct, while skipping the derived columns so repeated calls stay idempotent.
                prob_cols = [col for col in yPred.columns if col not in self._DERIVED_COLS]
                yPred['yPred-label'] = yPred[prob_cols].idxmax(axis=1)
            return self.yPreds

    def get_model_list(self):
        '''Return the list of model names.'''
        return self.models

    def get_analysis_type(self):
        '''Return the normalised analysis type, e.g. ``binary-classification``.'''
        return self.analysis_type

    def get_all(self):
        '''
        Combine xFeatures, yTrue and yPreds column-wise.

        Returns:
            For regression, a single dataframe.
            For classification, a list with one combined dataframe per model.
        '''
        if is_regression(self.analysis_type):
            df = pd.concat([self.get_features(), self.get_yTrue(), self.get_yPreds()], axis=1)
            return df

        elif is_classification(self.analysis_type):
            # Compute the shared pieces once rather than once per model.
            features = self.get_features()
            yTrue = self.get_yTrue()
            yPred_ls = self.get_yPreds()
            return [pd.concat([features, yTrue, yPred], axis=1) for yPred in yPred_ls]