# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union, List, Optional
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from rarity.data_loader import CSVDataLoader, DataframeLoader
from rarity.interpreters.structured_data.base_interpreters import BaseInterpreters
from rarity.utils.methods import compute_distances
from rarity.utils.common_functions import is_regression, is_classification, insert_index_col
class IntSimilaritiesCounterFactuals(BaseInterpreters):
    '''
    Transform raw data into input format suitable for visualization on Similarities / Counter-Factuals

    Arguments:
        data_loader (:class:`~rarity.data_loader.CSVDataLoader` or :class:`~rarity.data_loader.DataframeLoader`):
            Class object from data_loader module
    '''

    def __init__(self, data_loader: Union[CSVDataLoader, DataframeLoader]):
        super().__init__(data_loader)
        # Keep the loader's feature frame and model list on the instance;
        # ``df_features`` is the frame the distance ranking is computed on.
        self.df_features = data_loader.get_features()
        self.models = data_loader.get_model_list()
[docs] def _get_categorical_features(self, df: pd.DataFrame):
'''
Identify categorical features
'''
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
return categorical_cols
[docs] def _label_encode_categorical_features(self, df: pd.DataFrame, categorical_cols: List):
'''
Fit-transform categorical features with ``LabelEncoder``
'''
df_encoded = df.copy()
for col in categorical_cols:
df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
return df_encoded
def _apply_standard_scale(self, df: pd.DataFrame):
    '''
    Standard scale features

    Arguments:
        df (:obj:`~pd.DataFrame`):
            Dataframe to scale; a new ``StandardScaler`` is fitted on it at every call

    Returns:
        The scaled values, exactly as returned by ``StandardScaler.fit_transform``
    '''
    return StandardScaler().fit_transform(df)
def _get_ranking_and_distance_metrics(self, user_defined_idx: int, feature_to_exclude: List, top_n: int):
    '''
    Compute distance scores and generate index list sorted by distance ranking

    Arguments:
        user_defined_idx (int):
            Position (as used by ``iloc``) of the baseline row in the feature dataframe
        feature_to_exclude (:obj:`List`):
            Feature names left out of the distance computation. ``None`` is treated as an empty list
        top_n (int):
            Number of closest rows to return in addition to the first-ranked entry

    Returns:
        :obj:`Tuple[List, List]`:
            Index labels of the ``top_n + 1`` rows with the smallest distance to the baseline, \
            and the matching distances rounded to 4 decimals. Both lists are ordered by ascending distance
    '''
    excluded = [] if feature_to_exclude is None else feature_to_exclude

    # Encoding and scaling are fitted on the full feature set; exclusion only
    # decides which scaled columns enter the distance computation.
    categorical_cols = self._get_categorical_features(self.df_features)
    df_encoded = self._label_encode_categorical_features(self.df_features, categorical_cols)
    scaled_data = self._apply_standard_scale(df_encoded)
    df_scaled = pd.DataFrame(scaled_data, index=df_encoded.index, columns=df_encoded.columns)
    df_scaled = df_scaled[[col for col in df_scaled.columns if col not in excluded]]

    # NOTE(review): the baseline is picked by position while the results below are keyed
    # by index label; the two agree only for a default 0..n-1 index - confirm with callers.
    baseline = df_scaled.iloc[[user_defined_idx]]

    # ``.loc[[idx]]`` selects the same row(s) as a boolean mask on the index,
    # without scanning the whole index once per row.
    distance_by_idx = {
        idx: compute_distances(baseline, df_scaled.loc[[idx]])[0][0]
        for idx in df_scaled.index
    }

    # top_n + 1 => the baseline row itself is part of the ranking (presumably first,
    # being closest to itself), so one extra entry is kept.
    ranked = sorted(distance_by_idx.items(), key=lambda item: item[1])[:top_n + 1]
    idx_for_top_n = [idx for idx, _ in ranked]
    calculated_distance = [round(distance, 4) for _, distance in ranked]
    return idx_for_top_n, calculated_distance