# Source code for rarity.interpreters.structured_data.int_similarities_counter_factuals

# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union, List, Optional
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

from rarity.data_loader import CSVDataLoader, DataframeLoader
from rarity.interpreters.structured_data.base_interpreters import BaseInterpreters
from rarity.utils.methods import compute_distances
from rarity.utils.common_functions import is_regression, is_classification, insert_index_col


class IntSimilaritiesCounterFactuals(BaseInterpreters):
    '''
    Transform raw data into input format suitable for visualization on Similarities / Counter-Factuals

    Arguments:
        data_loader (:class:`~rarity.data_loader.CSVDataLoader` or :class:`~rarity.data_loader.DataframeLoader`):
            Class object from data_loader module
    '''

    def __init__(self, data_loader: Union[CSVDataLoader, DataframeLoader]):
        super().__init__(data_loader)
        self.df_features = data_loader.get_features()
        self.models = data_loader.get_model_list()

    def _get_categorical_features(self, df: pd.DataFrame) -> List:
        '''
        Identify categorical features

        Arguments:
            df (:obj:`~pd.DataFrame`):
                Dataframe whose columns are inspected

        Returns:
            :obj:`List`:
                Names of the columns whose dtype compares equal to ``'object'``
        '''
        return [col for col, dtype in df.dtypes.items() if dtype == 'object']

    def _label_encode_categorical_features(self, df: pd.DataFrame, categorical_cols: List) -> pd.DataFrame:
        '''
        Fit-transform categorical features with ``LabelEncoder``

        Arguments:
            df (:obj:`~pd.DataFrame`):
                Dataframe holding the features; it is not modified
            categorical_cols (:obj:`List`):
                Names of the columns to label-encode

        Returns:
            :obj:`~pd.DataFrame`:
                Copy of ``df`` with each listed column replaced by its encoded values
        '''
        df_encoded = df.copy()  # work on a copy so the caller's dataframe is left untouched
        for col in categorical_cols:
            # a fresh encoder per column: each column gets its own label mapping
            df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
        return df_encoded

    def _apply_standard_scale(self, df: pd.DataFrame):
        '''
        Standard scale features

        Arguments:
            df (:obj:`~pd.DataFrame`):
                Dataframe to scale

        Returns:
            Output of ``StandardScaler().fit_transform(df)``
        '''
        return StandardScaler().fit_transform(df)

    def _get_ranking_and_distance_metrics(self, user_defined_idx: int, feature_to_exclude: List, top_n: int) -> tuple:
        '''
        Compute distance scores and generate index list sorted by distance ranking

        Arguments:
            user_defined_idx (int):
                Position of the baseline row; it is resolved with ``iloc`` on the scaled feature table
            feature_to_exclude (:obj:`List`):
                Column names dropped after scaling and before the distance calculation
            top_n (int):
                Number of neighbours to keep in addition to the first-ranked entry

        Returns:
            Outputs consist of the followings

            - idx_for_top_n (:obj:`List`): index labels of the first ``top_n + 1`` rows ranked by ascending distance
            - calculated_distance (:obj:`List`): the matching distances, rounded to 4 decimal places
        '''
        categorical_cols = self._get_categorical_features(self.df_features)
        df_encoded = self._label_encode_categorical_features(self.df_features, categorical_cols)
        scaled_data = self._apply_standard_scale(df_encoded)

        # restore the index / column labels that were on the dataframe before scaling
        df_scaled = pd.DataFrame(scaled_data, index=df_encoded.index, columns=df_encoded.columns)
        # exclusion happens after scaling, so the retained columns are scaled exactly as before
        df_scaled = df_scaled[[col for col in df_scaled.columns if col not in feature_to_exclude]]

        baseline = df_scaled.iloc[[user_defined_idx]]

        distance_by_idx = {}
        for idx in df_scaled.index:
            # label-based lookup selects the same row(s) as a boolean mask over the whole index,
            # without rebuilding that mask for every single row
            distance_by_idx[idx] = compute_distances(baseline, df_scaled.loc[[idx]])[0][0]

        # top_n + 1 => first is user_defined_idx (its distance to itself ranks first)
        ranked = sorted(distance_by_idx.items(), key=lambda item: item[1])[:top_n + 1]
        idx_for_top_n = [idx for idx, _ in ranked]
        calculated_distance = [round(distance, 4) for _, distance in ranked]
        return idx_for_top_n, calculated_distance

    def xform(self, user_defined_idx: int, feature_to_exclude: Optional[List[str]] = None, top_n: int = 3):
        '''
        Core transformation function to tap-out data into input format suitable for plotly graph

        Arguments:
            user_defined_idx (int):
                Index of the data point of interest specified by user
            feature_to_exclude (List of :obj:`str`, `optional`):
                A list of features to be excluded from the ranking and similarities distance calculation.
                A single feature name given as a plain string is treated as a one-item list
            top_n (int):
                Number indicating the max limit of records to be displayed based on the distance ranking

        Returns:
            Outputs consist of the followings

            - df_viz (:obj:`~pd.DataFrame`): dataframe for overview visualization
            - idx_for_top_n (:obj:`List[int]`): list of integer numbers indicating the ranking position in ascending order
            - calculated_distance (:obj:`List[float]`): list of calculated euclidean_distances

            .. note::

                if regression, ``df_viz`` is the dataframe returned by ``get_df_with_offset_values``
                with an index column inserted

                if classification, ``df_viz`` is the features merged with the true labels on ``index``,
                plus one ``yPred_<model>`` column of predicted labels per model

        Raises:
            ValueError: if the analysis type is neither regression nor classification
        '''
        if feature_to_exclude is None:
            feature_to_exclude = []
        elif isinstance(feature_to_exclude, str):
            # list('age') would yield ['a', 'g', 'e']; a bare string means one feature name
            feature_to_exclude = [feature_to_exclude]
        elif not isinstance(feature_to_exclude, list):
            try:
                feature_to_exclude = list(feature_to_exclude)
            except TypeError:
                # non-iterable input: fall back to excluding nothing
                feature_to_exclude = []

        if is_regression(self.analysis_type):
            df_viz = super().get_df_with_offset_values()
            df_viz = insert_index_col(df_viz)

        elif is_classification(self.analysis_type):
            df_features = insert_index_col(self.df_features)
            yTrue = insert_index_col(self.data_loader.get_yTrue())
            df_viz = df_features.merge(yTrue, how='left', on='index')

            yPreds = self.data_loader.get_yPreds()
            for i, model in enumerate(self.models):
                df_viz[f'yPred_{model}'] = yPreds[i]['yPred-label']

        else:
            # without this guard the return statement would fail with an UnboundLocalError
            raise ValueError(f'Unsupported analysis type: {self.analysis_type!r}')

        idx_for_top_n, calculated_distance = \
            self._get_ranking_and_distance_metrics(user_defined_idx, feature_to_exclude, top_n)

        return df_viz, idx_for_top_n, calculated_distance