Source code for rarity.visualizers.xfeature_distribution

# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Dict
import plotly.express as px


[docs]def plot_distribution_by_specific_feature(ls_specific_feature: List[str], kl_div_dict_sorted: Dict, comparison_base: str, model_name: str): ''' Create distribution plot for a specific feature Arguments: ls_specific_feature (:obj:`List[str]`): list of feature to have its distribution graph plotted kl_div_dict_sorted (:obj:`Dict`): dictionary storing kl-divergence score by feature in decending order comparison_base (str): info to indicate the baseline for distribution comparison. ``dataset_type`` for regression and ``pred_state`` for classification task model_name (str): model used to generate yPred Returns: :obj:`List[~plotly.graph_objects.Figure]`: List of figures displaying distribution plot of specific feature ''' if not isinstance(ls_specific_feature, list): ls_specific_feature = [ls_specific_feature] fig_obj_ls = [] for specific_feature in ls_specific_feature: fig = _single_dist_plot(specific_feature, kl_div_dict_sorted, comparison_base, model_name) fig_obj_ls.append(fig) return fig_obj_ls
[docs]def plot_distribution_by_kl_div_ranking(kl_div_dict_sorted: Dict, display_option: str, display_value: int, comparison_base: str, model_name: str): ''' Create distribution plot by kl-divergence score ranking in descending order Arguments: kl_div_dict_sorted (:obj:`Dict`): dictionary storing kl-divergence score by feature in decending order display_option (str) - info to indicate if to display distribution plot by top-N / bottom-N or both top-N + bottom-N - Available options: ``top``, ``bottom`` or ``both`` display_value (int) - number indicates the limit of graph to be displayed, max at 10 - if dataset consists of < 10 features, the limit == no. of features the dataset has comparison_base (str): info to indicate the baseline for distribution comparison. ``dataset_type`` for regression and ``pred_state`` for classification task model_name (str): model used to generate yPred Returns: :obj:`Dict[str, ~plotly.graph_objects.Figure]`: Dictionary storing distribution figures by display_option ''' fig_obj_dict = {} if display_option == 'top': sliced_kl_div_dict_top = dict(list(kl_div_dict_sorted.items())[:display_value]) fig_obj_dict['top'] = [plot_distribution_by_specific_feature(k, sliced_kl_div_dict_top, comparison_base, model_name) for k, v in sliced_kl_div_dict_top.items()] fig_obj_dict['bottom'] = [] elif display_option == 'bottom': sliced_kl_div_dict_btm = dict(list(kl_div_dict_sorted.items())[-display_value:]) fig_obj_dict['top'] = [] fig_obj_dict['bottom'] = [plot_distribution_by_specific_feature(k, sliced_kl_div_dict_btm, comparison_base, model_name) for k, v in sliced_kl_div_dict_btm.items()] elif display_option == 'both': sliced_kl_div_dict_top = dict(list(kl_div_dict_sorted.items())[:display_value]) sliced_kl_div_dict_btm = dict(list(kl_div_dict_sorted.items())[-display_value:]) fig_obj_dict['top'] = [plot_distribution_by_specific_feature(k, sliced_kl_div_dict_top, comparison_base, model_name) for k, v in sliced_kl_div_dict_top.items()] fig_obj_dict['bottom'] = [plot_distribution_by_specific_feature(k, sliced_kl_div_dict_btm, comparison_base, model_name) for k, v in sliced_kl_div_dict_btm.items()] return fig_obj_dict
def _single_dist_plot(feature: str, kl_div_dict_sorted: Dict, comparison_base: str, model_name: str): ''' Internal function to plot single distribution graph Important Arguments: comparison_base (str): ``dataset_type`` for regression, ``pred_state`` for classification ''' df = kl_div_dict_sorted[feature][1] fig = px.histogram(df, x=feature, color=comparison_base, marginal='box', opacity=0.5, barmode='overlay', # color_discrete_sequence=px.colors.qualitative.D3) # replaces default color mapping by value color_discrete_map={'correct': '#1F77B4', 'miss-predict': '#FF7F0E', 'df_reference': '#1F77B4', 'df_sliced': '#FF7F0E'}, category_orders={'pred_state': ['correct', 'miss-predict'], 'dataset_type': ['df_reference', 'df_sliced']}) kl_div_score = kl_div_dict_sorted[feature][0] customized_title = f'<b>[ KL divergence : {kl_div_score:.4f} ]</b><br>Distribution of xFeature : {feature}' customized_margin = dict(t=120) if model_name is not None: model_name = f'<span style="color:blue; font-size:14px">{model_name} </span>' feature_name = f'<span style="font-size:14px">Distribution of xFeature : {feature}</span>' customized_title = f'<b>[ KL divergence : {kl_div_score:.4f} ]</b><br>' + model_name + feature_name customized_margin = dict(t=130) fig.update_layout( title=customized_title, xaxis_title=f'xFeature : {feature}', yaxis_title='count', title_x=0.48, legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, title=None), margin=customized_margin) return fig