Source code for rarity.visualizers.loss_clusters

# Copyright 2021 AI Singapore. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
import pandas as pd
import plotly.graph_objects as go
from rarity.utils.common_functions import is_regression


def plot_offset_clusters(df: pd.DataFrame, analysis_type: str):
    '''
    For use in regression task only. Function to plot figure displaying cluster groups by prediction offset values

    Arguments:
        df (:obj:`~pd.DataFrame`):
            dataframe containing cluster info, output from int_loss_clusters.
            Model names are derived from the columns whose label starts with ``cluster_``
        analysis_type (str):
            info to indicate if analysis is regression or classification, info inherited from data_loader

    Returns:
        :obj:`~plotly.graph_objects.Figure`:
            figure displaying violin plot outlining cluster groups by offset values
    '''
    prefix = 'cluster_'
    # Match on the leading prefix only and strip just that prefix. A substring test combined
    # with str.replace would pick up unrelated columns that merely contain 'cluster_' and would
    # mangle model names that themselves contain 'cluster_'. Non-string labels are skipped.
    models = [
        col[len(prefix):]
        for col in df.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]

    fig = _plot_common_clusters(df, models, analysis_type)
    fig.update_layout(title_text='<b>Overview of Prediction Offset Clusters</b>', yaxis_title='Offset from baseline')
    return fig
def plot_logloss_clusters(dfs: List[pd.DataFrame], analysis_type: str):
    '''
    For use in classification task only. Function to plot figure displaying cluster groups by log-loss values

    Arguments:
        dfs (:obj:`List[~pd.DataFrame]`):
            list of dataframes containing cluster info, output from int_loss_clusters.
            The first value of each dataframe's ``model`` column is used as the model name
        analysis_type (str):
            info to indicate if analysis is regression or classification, info inherited from data_loader

    Returns:
        :obj:`~plotly.graph_objects.Figure`:
            figure displaying violin plot outlining cluster groups by log-loss values
    '''
    # one model name per dataframe, taken from the first row of its 'model' column
    models = []
    for frame in dfs:
        model_name = frame['model'].values[0]
        models.append(model_name)

    fig = _plot_common_clusters(dfs, models, analysis_type)
    fig.update_layout(
        title_text='<b>Overview of Log-Loss Clusters on Miss-Prediction</b>',
        yaxis_title='Log-Loss',
    )
    return fig
def plot_optimum_cluster_via_elbow_method(cluster_range: List[int], sum_squared_distance: List[float], models: List[str]):
    '''
    Figure to guide decision on the number of clusters that is reasonable to form with KMean method

    Arguments:
        cluster_range (:obj:`List[int]`):
            cluster counts used as x-values; indexed per model, i.e. ``cluster_range[i]`` holds the x-values
            of the i-th model
        sum_squared_distance (:obj:`List[float]`):
            sum of squared distance used as y-values; indexed per model in the same way as ``cluster_range``
        models (:obj:`List[str]`):
            list of model names used as trace names in the legend

    Returns:
        :obj:`~plotly.graph_objects.Figure`:
            figure displaying line plot outlining the change in sum of squared distances along the cluster range
    '''
    line_colors = ['#1F77B4', '#FF7F0E']
    # the first curve is always drawn; a second one only when exactly two entries are supplied
    curve_count = 2 if len(cluster_range) == 2 else 1

    fig = go.Figure()
    for idx in range(curve_count):
        curve = go.Scatter(
            x=cluster_range[idx],
            y=sum_squared_distance[idx],
            mode='lines+markers',
            name=models[idx],
            line={'color': line_colors[idx]},
        )
        fig.add_trace(curve)

    # horizontal legend placed just above the top-right corner of the plotting area
    legend_settings = {'orientation': "h", 'yanchor': "bottom", 'y': 1.02, 'xanchor': "right", 'x': 1}
    fig.update_layout(
        title='<b>Overview of Optimum Cluster Count</b>',
        xaxis_title='No. of Cluster',
        yaxis_title='Sum of Squared Distance',
        title_x=0.5,
        showlegend=True,
        legend=legend_settings,
        width=1000,
        height=550,
    )
    fig.update_xaxes(rangemode="tozero")
    return fig
def _plot_common_clusters(df: pd.DataFrame, models: List[str], analysis_type: str):
    '''
    Internal function to plot violin graph for different cluster groups

    Arguments:
        df (:obj:`~pd.DataFrame` or :obj:`List[~pd.DataFrame]`):
            regression: a single dataframe holding ``cluster_<model>`` and ``offset_<model>`` columns.
            otherwise: a sequence of dataframes (one per model), each holding ``cluster`` and ``lloss`` columns
        models (:obj:`List[str]`):
            model names; the first one is always plotted, a second one only when exactly two are given
        analysis_type (str):
            passed to ``is_regression`` to select which of the two input layouts above is read

    Returns:
        :obj:`~plotly.graph_objects.Figure`:
            figure with one violin trace per plotted model, x = cluster and y = offset or log-loss
    '''
    regression = is_regression(analysis_type)
    line_colors = ('#1f77b4', '#FF7F0E')
    trace_count = 2 if len(models) == 2 else 1

    # Label the y-value in the hover box after what is actually plotted; a fixed 'offset' label
    # would be wrong for the log-loss values drawn in the non-regression case.
    y_label = 'offset' if regression else 'log-loss'
    hovertemplate = 'index=%{customdata}<br>cluster=%{x}<br>' + y_label + '=%{y}<br>'

    fig = go.Figure()
    for pos in range(trace_count):
        model = models[pos]
        if regression:
            # all models share one dataframe, hence the same row index for every trace
            x_values = df[f'cluster_{model}']
            y_values = df[f'offset_{model}']
            row_index = list(df.index)
        else:
            # one dataframe per model, positionally aligned with `models`
            frame = df[pos]
            x_values = frame['cluster']
            y_values = frame['lloss']
            row_index = list(frame.index)

        fig.add_trace(go.Violin(x=x_values,
                                y=y_values,
                                legendgroup=model,
                                scalegroup=model,
                                name=model,
                                line_color=line_colors[pos],
                                customdata=row_index,
                                hovertemplate=hovertemplate,
                                showlegend=True))

    # update characteristics shared by all traces
    fig.update_traces(meanline_visible=True,
                      box_visible=True,
                      points='all',  # show all points
                      jitter=0.05,  # add some jitter on points for better visibility
                      scalemode='count')  # scale violin plot area with total count
    fig.update_layout(
        title_x=0.5,
        xaxis_title='Cluster',
        width=1000,
        height=550,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        violingap=0.2,
        violingroupgap=0.3,
        violinmode='overlay')
    return fig