# Source code for plot_utils.misc

# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection

from . import helper as hlp

#%%============================================================================
def plot_ranking(
        ranking, fig=None, ax=None, figsize='auto', dpi=100,
        barh=True, top_n=None, score_ax_label=None, name_ax_label=None,
        invert_name_ax=False, grid_on=True,
):
    '''
    Plot rankings as a bar plot (in descending order), such as::

                ^
                |
        dolphin |||||||||||||||||||||||||||||||
                |
        cat     |||||||||||||||||||||||||
                |
        rabbit  ||||||||||||||||
                |
        dog     |||||||||||||
                |
               -|------------------------------------>  Age of pet
                0  1  2  3  4  5  6  7  8  9  10  11

    Parameters
    ----------
    ranking : dict or pandas.Series
        The ranking information, for example::

            {'rabbit': 5, 'cat': 8, 'dog': 4, 'dolphin': 10}

        It does not need to be sorted externally.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : 'auto' or (float, float)
        Figure size in inches, as a tuple of two numbers. If ``'auto'``,
        the size is derived from the number of categories shown: 0.26 inch
        per category along the "category name" axis and 5 inches along the
        score axis. The figure size of ``fig`` (if not ``None``) will
        override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    barh : bool
        Whether or not to show the bars as horizontal (otherwise, vertical)
    top_n : int or ``None``
        If ``None`` or 0, show all categories. ``top_n`` > 0 means showing
        the highest ``top_n`` categories. ``top_n`` < 0 means showing the
        lowest |``top_n``| categories.
    score_ax_label : str
        Label of the score axis (e.g., "Age of pet").
    name_ax_label : str
        Label of the "category name" axis (e.g., "Pet name").
    invert_name_ax : bool
        Whether to invert the "category name" axis. For example, if
        ``invert_name_ax`` is ``False``, then higher values are shown on the
        top if ``barh`` is ``True``.
    grid_on : bool
        Whether or not to show grids on the plot.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.

    Raises
    ------
    TypeError
        If ``ranking`` is neither a dict nor a pandas Series.
    ValueError
        If ``top_n`` is neither an integer nor ``None``.
    '''
    if not isinstance(ranking, (dict, pd.Series)):
        raise TypeError('`ranking` must be a Python dict or pandas Series.')

    if top_n is not None and not isinstance(top_n, (int, np.integer)):
        raise ValueError('`top_n` must be an integer or None.')

    if isinstance(ranking, dict):
        ranking = pd.Series(ranking)

    n_total = len(ranking)
    if top_n is None or top_n == 0:
        # 0 has always plotted every category; normalize it here so that the
        # automatic figure size below is computed for the same set of bars.
        top_n = n_total

    # Never more bars than there are categories, even if |top_n| is larger.
    n_shown = min(abs(top_n), n_total)

    if isinstance(figsize, str) and figsize == 'auto':
        if barh:
            figsize = (5, n_shown * 0.26)  # 0.26 inch = height for each category
        else:
            figsize = (n_shown * 0.26, 5)

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    if barh:
        xlabel, ylabel = score_ax_label, name_ax_label
        # The selected categories are the tail of the sorted series; barh
        # draws the first row at the bottom of the axes.
        ax = ranking.sort_values(
            ascending=(top_n >= 0)
        ).iloc[-n_shown:].plot(kind='barh', ax=ax)
    else:
        xlabel, ylabel = name_ax_label, score_ax_label
        # The selected categories are the head of the sorted series, drawn
        # from left to right.
        ax = ranking.sort_values(
            ascending=(top_n < 0)
        ).iloc[:n_shown].plot(kind='bar', ax=ax)

    if invert_name_ax:
        # Same truthiness test as the orientation choice above, so the axis
        # that gets inverted is always the "category name" axis.
        if barh:
            ax.invert_yaxis()
        else:
            ax.invert_xaxis()
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if grid_on:
        ax.grid(ls=':')
        ax.set_axisbelow(True)

    return fig, ax

#%%============================================================================
def plot_with_error_bounds(
        x, y, upper_bound, lower_bound,
        fig=None, ax=None, figsize=None, dpi=100,
        line_color=(0.4, 0.4, 0.4), shade_color=(0.7, 0.7, 0.7),
        shade_alpha=0.5, linewidth=2.0, legend_loc='best',
        line_label='Data', shade_label=r'$\mathregular{\pm}$STD',
        logx=False, logy=False, grid_on=True,
):
    r'''
    Plot a graph with one line and its upper and lower bounds, with areas between
    bounds shaded. The effect is similar to this illustration below::

      y ^            ...                         _____________________
        |         ...   ..........              |                     |
        |         .   ______     .              |  ---  Mean value    |
        |      ...   /      \    ..             |  ...  Error bounds  |
        |   ...  ___/        \    ...           |_____________________|
        |  .    /    ...      \    ........
        | .  __/   ...  ....   \________  .
        |  /    ....       ...          \  .
        | /  ....            .....       \_
        | ...                    ..........
       -|--------------------------------------->  x


    Parameters
    ----------
    x : list, numpy.ndarray, or pandas.Series
        X data points to be plotted as a line.
    y : list, numpy.ndarray, or pandas.Series
        Y data points to be plotted as a line.
    upper_bound : list, numpy.ndarray, or pandas.Series
        Upper bound of the Y values.
    lower_bound : list, numpy.ndarray, or pandas.Series
        Lower bound of the Y values.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    line_color : str, list, or tuple
        Color of the line.
    shade_color : str, list, or tuple
        Color of the underlying shades.
    shade_alpha : float
        Opacity of the shades.
    linewidth : float
        Width of the line.
    legend_loc : int, str
        Location of the legend, to be passed directly to ``ax.legend()``.
    line_label : str
        Label of the line, to be used in the legend.
    shade_label : str
        Label of the shades, to be used in the legend.
    logx : bool
        Whether or not to show the X axis in log scale.
    logy : bool
        Whether or not to show the Y axis in log scale.
    grid_on : bool
        Whether or not to show grids on the plot.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.

    Raises
    ------
    TypeError
        If ``x`` or ``y`` is not array-like.
    plot_utils.helper.LengthError
        If ``x`` and ``y`` have different lengths.
    '''
    if not isinstance(x, hlp._array_like) or not isinstance(y, hlp._array_like):
        raise TypeError('`x` and `y` must be arrays.')

    if len(x) != len(y):
        raise hlp.LengthError('`x` and `y` must have the same length.')

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    # Shade first so that the line is drawn on top of it.
    shade_handle = ax.fill_between(
        x, lower_bound, upper_bound,
        color=shade_color, facecolor=shade_color,
        linewidth=0.01, alpha=shade_alpha, interpolate=True,
        label=shade_label,
    )
    line_handle, = ax.plot(
        x, y, color=line_color, linewidth=linewidth, label=line_label,
    )
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    if grid_on:
        ax.grid(ls=':', lw=0.5)
        ax.set_axisbelow(True)

    # Attach the legend to the axes being drawn on; ``plt.legend`` would act
    # on pyplot's *current* axes, which need not be ``ax`` when the caller
    # passed in their own figure/axes.
    ax.legend(handles=[line_handle, shade_handle], loc=legend_loc)

    return fig, ax

#%%============================================================================
def visualize_cv_scores(
        fig=None, ax=None, dpi=100, n_folds=5,
        cv_scores=None, box_height=0.6, box_width=0.9,
        gap_frac=0.05, metric_name='AUC', avg_cv_score=None,
        no_holdout_set=False, holdout_score=None, fontsize=9,
        flip_yaxis=True,
):
    '''
    Visualize K-fold cross-validation scores as well as hold-out set performance
    in an intuitive way.

    The drawing consists of an ``n_folds`` x ``n_folds`` grid of small boxes
    (one row per CV round, with the evaluation fold highlighted), a header
    row of fold names, a "Training data" box, optionally a "Hold-out data"
    box and an "All data" box, and a bracket next to the grid annotated with
    the average score.

    Parameters
    ----------
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    n_folds : int
        Number of CV folds. Must be at least 1.
    cv_scores : list<float> or ``None``
        The validation score of each fold. If ``None``, no scores will be shown
        on the small boxes (they are labelled "train"/"eval." instead).
    box_height : float
        The height of the small box, in inches.
    box_width : float
        The width of the small box, in inches.
    gap_frac : float
        How much gap should there be between each small box. Must be
        within [0, 1].
    metric_name : str
        The name of the metric to be shown in the figure.
    avg_cv_score : float or ``None``
        The average cross-validation score. If ``None`` (recommended), it will
        be calculated by numpy.mean(cv_scores).
    no_holdout_set : bool
        If ``False``, the hold-out data set will be visualized alongside the
        training data set. This parameter supersedes ``holdout_score``.
    holdout_score : float or ``None``
        The performance on the hold-out data set. If ``no_holdout_set`` is
        ``True``, this parameter has no effect.
    fontsize : float
        The font size of all the texts.
    flip_yaxis : bool
        If ``True``, everything will be flipped upside down. This parameter is
        for diagnosis and debugging purpose only. It is recommended to leave
        it as ``True``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1, or ``gap_frac`` is outside [0, 1].
    plot_utils.helper.LengthError
        If ``cv_scores`` is given but does not contain ``n_folds`` scores.

    Notes
    -----
    Argument types are checked by ``hlp.assert_type`` and
    ``hlp.assert_element_type``; see those helpers for what they raise.
    '''
    hlp.assert_type(n_folds, int, name='n_folds')
    if n_folds < 1:
        raise ValueError('`n_folds` must be at least 1.')
    hlp.assert_type(cv_scores, (type(None), list), name='cv_scores')
    if cv_scores is not None:
        hlp.assert_element_type(cv_scores, hlp._scalar_like, name='cv_scores')
    hlp.assert_type(avg_cv_score, (type(None), hlp._scalar_like), name='avg_cv_score')
    hlp.assert_type(gap_frac, hlp._scalar_like, name='gap_frac')
    if gap_frac < 0 or gap_frac > 1:
        raise ValueError('`gap_frac` must be within [0, 1].')
    hlp.assert_type(metric_name, str, name='metric_name')
    hlp.assert_type(holdout_score, (type(None), hlp._scalar_like), name='holdout_score')

    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let a mismatch through.
    if cv_scores is not None and len(cv_scores) != n_folds:
        raise hlp.LengthError('`cv_scores` must contain exactly `n_folds` scores.')

    GRAY_COLOR_ALPHA = 0.25
    OTHER_COLOR_ALPHA = 0.5

    total_width = n_folds * box_width
    total_height = n_folds * box_height
    # Leave 50% extra room around the grid for the other boxes and labels.
    figsize = (total_width * 1.5, total_height * 1.5)

    # NOTE(review): presumably unreachable if ``hlp.assert_type`` rejects
    # ``None`` for a ``str`` argument -- kept as a fallback; confirm.
    if metric_name is None:
        metric_name = 'score'

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    # One row per CV round; in row j, box j is the evaluation fold.
    for j in range(n_folds):
        if cv_scores is None:
            row_text = ['eval.' if k == j else 'train' for k in range(n_folds)]
        else:
            # Only the evaluation box carries text (its score); the first
            # row additionally spells out the metric name.
            row_text = [''] * n_folds
            if j == 0:
                row_text[j] = '%s\n= %.4g' % (metric_name, cv_scores[j])
            else:
                row_text[j] = '%.4g' % cv_scores[j]
        ax = _plot_one_row_of_rectangles(
            ax, n_boxes=n_folds,
            southwest_corner=(0, j * box_height),
            box_height=box_height, fontsize=fontsize,
            box_width=box_width, gap_frac=gap_frac,
            show_which_box_as_test=j, text=row_text,
        )

    # Header row with the fold names, placed right after the last CV row.
    fold_names = ['Fold %d' % (k + 1) for k in range(n_folds)]
    ax = _plot_one_row_of_rectangles(
        ax, n_boxes=n_folds,
        southwest_corner=(0, n_folds * box_height),
        box_height=box_height, box_width=box_width,
        gap_frac=gap_frac, fontsize=fontsize,
        show_which_box_as_test=-1,
        train_set_color='gray', alpha=GRAY_COLOR_ALPHA,
        text=fold_names,
    )

    # One wide box spanning the whole grid width: the training data.
    ax = _plot_one_row_of_rectangles(
        ax, n_boxes=1,
        southwest_corner=(0, -1.5 * box_height),
        box_height=box_height,
        box_width=total_width,
        gap_frac=0.0, text=['Training data'],
        fontsize=fontsize,
        train_set_color='#6baed6',
        alpha=OTHER_COLOR_ALPHA,
    )

    if not no_holdout_set:
        if holdout_score is not None:
            holdout_txt = 'Hold-out data\n%s = %.4g' % (metric_name, holdout_score)
        else:
            holdout_txt = 'Hold-out data'
        holdout_box_gap_frac = 0.01
        r1 = 1 + holdout_box_gap_frac
        holdout_box_width = total_width * 0.5
        # Hold-out box sits beside the training box, separated by a small gap.
        ax = _plot_one_row_of_rectangles(
            ax, n_boxes=1,
            southwest_corner=(total_width * r1, -1.5 * box_height),
            box_width=holdout_box_width,
            box_height=box_height,
            gap_frac=0.0, text=[holdout_txt],
            fontsize=fontsize,
            train_set_color='yellow',
            alpha=OTHER_COLOR_ALPHA,
        )
        # "All data" box spans both the training and the hold-out boxes.
        ax = _plot_one_row_of_rectangles(
            ax, n_boxes=1,
            southwest_corner=(0, -2.7 * box_height),
            box_height=box_height,
            box_width=total_width * r1 + holdout_box_width,
            gap_frac=0.0, text=['All data'],
            fontsize=fontsize,
            train_set_color='gray',
            alpha=GRAY_COLOR_ALPHA,
        )

    if avg_cv_score is not None or cv_scores is not None:
        if avg_cv_score is None:
            avg_cv_score = np.mean(cv_scores)
        char = '\n' if n_folds <= 4 else ' '  # too few folds: display text in two lines
        avg_score_txt = 'Mean %s%s= %.4g' % (metric_name, char, avg_cv_score)
    else:
        avg_score_txt = 'Take average'
    _plot_bracket(
        ax, n_folds, total_width, total_height, avg_score_txt, fontsize=fontsize,
    )

    if flip_yaxis:
        ax.invert_yaxis()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.axis('off')

    return fig, ax

#------------------------------------------------------------------------------
def _plot_one_row_of_rectangles(
        ax, n_boxes=5, southwest_corner=(0, 0),
        box_height=0.6, box_width=0.9, gap_frac=0.05,
        show_which_box_as_test=-1,
        train_set_color='green', test_set_color='orange',
        alpha=0.3, text=None, fontsize=None,
):
    '''
    Draw a horizontal row of equally sized boxes on ``ax``.

    Parameters
    ----------
    ax :
        Figure axes object to draw on.
    n_boxes : int
        How many boxes the row contains.
    southwest_corner : (float, float)
        (x, y) coordinate of the south-west corner of the first box, before
        the gap is applied.
    box_height : float
        Height of each box (gap included).
    box_width : float
        Width of each box (gap included).
    gap_frac : float
        Fraction of the box size that is left empty as a gap between boxes.
    show_which_box_as_test : int
        The 0-based index of the box to be drawn as the "test" box. With -1,
        every box is drawn as a "train" box.
    train_set_color : str or tuple<float>
        Face color of the "train" boxes (color name or rgb).
    test_set_color : str or tuple<float>
        Face color of the "test" box (color name or rgb).
    alpha : float
        Opacity of the boxes.
    text : list<str> or ``None``
        One label per box. If ``None``, no labels are drawn.
    fontsize : float
        The font size of the labels.

    Returns
    -------
    ax :
        The same axes object that was passed in.
    '''
    edge_lw = 0.7  # line width of the box outlines

    def _as_collection(rects, face_color):
        return PatchCollection(
            rects, edgecolor='k', lw=edge_lw,
            facecolor=face_color, alpha=alpha,
        )

    train_rects = []
    test_rects = []
    for idx in range(n_boxes):
        row_x, row_y = southwest_corner
        left, bottom, rect_w, rect_h = __add_gap_to_coord(
            row_x + idx * box_width, row_y, box_width, box_height,
            gap_frac=gap_frac,
        )
        bucket = test_rects if idx == show_which_box_as_test else train_rects
        bucket.append(Rectangle((left, bottom), rect_w, rect_h))

    # Build both collections before touching the axes, then add them:
    # "train" boxes first, "test" boxes second.
    train_collection = _as_collection(train_rects, train_set_color)
    test_collection = _as_collection(test_rects, test_set_color)
    for collection in (train_collection, test_collection):
        ax.add_collection(collection)

    if text is None:
        return ax

    __add_text(
        ax, text, n_boxes=n_boxes, southwest_corner=southwest_corner,
        box_height=box_height, box_width=box_width, fontsize=fontsize,
    )
    return ax

#------------------------------------------------------------------------------
def __add_gap_to_coord(x0, y0, width, height, gap_frac=0.05):
    '''
    Shrink a box by ``gap_frac`` of its size while keeping it centered.

    Parameters
    ----------
    x0, y0 : float
        South-west corner of the original box.
    width, height : float
        Size of the original box.
    gap_frac : float
        Fraction of the width/height to remove; half of it is taken from
        each side.

    Returns
    -------
    (x1, y1, new_width, new_height) : tuple
        South-west corner and size of the shrunken box.
    '''
    x_margin = width * gap_frac / 2.0
    y_margin = height * gap_frac / 2.0
    kept_frac = 1 - gap_frac
    return x0 + x_margin, y0 + y_margin, width * kept_frac, height * kept_frac

#------------------------------------------------------------------------------
def __add_text(
        ax, text, n_boxes=5, southwest_corner=(0, 0),
        box_height=0.6, box_width=0.9, fontsize=10,
    ):
    '''
    Write one label at the center of each box of a row.

    Parameters
    ----------
    ax :
        Figure axes object to draw on.
    text : list<str>
        Labels, one per box; its length must equal ``n_boxes``.
    n_boxes : int
        Number of boxes in the row.
    southwest_corner : (float, float)
        South-west corner of the first box of the row.
    box_height, box_width : float
        Size of each box.
    fontsize : float
        The font size of the labels.

    Returns
    -------
    ax :
        The same axes object that was passed in.
    '''
    assert len(text) == n_boxes
    centers_x, centers_y = ___get_mid_points(
        n_boxes=n_boxes, southwest_corner=southwest_corner,
        box_height=box_height, box_width=box_width,
    )
    for idx, (cx, cy) in enumerate(zip(centers_x, centers_y)):
        ax.text(cx, cy, text[idx], ha='center', va='center', fontsize=fontsize)
    return ax

#------------------------------------------------------------------------------
def ___get_mid_points(n_boxes=5, southwest_corner=(0, 0), box_height=0.6, box_width=0.9):
    '''
    Compute the center coordinates of every box in a row.

    Parameters
    ----------
    n_boxes : int
        Number of boxes in the row.
    southwest_corner : (float, float)
        South-west corner of the first box of the row.
    box_height, box_width : float
        Size of each box.

    Returns
    -------
    x_mid : list<float>
        X coordinate of each box center, from the first box to the last.
    y_mid : list<float>
        Y coordinate of each box center (the same value for every box).
    '''
    left, bottom = southwest_corner
    centers_x = [left + box_width / 2.0 + k * box_width for k in range(n_boxes)]
    centers_y = [bottom + box_height / 2.0 for _ in range(n_boxes)]
    return centers_x, centers_y

#------------------------------------------------------------------------------
def _plot_bracket(
        ax, n_boxes, total_width, total_height, text, gap_frac=0.02,
        c='gray', lw=1.0, fontsize=10,
):
    '''
    Draw a bracket just beyond x = ``total_width`` that spans y from 0 to
    ``total_height``, with a short horizontal stub at mid height and a text
    label (rotated by 270 degrees) next to the stub.

    Parameters
    ----------
    ax :
        Figure axes object to draw on.
    n_boxes : int
        Not used by this function.
    total_width : float
        X extent of the area the bracket is attached to.
    total_height : float
        Y extent that the bracket spans.
    text : str
        The label placed next to the bracket.
    gap_frac : float
        Gap between ``total_width`` and the bracket, as a fraction of
        ``total_width``. The length of the short bracket strokes is twice
        that gap.
    c : str or tuple<float>
        Line color.
    lw : float
        Line width.
    fontsize : float
        The font size of the label.
    '''
    stroke = total_width * gap_frac * 2

    # X positions, moving away from the boxes: bracket tips, bracket spine,
    # end of the mid-height stub, and the label anchor.
    tip_x = total_width * (1 + gap_frac)
    spine_x = tip_x + stroke
    stub_end_x = spine_x + stroke
    label_x = stub_end_x + stroke / 2.0  # where to put text

    # Y positions: both ends of the bracket, where the slanted strokes meet
    # the spine, and the mid height.
    y_start = 0
    y_end = total_height
    y_start_inner = y_start + stroke
    y_end_inner = y_end - stroke
    y_middle = (y_start + y_end) / 2.0

    segments = (
        ([tip_x, spine_x], [y_start, y_start_inner]),            # slanted stroke at y = 0
        ([tip_x, spine_x], [y_end, y_end_inner]),                # slanted stroke at y = total_height
        ([spine_x, spine_x], [y_start_inner, y_end_inner]),      # spine
        ([spine_x, stub_end_x], [y_middle, y_middle]),           # stub pointing to the label
    )
    for xs, ys in segments:
        ax.plot(xs, ys, c=c, lw=lw)

    ax.text(
        label_x, y_middle, text,
        ha='left', va='center', fontsize=fontsize, rotation=270,
    )