Source code for plot_utils.multiple_columns

# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from distutils.version import LooseVersion

from . import helper as hlp
from . import colors_and_lines as cl

#%%============================================================================
def missing_value_counts(X, fig=None, ax=None, figsize=None, dpi=100, rot=45):
    '''
    Visualize the number of missing values in each column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Input data set whose every row is an observation and every column is
        a variable.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
        If not provided, the width is 0.5 inch per column of ``X`` and the
        height is 2.5 inches.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    rot : float
        Rotation (in degrees) of the x axis labels.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    null_counts : pandas.Series
        A pandas Series whose every element is the number of missing values
        corresponding to each column of ``X``.

    Raises
    ------
    TypeError
        If ``X`` is neither a pandas DataFrame nor a pandas Series.
    '''
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        raise TypeError('`X` should be pandas DataFrame or Series.')

    if isinstance(X, pd.Series):
        X = pd.DataFrame(X)

    ncol = X.shape[1]
    null_counts = X.isnull().sum()  # a pd Series: number of missing values per column

    if not figsize:
        figsize = (ncol * 0.5, 2.5)

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    x_positions = range(ncol)
    ax.bar(x_positions, null_counts)
    ax.set_xticks(x_positions)

    ha = 'center' if (0 <= rot < 30 or rot == 90) else 'right'
    ax.set_xticklabels(null_counts.index, rotation=rot, ha=ha)

    # Draw on `ax` explicitly: the pyplot-level functions act on the *current*
    # axes, which is not necessarily the axes passed in by the caller.
    ax.set_ylabel('Number of missing values')
    ax.grid(ls=':')
    ax.set_axisbelow(True)

    offset = null_counts.max() * 0.02  # vertical offset for the texts

    # Iterate positionally so that duplicate column names do not break the
    # look-up of each individual count.
    for j, count in enumerate(null_counts.values):
        if count != 0:  # show count of missing values on top of bars
            ax.text(
                j, count + offset, str(count),
                ha='center', va='bottom', rotation=90,
            )

    return fig, ax, null_counts

#%%============================================================================
def histogram3d(
        X, bins=10, fig=None, ax=None, figsize=(8,4), dpi=100,
        elev=30, azim=5, alpha=0.6, data_labels=None,
        plot_legend=True, plot_xlabel=False, color=None,
        dx_factor=0.4, dy_factor=0.8,
        ylabel='Data', zlabel='Counts',
        **legend_kwargs,
):
    '''
    Plot 3D histograms. 3D histograms are best used to compare the distribution
    of more than one set of data.

    Parameters
    ----------
    X : numpy.ndarray, list<list<float>>, pandas.Series, pandas.DataFrame
        Input data. ``X`` can be:
           (1) a 2D numpy array, where each row is one data set;
           (2) a 1D numpy array, containing only one set of data;
           (3) a list of lists, e.g., [[1,2,3],[2,3,4,5],[2,4]], where each
               element corresponds to a data set (can have different lengths);
           (4) a list of 1D numpy arrays.
               [Note: Robustness is not guaranteed for X being a list of
                      2D numpy arrays.]
           (5) a pandas Series, which is treated as a 1D numpy array;
           (6) a pandas DataFrame, where each column is one data set.
    bins : int, list, numpy.ndarray, or pandas.Series
        Bin specifications. Can be:
           (1) An integer, which indicates number of bins;
           (2) An array or list, which specifies bin edges.
               [Note: If an integer is used, the widths of bars across data
                      sets may be different. Thus array/list is recommended.]
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    elev : float
        Elevation of the 3D view point.
    azim : float
        Azimuth angle of the 3D view point (unit: degree).
    alpha : float
        Opacity of bars
    data_labels : list of str
        Names of different datasets, e.g., ['Simulation', 'Measurement'].
        If not provided, generic names ['Dataset #1', 'Dataset #2', ...]
        are used. The data_labels are only shown when either plot_legend or
        plot_xlabel is ``True``.
        If not provided, and X is a pandas DataFrame/Series, the column names
        (or name) of ``X`` are used instead.
    plot_legend : bool
        Whether to show legends or not.
    plot_xlabel : bool
        Whether to show data_labels of each data set on their respective x
        axis position or not.
    color : list<list>, or tuple<tuples>
        Colors of each distributions. Needs to be at least the same length as
        the number of data series in ``X``. Can be RGB colors, HEX colors,
        or valid color names in Python. If ``None``,
        get_colors(N=N, color_scheme='tab10') will be queried.
    dx_factor : float
        Width factor of 3D bars in x direction.
    dy_factor : float
        Width factor of 3D bars in y direction. For example, if ``dy_factor``
        is 0.9, there will be a small gap between bars in y direction.
    ylabel : str
        Label of Y axes.
    zlabel : str
        Labels of Z axes.
    **legend_kwargs : dict
        Keyword arguments passed to ``ax.legend()``. They take precedence
        over the defaults used by this function.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.

    Raises
    ------
    TypeError
        If ``X`` has an unsupported type or dimension, or if ``color`` fails
        the color type check.
    ValueError
        If ``X`` contains non-finite values, or if ``bins`` is invalid.

    Notes
    -----
    x direction :
        Across data sets (i.e., if we have three datasets, the bars will
        occupy three different x values).
    y direction :
        Within dataset.

    Illustration::

                    ^ z
                    |
                    |
                    |
                    |
                    |
                    |--------------------> y
                   /
                  /
                 /
                /
               V  x

    '''
    # Imported for its side effect only: on older matplotlib versions this
    # import is what makes the '3d' projection available.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    #---------  Data type checking for X  -------------------------------------
    if isinstance(X, np.ndarray):
        if X.ndim <= 1:
            N = 1
            X = [list(X)]  # np.array([1,2,3])-->[[1,2,3]], so that X[0]=[1,2,3]
        elif X.ndim == 2:
            N = X.shape[0]  # number of separate distribution to be compared
            X = list(X)  # turn X into a list of numpy arrays
        else:  # 3D numpy array or above
            raise TypeError('If `X` is a numpy array, it should be a 1D or 2D array.')
    elif isinstance(X, pd.Series):
        # Only fall back to the Series name when the user gave no labels
        # (same rule as for DataFrames below); an unnamed Series gets the
        # generic label assigned further down.
        if data_labels is None and X.name is not None:
            data_labels = [X.name]
        X = [list(X)]
        N = 1
    elif isinstance(X, pd.DataFrame):
        N = X.shape[1]
        if data_labels is None:
            data_labels = X.columns  # use column names as data_labels
        X = list(X.values.T)
    else:
        try:
            N = len(X)  # number of separate distribution to be compared
        except TypeError:  # X is a scalar (or otherwise has no length)
            N = 0
        if N < 1:
            raise TypeError(
                '`X` must be a list, 2D numpy array, or pandas Series/DataFrame.'
            )

    #------------  NaN checking for X  ----------------------------------------
    for j in range(N):
        if not np.all(np.isfinite(X[j])):
            raise ValueError(
                f'X[{j}] contains non-finite values (not accepted by `histogram3d()`).'
            )

    if data_labels is None:
        data_labels = ['Dataset #%d' % (j+1) for j in range(N)]  # generic names

    #------------ Prepare figure, axes and colors -----------------------------
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi, '3d')
    ax.view_init(elev, azim)  # set view elevation and angle

    if not color:
        c_ = cl.get_colors(color_scheme='tab10', N=N)  # get a list of colors
    else:
        valid_color_flag, msg = cl._check_color_types(color, N)
        if not valid_color_flag:
            raise TypeError(msg)
        c_ = color

    #------------ Validate bins (does not depend on the data set) -------------
    if isinstance(bins, pd.Series):
        bins = bins.values  # documented as accepted; treat like an array

    bins_are_edges = isinstance(bins, (list, np.ndarray))
    if bins_are_edges:
        if len(bins) == 0:
            raise ValueError('`bins` must not be empty.')
        if len(bins) < 2:
            raise ValueError('`bins` must contain at least two bin edges.')
        edge_bar_width = np.min(np.diff(np.asarray(bins)))  # most narrow bin
    elif isinstance(bins, (int, np.integer)):  # i.e., number of bins
        if bins <= 0:
            raise ValueError('`bins` must be a positive integer.')
    else:
        raise ValueError('`bins` must be an integer, list, or np.ndarray.')

    # `lw` clashes with `alpha` in matplotlib 2.0+, so only pass it to older
    # versions. Only the major version number matters for this decision.
    if int(mpl.__version__.split('.')[0]) >= 2:
        bar3d_kwargs = {'alpha': alpha}
    else:
        bar3d_kwargs = {'alpha': alpha, 'lw': 0.5}

    #------------ Plot one data set at a time ---------------------------------
    proxy = []  # 'proxy' artists to help generate legends
    xpos_list = []
    dx = dx_factor  # width of bars in x direction (across data sets)
    for j in range(N):  # loop through each dataset
        if bins_are_edges:
            bar_width = edge_bar_width
        else:
            bar_width = np.ptp(X[j]) / float(bins)

        dz, ypos_ = np.histogram(X[j], bins)  # calculate counts and bin edges
        ypos = (ypos_[:-1] + ypos_[1:]) / 2.0  # mid-point of all bins
        xpos = np.ones_like(ypos) * (j-0.5)  # location of each data set
        zpos = np.zeros_like(xpos)  # zpos is where the bars stand
        dy = bar_width * dy_factor  # width of bars in y direction (within data set)
        ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color=c_[j], **bar3d_kwargs)
        proxy.append(plt.Rectangle((0, 0), 1, 1, fc=c_[j]))
        xpos_list.append(xpos[0] + dx/2.0)  # x ticks pass through center of bars

    #-------------- Legends, labels, etc. -------------------------------------
    if plot_legend is True:
        final_legend_kwargs = {
            'loc':9, 'fancybox':True, 'framealpha':0.5, 'ncol':N, 'fontsize':10,
        }
        final_legend_kwargs.update(legend_kwargs)  # user settings win
        ax.legend(proxy, data_labels, **final_legend_kwargs)

    if plot_xlabel is True:
        ax.set_xticks(xpos_list)
        ax.set_xticklabels(data_labels)
    else:
        ax.set_xticks([])

    ax.set_ylabel(ylabel)
    ax.set_zlabel(zlabel)
    ax.invert_xaxis()  # make X[0] appear in front, and X[-1] appear at back

    # Use the figure's own method: `plt.tight_layout()` would act on the
    # *current* figure, which may differ from the `fig` passed in.
    fig.tight_layout(pad=0.3)

    return fig, ax

#%%============================================================================
def correlation_matrix(
        X, color_map='RdBu_r', fig=None, ax=None, figsize=None,
        dpi=100, variable_names=None, rot=45, scatter_plots=False,
):
    '''
    Plot correlation matrix of a dataset ``X``, whose columns are different
    variables (or a sample of a certain random variable).

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        The data set.
    color_map : str or matplotlib.colors.Colormap
        The color scheme to show high, low, negative high correlations. Valid
        names are listed in https://matplotlib.org/users/colormaps.html. Using
        diverging color maps is recommended: PiYG, PRGn, BrBG, PuOr, RdGy,
        RdBu, RdYlBu, RdYlGn, Spectral, coolwarm, bwr, seismic.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
        If not provided, every variable takes 0.7 inches in each direction.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    variable_names : list<str>
        Names of the variables in ``X``. If ``X`` is a pandas DataFrame, this
        argument is not needed: column names of ``X`` is automatically used as
        variable names. If ``X`` is a numpy array, and this argument is not
        provided, then ``X``'s column indices are used. The length of
        ``variable_names`` should match the number of columns in ``X``; if
        not, a warning message will be printed (not an error).
    rot : float
        The rotation of the x axis labels, in degrees.
    scatter_plots : bool
        Whether or not to show the scatter plots of pairs of variables.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    correlations : pandas.DataFrame
        The correlation matrix.

    Raises
    ------
    TypeError
        If ``X`` is neither a numpy array nor a pandas DataFrame.
    '''
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    if not isinstance(X, (np.ndarray, pd.DataFrame)):
        raise TypeError('`X` must be a numpy array or a pandas DataFrame.')

    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, copy=True)

    correlations = X.corr()
    variable_list = list(correlations.columns)
    nr = len(variable_list)

    if not figsize:
        figsize = (0.7 * nr, 0.7 * nr)  # every column of X takes 0.7 inches

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    im = ax.matshow(correlations, vmin=-1, vmax=1, cmap=color_map)

    # Carve a thin axes out of the right side of `ax` to host the color bar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="3%", pad=0.08)
    cb = fig.colorbar(im, cax=cax)  # 'cb' is a Colorbar instance
    cb.set_label("Pearson's correlation")

    ticks = np.arange(nr)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    if variable_names is None:
        variable_names = variable_list

    if len(variable_names) != len(variable_list):
        print('*****  Warning: variable_names may not be valid!  *****')

    ha = 'center' if (0 <= rot < 30 or rot == 90) else 'left'
    ax.set_xticklabels(variable_names, rotation=rot, ha=ha)
    ax.set_yticklabels(variable_names)

    if scatter_plots:
        # Drawn by pandas in a separate figure of its own
        pd.plotting.scatter_matrix(X, figsize=(1.8 * nr, 1.8 * nr))

    return fig, ax, correlations

#%%============================================================================
def violin_plot(
        X, fig=None, ax=None, figsize=None, dpi=100, nan_warning=False,
        showmeans=True, showextrema=False, showmedians=False, vert=True,
        data_names=[], rot=45, name_ax_label=None, data_ax_label=None,
        sort_by=None, title=None, **violinplot_kwargs,
):
    '''
    Generate violin plots for each data set within ``X``.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray, or dict
        The data to be visualized. It can be of the following types:

        - pandas.DataFrame:
            + Each column contains a set of data
        - pandas.Series:
            + Contains only one set of data
        - numpy.ndarray:
            + 1D numpy array: only one set of data
            + 2D numpy array: each column contains a set of data
            + Higher dimensional numpy array: not allowed
        - dict:
            + Each key-value pair is one set of data
        - list of lists:
            + Each sub-list is a data set

        Note that the NaN values in the data are implicitly excluded.

    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    nan_warning : bool
        Whether to show a warning if there are NaN values in the data.
    showmeans : bool
        Whether to show the mean values of each data group.
    showextrema : bool
        Whether to show the extrema of each data group.
    showmedians : bool
        Whether to show the median values of each data group.
    vert : bool
        Whether to show the violins as vertical.
    data_names : list<str>, ``[]``, or ``None``
        The names of each data set, to be shown as the axis tick label of each
        data set. If ``[]`` or ``None``, it will be determined automatically.
        If ``X`` is a:
            - numpy.ndarray:
                + data_names = ['data_0', 'data_1', 'data_2', ...]
            - pandas.Series:
                + data_names = X.name
            - pd.DataFrame:
                + data_names = list(X.columns)
            - dict:
                + data_names = list(X.keys())
    rot : float
        The rotation (in degrees) of the data_names when shown as the tick
        labels. If vert is False, rot has no effect.
    name_ax_label : str
        The label of the "name axis". ("Name axis" is the axis along which
        different violins are presented.)
    data_ax_label : str
        The labels of the "data axis". ("Data axis" is the axis along which
        the data values are presented.)
    sort_by : {'name', 'mean', 'median', ``None``}
        Option to sort the different data groups in ``X`` in the violin plot.
        ``None`` means no sorting, keeping the violin plot order as provided;
        'mean' and 'median' mean sorting the violins according to the
        mean/median values of each data group; 'name' means sorting the violins
        according to the names of the groups.
    title : str
        The title of the plot.
    **violinplot_kwargs : dict
        Other keyword arguments to be passed to ``matplotlib.pyplot.violinplot()``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    '''
    # ``None`` is documented as equivalent to ``[]``; normalize it here so
    # that downstream length checks never see ``None``. (The shared default
    # list is never mutated, only read.)
    if data_names is None:
        data_names = []

    _check_violin_plot_or_hist_multi_input(X, data_names, nan_warning)

    data, data_names, _ = _preprocess_violin_plot_data(
        X, data_names=data_names, nan_warning=nan_warning,
    )

    data_with_names = _prepare_violin_plot_data(
        data, data_names, sort_by=sort_by, vert=vert,
    )

    fig, ax = _violin_plot_helper(
        data_with_names, fig=fig, ax=ax,
        figsize=figsize, dpi=dpi, showmeans=showmeans,
        showextrema=showextrema,  # was previously dropped, hence ignored
        showmedians=showmedians, vert=vert, rot=rot,
        data_ax_label=data_ax_label,
        name_ax_label=name_ax_label,
        title=title, **violinplot_kwargs,
    )

    return fig, ax

#%%============================================================================
def _check_violin_plot_or_hist_multi_input(X, data_names, nan_warning):
    '''
    Validate the arguments shared by violin_plot() and hist_multi().

    Parameters
    ----------
    X : object
        The raw data. Must be a pandas DataFrame/Series, a numpy array, a
        dict, or a list whose elements are all lists.
    data_names : list or ``None``
        The names of the data sets.
    nan_warning : bool
        If ``True`` and ``X`` is a pandas object or a numpy array containing
        NaN values, a warning message is printed.

    Raises
    ------
    TypeError
        If ``X`` or ``data_names`` has an unsupported type, or if ``X`` is a
        list containing a non-list element.
    '''
    accepted_types = (pd.DataFrame, pd.Series, np.ndarray, dict, list)
    if not isinstance(X, accepted_types):
        raise TypeError(
            '`X` must be pandas.DataFrame, pandas.Series, np.ndarray, dict, or list.'
        )

    if data_names is not None and not isinstance(data_names, list):
        raise TypeError('`data_names` must be a list of names, empty list, or None.')

    if nan_warning:
        # Only pandas objects and numpy arrays are inspected for NaN here
        if isinstance(X, (pd.DataFrame, pd.Series)):
            contains_nan = X.isnull().any().any()
        elif isinstance(X, np.ndarray):
            contains_nan = np.isnan(X).any()
        else:
            contains_nan = False

        if contains_nan:
            print('WARNING in violin_plot(): X contains NaN values.')

    if isinstance(X, list):
        for element in X:
            if not isinstance(element, list):
                raise TypeError('If `X` is a list, it must be a list of lists.')

#%%============================================================================
def _preprocess_violin_plot_data(X, data_names=None, nan_warning=False):
    '''
    Helper function. Preprocess raw data (``X``) for violin plot or
    multi-histogram plot.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray, list, or dict
        The raw data. Any other type is treated like a dict (the type is
        expected to have been validated by the caller).
    data_names : list<str>, ``[]``, or ``None``
        The names of the data sets. If empty or ``None``, names are derived
        from ``X``: the Series name, the DataFrame columns, the dict keys, or
        generic names 'data_0', 'data_1', ... for numpy arrays and lists.
    nan_warning : bool
        Whether to print a warning when a data set within a dict or a list
        contains NaN values.

    Returns
    -------
    data : list<numpy.ndarray>
        One array per data set, with missing/non-finite values removed.
    data_names : list
        The name of each data set (same length as ``data``).
    n_datasets : int
        The number of data sets.

    Raises
    ------
    hlp.DimensionError
        If ``X`` is a numpy array with more than two dimensions.
    hlp.LengthError
        If ``data_names`` is non-empty and its length differs from the number
        of data sets.
    TypeError
        If a value of the dict ``X`` is not a pandas Series, a 1D numpy
        array, or a list.
    '''
    if isinstance(X, pd.Series):
        # Wrap in a list: a Series is ONE data set, not one per element
        data = [X.dropna().values]
        default_names = [X.name]
    elif isinstance(X, pd.DataFrame):
        data = [X.iloc[:, j].dropna().values for j in range(X.shape[1])]
        default_names = list(X.columns)
    elif isinstance(X, np.ndarray):  # use columns
        if X.ndim == 1:  # 1D numpy array --> a single data set
            data = [X[np.isfinite(X)].copy()]
        elif X.ndim == 2:  # 2D numpy array --> every column is a data set
            data = []
            for j in range(X.shape[1]):
                column = X[:, j]
                data.append(column[np.isfinite(column)])  # remove NaN values
        else:
            raise hlp.DimensionError('`X` should be a 1D or 2D numpy array.')
        default_names = ['data_' + str(j) for j in range(len(data))]
    elif isinstance(X, list):  # list of lists
        data = []
        for j, x in enumerate(X):
            x_ = np.asarray(x)
            if nan_warning and np.isnan(x_).any():
                print(
                    'WARNING in violin_plot() or hist_multi(): '
                    'X[%d] contains NaN values.' % j
                )
            data.append(x_[np.isfinite(x_)])  # remove NaN values
        default_names = ['data_' + str(j) for j in range(len(data))]
    else:  # dict --> extract its values
        data = []
        default_names = []
        for key in X:
            x = X[key]
            default_names.append(key)
            if isinstance(x, pd.Series):
                x_ = x.values
            elif isinstance(x, np.ndarray) and x.ndim == 1:
                x_ = x.copy()
            elif isinstance(x, list):
                x_ = np.array(x)
            else:
                # `(key,)` keeps the formatting valid even for tuple keys
                raise TypeError(
                    'Unknown data type in X["%s"]. Should be either '
                    'pandas.Series, 1D numpy array, or a list.' % (key,)
                )
            if nan_warning and np.isnan(x_).any():
                print(
                    'WARNING in violin_plot() or hist_multi(): '
                    'X[%s] contains NaN values.' % (key,)
                )
            data.append(x_[np.isfinite(x_)])

    n_datasets = len(data)

    if not data_names:  # [] or None --> derive the names from X
        data_names = default_names
    elif len(data_names) != n_datasets:
        raise hlp.LengthError('Length of `data_names` must equal the number of datasets.')

    return data, data_names, n_datasets

#%%============================================================================
def _prepare_violin_plot_data(data, data_names, sort_by=None, vert=False):
    '''
    Pair every data set with its name and return them as an ordered mapping,
    arranged according to ``sort_by``.

    Parameters
    ----------
    data : list<list>
        All the data. Each element of ``data`` is an array of data points.
    data_names : list<str>
        The names of the data. It should have the same length as ``data``.
    sort_by : [None, 'name', 'mean', 'median']
        The method by which to sort the data sets.  If ``None``, the original
        order of ``data`` is kept when ``vert`` is ``True`` and reversed when
        ``vert`` is ``False``.
    vert : bool
        Whether the plot is vertical. When ``False``, the ordering is
        reversed (descending sort, or reversed original order).

    Returns
    -------
    data_with_names_dict : OrderedDict<str, list>
        A mapping from data names to data, ordered by the specification in
        ``sort_by``.

    Raises
    ------
    NameError
        If ``sort_by`` is not one of the accepted options.
    '''
    from collections import OrderedDict

    assert(len(data) == len(data_names))

    name_data_pairs = list(zip(data_names, data))
    descending = not vert

    # Each sorting option and the key it sorts the (name, data) pairs by
    sort_options = (
        ('name', lambda pair: pair[0]),
        ('mean', lambda pair: np.mean(pair[1])),
        ('median', lambda pair: np.median(pair[1])),
    )

    if not sort_by:
        # For "not vert" plots, we want the first data set on top
        ordered_pairs = name_data_pairs[::-1] if descending else list(name_data_pairs)
    else:
        for option, key_func in sort_options:
            if sort_by == option:
                ordered_pairs = sorted(
                    name_data_pairs, key=key_func, reverse=descending,
                )
                break
        else:  # no option matched
            raise NameError(
                "`sort_by` must be one of {`None`, 'name', 'mean', "
                "'median'}, not '%s'." % sort_by
            )

    return OrderedDict(ordered_pairs)

#%%============================================================================
def _violin_plot_helper(
        data_with_names, fig=None, ax=None, figsize=None,
        dpi=100, showmeans=True, showextrema=False,
        showmedians=False, vert=False, rot=45,
        data_ax_label=None, name_ax_label=None, title=None,
        **violinplot_kwargs,
):
    '''
    Draw the violins for violin_plot() and style the axes.

    Parameters
    ----------
    data_with_names : OrderedDict<str, list>
        A dictionary whose keys are the names of the categories and values are
        the actual data.
    (Other parameters have the same meaning as in violin_plot().)

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object returned by the axes styling helper.
    '''
    # Split the mapping into two parallel lists (same iteration order)
    data_names = list(data_with_names.keys())
    data = list(data_with_names.values())
    n_datasets = len(data)

    if not figsize:
        # The side along which the violins line up grows with their number
        along_names = max(3, 0.5 * n_datasets)
        along_data = 3.5
        if vert:
            figsize = (along_names, along_data)
        else:
            figsize = (along_data, along_names)

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    ax.violinplot(
        data,
        vert=vert,
        showmeans=showmeans,
        showextrema=showextrema,
        showmedians=showmedians,
        **violinplot_kwargs,
    )

    ax = hlp.__axes_styling_helper(
        ax, vert, rot, data_names, n_datasets,
        data_ax_label, name_ax_label, title,
    )

    return fig, ax

#%%============================================================================
def hist_multi(
        X, bins=10, fig=None, ax=None, figsize=None, dpi=100,
        nan_warning=False, showmeans=True, showmedians=False, vert=True,
        data_names=[], rot=45, name_ax_label=None, data_ax_label=None,
        sort_by=None, title=None, show_vals=True, show_pct_diff=False,
        baseline_data_index=0, legend_loc='best',
        show_counts_on_data_ax=True, **extra_kwargs,
):
    '''
    Generate multiple histograms, one for each data set within ``X``.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray, or dict
        The data to be visualized. It can be of the following types:

        - pandas.DataFrame:
            + Each column contains a set of data
        - pandas.Series:
            + Contains only one set of data
        - numpy.ndarray:
            + 1D numpy array: only one set of data
            + 2D numpy array: each column contains a set of data
            + Higher dimensional numpy array: not allowed
        - dict:
            + Each key-value pair is one set of data
        - list of lists:
            + Each sub-list is a data set

        Note that the NaN values in the data are implicitly excluded.

    bins : int or sequence or str
        If an integer is given, the whole range of data (i.e., all the numbers
        within ``X``) is divided into ``bins`` segments. If sequence or str,
        they will be passed to the ``bins`` argument of ``matplotlib.pyplot.hist()``.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize: (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    nan_warning : bool
        Whether to show a warning if there are NaN values in the data.
    showmeans : bool
        Whether to show the mean values of each data group.
    showmedians : bool
        Whether to show the median values of each data group.
    vert : bool
        Whether to show the "base" of the histograms as vertical.
    data_names : list<str>, ``[]``, or ``None``
        The names of each data set, to be shown as the axis tick label of each
        data set. If ``[]`` or ``None``, it will be determined automatically.
        If ``X`` is a:
            - numpy.ndarray:
                + data_names = ['data_0', 'data_1', 'data_2', ...]
            - pandas.Series:
                + data_names = X.name
            - pd.DataFrame:
                + data_names = list(X.columns)
            - dict:
                + data_names = list(X.keys())
    rot : float
        The rotation (in degrees) of the data_names when shown as the tick
        labels. If vert is False, rot has no effect.
    name_ax_label : str
        The label of the "name axis". ("Name axis" is the axis along which
        different histograms are presented.)
    data_ax_label : str
        The labels of the "data axis". ("Data axis" is the axis along which
        the data values are presented.)
    sort_by : {'name', 'mean', 'median', ``None``}
        Option to sort the different data groups in ``X`` in the plot.
        ``None`` means no sorting, keeping the histogram order as provided;
        'mean' and 'median' mean sorting the histograms according to the
        mean/median values of each data group; 'name' means sorting the
        histograms according to the names of the groups.
    title : str
        The title of the plot.
    show_vals : bool
        Whether to show mean and/or median values along the mean/median bars.
        Only effective if ``showmeans`` and/or ``showmedians`` are turned on.
    show_pct_diff : bool
        Whether to show percent difference of mean and/or median values
        between different data sets. Only effective when ``show_vals`` is
        set to ``True``.
    baseline_data_index : int
        Which data set is considered the "baseline" when showing percent
        differences.
    legend_loc : str
        The location specification for the legend.
    show_counts_on_data_ax : bool
        Whether to show counts besides the histograms.
    **extra_kwargs : dict
        Other keyword arguments to be passed to ``matplotlib.pyplot.bar()``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.

    Raises
    ------
    ValueError
        If ``bins`` is an integer that is not positive.
    '''
    # ``None`` is documented as equivalent to ``[]``; normalize it here so
    # that downstream length checks never see ``None``. (The shared default
    # list is never mutated, only read.)
    if data_names is None:
        data_names = []

    _check_violin_plot_or_hist_multi_input(X, data_names, nan_warning)

    data, data_names, _ = _preprocess_violin_plot_data(
        X, data_names=data_names, nan_warning=nan_warning,
    )

    data_with_names = _prepare_violin_plot_data(
        data, data_names, sort_by=sort_by, vert=vert,
    )

    if isinstance(bins, (int, np.integer)):
        if bins <= 0:
            raise ValueError('`bins` must be a positive integer.')
        # Use common bin edges spanning the range of ALL data sets, so that
        # the histograms are directly comparable with each other.
        all_values = np.concatenate([np.ravel(data_i) for data_i in data])
        # `bins` segments require `bins + 1` edges
        bins = np.linspace(
            np.min(all_values), np.max(all_values), num=bins + 1, endpoint=True,
        )

    fig, ax = _hist_multi_helper(
        data_with_names, bins=bins, fig=fig, ax=ax,
        figsize=figsize, dpi=dpi, showmeans=showmeans,
        showmedians=showmedians, vert=vert, rot=rot,
        data_ax_label=data_ax_label,
        name_ax_label=name_ax_label,
        title=title, show_vals=show_vals,
        show_pct_diff=show_pct_diff,
        baseline_data_index=baseline_data_index,
        legend_loc=legend_loc,
        show_counts_on_data_ax=show_counts_on_data_ax,
        **extra_kwargs,
    )

    return fig, ax

#%%============================================================================
def _hist_multi_helper(
        data_with_names, bins=10, fig=None, ax=None,
        figsize=None, dpi=100, showmeans=True, showmedians=False,
        vert=False, rot=45, data_ax_label=None,
        name_ax_label=None, show_legend=True, title=None,
        show_vals=True, show_pct_diff=False,
        baseline_data_index=0, legend_loc='best',
        show_counts_on_data_ax=True,
        **extra_kwargs,
):
    '''
    Draw one height-normalized histogram per data set, side by side.

    Data set number ``i`` (0-based) is drawn inside the unit-wide band that
    starts at ``i + 1`` on the "name" axis. Within each band the tallest bar
    is scaled to 80% of the band, so bar heights are only comparable within
    one data set; the actual counts can be shown on a secondary axis.

    Parameters
    ----------
    data_with_names : OrderedDict<str, list>
        A dictionary whose keys are the names of the categories and values are
        the actual data. Iteration order determines plotting order.
    bins : int or sequence
        Passed to ``numpy.histogram()`` for every data set. A sequence of
        bin edges may be non-uniform.
    fig, ax, figsize, dpi :
        Forwarded to ``hlp._process_fig_ax_objects()``. If ``figsize`` is
        falsy, a size is derived from the number of data sets and ``vert``.
    showmeans, showmedians : bool
        Whether to draw a solid (mean) and/or dashed (median) line across
        each data set's band.
    vert : bool
        If ``True``, the data values run along the y axis and the data sets
        are laid out along the x axis; otherwise the other way round.
    rot, data_ax_label, name_ax_label, title :
        Forwarded to ``hlp.__axes_styling_helper()``.
    show_legend : bool
        Whether to call ``ax.legend()``.
    show_vals : bool
        Whether to annotate the mean/median lines with their values.
    show_pct_diff : bool
        Whether the annotations also show the percent difference relative to
        the baseline data set. Only effective when ``show_vals`` is ``True``.
    baseline_data_index : int
        Index of the baseline data set, in ``[0, n_datasets)``.
    legend_loc : str
        Passed as ``loc`` to ``ax.legend()``.
    show_counts_on_data_ax : bool
        Whether to add a twin axis whose ticks show 0 and the maximum bin
        count of every data set.
    **extra_kwargs : dict
        Extra keyword arguments forwarded to ``ax.bar()`` / ``ax.barh()``.
        They take precedence over this function's default bar styling
        (``align``, ``alpha``, ``lw``, ``ec``); the ``bottom``/``left``
        offset that positions each data set is always set by this function.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The primary axes object (the twin "Counts" axis is not returned).

    Raises
    ------
    TypeError
        If values are being annotated and ``baseline_data_index`` is not an
        ``int``.
    ValueError
        If values are being annotated and ``baseline_data_index`` is outside
        ``[0, n_datasets)``.
    '''
    data_names = list(data_with_names.keys())
    data = list(data_with_names.values())
    n_datasets = len(data)

    if not figsize:
        l1 = max(3, 1.0 * n_datasets)
        l2 = 3.5
        figsize = (l1, l2) if vert else (l2, l1)

    # Tallest bar of each data set fills 80% of that data set's unit band.
    MAX_RELATIVE_BAR_HEIGHT = 0.8

    # Default bar styling. Each entry lists all spellings matplotlib accepts
    # for one property, so a user-supplied alias (e.g. ``linewidth``) replaces
    # the default instead of clashing with it.
    BAR_STYLE_DEFAULTS = (
        (('align',), 'center'),
        (('alpha',), 0.75),
        (('lw', 'linewidth'), 0.5),
        (('ec', 'edgecolor'), 'w'),
    )

    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)

    # Flipped compared to violin plot: "horizontal" layout uses vertical bars.
    plot_bar_func = ax.barh if vert else ax.bar
    offset_key = 'left' if vert else 'bottom'

    mbl = 0.8  # mean/median bar length

    mean_vals = []
    median_vals = []
    max_count_each_dataset = []
    for i, data_i in enumerate(data):
        freq_bar_heights, bin_edges = np.histogram(data_i, bins=bins)
        max_bar_height = freq_bar_heights.max()
        max_count_each_dataset.append(max_bar_height)

        # Per-bin widths, so that non-uniform bin edges are drawn correctly.
        bar_widths = np.diff(bin_edges)
        bin_centers = bin_edges[:-1] + bar_widths / 2.0

        if max_bar_height > 0:
            bar_heights = (
                freq_bar_heights / max_bar_height * MAX_RELATIVE_BAR_HEIGHT
            )
        else:
            # No sample falls inside the bins: draw zero-height bars rather
            # than dividing by zero.
            bar_heights = np.zeros(len(freq_bar_heights))

        bar_kwargs = dict(extra_kwargs)
        for names, default in BAR_STYLE_DEFAULTS:
            if not any(name in bar_kwargs for name in names):
                bar_kwargs[names[0]] = default
        bar_kwargs[offset_key] = i + 1  # shift bars into this data set's band

        plot_bar_func(
            bin_centers, bar_heights,
            bar_widths * 0.9,  # leave some space between bars
            **bar_kwargs,
        )

        if showmeans:
            mean_val = np.mean(data_i)
            mean_vals.append(mean_val)
            label_1 = 'mean' if i == 0 else None  # only one legend entry
            if vert:
                ax.plot(
                    [i+1, i+1+mbl], [mean_val] * 2, c='k', label=label_1,
                    alpha=0.6,
                )
            else:
                ax.plot(
                    [mean_val] * 2, [i+1, i+1+mbl], c='k', label=label_1,
                    alpha=0.6,
                )
        if showmedians:
            median_val = np.median(data_i)
            median_vals.append(median_val)
            label_2 = 'median' if i == 0 else None  # only one legend entry
            if vert:
                ax.plot(
                    [i+1, i+1+mbl], [median_val] * 2, c='k', ls='--',
                    alpha=0.6, label=label_2,
                )
            else:
                ax.plot(
                    [median_val] * 2, [i+1, i+1+mbl], c='k', ls='--',
                    alpha=0.6, label=label_2,
                )

    #~~~~~~~~~~ Print values of mean and/or median ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    if show_vals and (len(mean_vals) > 0 or len(median_vals) > 0):
        # Pad whichever statistic was not computed so both lists can be
        # indexed by data set below.
        if len(mean_vals) == 0:
            mean_vals = [None] * n_datasets
        # END
        if len(median_vals) == 0:
            median_vals = [None] * n_datasets
        # END

        bdi = baseline_data_index
        if not isinstance(bdi, int):
            raise TypeError('`baseline_data_index` should be an int.')
        # END
        # Negative indices are rejected too: they would never equal the loop
        # index `i`, so the baseline itself would be annotated with a
        # percent difference.
        if not 0 <= bdi < n_datasets:
            raise ValueError(f'`baseline_data_index` not in [0, {n_datasets}).')
        # END

        def _annotate(value, ax, i, base_val, vert=True, below=True):
            # Write `value` next to its line in the band of data set `i`;
            # `below` selects the lower-value side of the line.
            if i != bdi:
                if base_val != 0:
                    pct_diff = (value - base_val) / abs(base_val) * 100
                else:  # percent difference undefined for a zero baseline
                    pct_diff = None
                # END
            else:  # this value is the base value; no need to calculate pct_diff
                pct_diff = None
            # END

            if not show_pct_diff or pct_diff is None:
                fmt = '%.3g' if abs(value) < 10 else '%.2f'
                txt = fmt % value
            else:
                fmt1 = '%.3g' if abs(value) < 10 else '%.2f'
                fmt2 = '%.1f'
                sign = '+' if pct_diff > 0 else ''
                txt = f'{fmt1} ({sign}{fmt2}%%)' % (value, pct_diff)
            # END

            if vert:
                y_span = ax.get_ylim()[1] - ax.get_ylim()[0]
                gap = y_span / 50
                x_position = i + 1.5
                y_position = value - gap if below else value + gap
                ha = 'center'
                va = 'top' if below else 'bottom'
            else:
                x_span = ax.get_xlim()[1] - ax.get_xlim()[0]
                gap = x_span / 50
                x_position = value - gap if below else value + gap
                y_position = i + 1.5
                ha = 'right' if below else 'left'
                va = 'center'
            # END

            ax.annotate(
                txt, xy=(x_position, y_position), xycoords='data',
                ha=ha, va=va,
            )
            return

        for i in range(n_datasets):
            mean_val = mean_vals[i]
            median_val = median_vals[i]
            # When both statistics are shown, put the larger one's text on
            # the higher-value side and the smaller one's on the other side.
            if median_val is None:
                _annotate(mean_val, ax, i, mean_vals[bdi], vert=vert, below=False)
            elif mean_val is None:
                _annotate(median_val, ax, i, median_vals[bdi], vert=vert, below=False)
            elif mean_val > median_val:
                _annotate(mean_val, ax, i, mean_vals[bdi], vert=vert, below=False)
                _annotate(median_val, ax, i, median_vals[bdi], vert=vert, below=True)
            elif mean_val < median_val:
                _annotate(mean_val, ax, i, mean_vals[bdi], vert=vert, below=True)
                _annotate(median_val, ax, i, median_vals[bdi], vert=vert, below=False)
            else:  # mean val = median val
                _annotate(mean_val, ax, i, mean_vals[bdi], vert=vert, below=True)
            # END IF
        # END FOR
    # END IF
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    if show_legend:
        ax.legend(loc=legend_loc)
    ax = hlp.__axes_styling_helper(
        ax, vert, rot, data_names, n_datasets,
        data_ax_label, name_ax_label, title,
    )

    if show_counts_on_data_ax:

        def get_ticks_and_labels(
            n_datasets_, max_count_each_dataset, max_relative_bar_height,
        ):
            # Two ticks per data set: the band's base (count 0) and the top
            # of its tallest bar (the max count); plus one unlabeled tick at
            # the end of the last band.
            ticks = []
            tick_labels = []
            for i in range(n_datasets_):
                ticks.extend([1 + i, 1 + i + max_relative_bar_height])
                tick_labels.extend([0, max_count_each_dataset[i]])
            # END
            ticks.append(1 + n_datasets_)
            tick_labels.append('')
            return ticks, tick_labels

        ticks, tick_labels = get_ticks_and_labels(
            n_datasets, max_count_each_dataset, MAX_RELATIVE_BAR_HEIGHT,
        )

        # Both axes get identical limits so the count ticks line up with
        # the bars drawn on the primary axes.
        if not vert:
            ax2 = ax.twinx()
            ax2.set_ylabel('Counts')
            ax2.set_yticks(ticks)
            ax2.set_yticklabels(tick_labels)
            ax.set_ylim(1, n_datasets + 1)
            ax2.set_ylim(1, n_datasets + 1)
        else:
            ax2 = ax.twiny()
            ax2.set_xlabel('Counts')
            ax2.set_xticks(ticks)
            ax2.set_xticklabels(tick_labels, rotation=45)
            ax.set_xlim(1, n_datasets + 1)
            ax2.set_xlim(1, n_datasets + 1)

    return fig, ax