# -*- coding: utf-8 -*-
import itertools
import numpy as np
import pandas as pd
from scipy import stats
from . import misc
from . import helper as hlp
from . import multiple_columns as mc
from . import colors_and_lines as cl
#%%============================================================================
def category_means(
        categorical_array, continuous_array, fig=None, ax=None,
        figsize=None, dpi=100, title=None, xlabel=None, ylabel=None,
        rot=0, dropna=False, show_stats=True, sort_by='name',
        vert=True, plot_violins=True, **extra_kwargs,
):
    '''
    Summarize the mean values of entries of ``continuous_array`` corresponding
    to each distinct category in ``categorical_array``, and show a violin plot
    to visualize it. The violin plot will show the distribution of values in
    ``continuous_array`` corresponding to each category in
    ``categorical_array``.

    Also, a one-way ANOVA test (H0: different categories in
    ``categorical_array`` yield the same average values in
    ``continuous_array``) is performed, and F statistics and p-value are
    returned.

    Parameters
    ----------
    categorical_array : list, numpy.ndarray, or pandas.Series
        A vector of categorical values.
    continuous_array : list, numpy.ndarray, or pandas.Series
        The target variable whose values correspond to the values in
        ``categorical_array``. Must have the same length as
        ``categorical_array``. It is natural that it contains continuous
        values, but if it contains categorical values (expressed as integers,
        not strings), this function should also work.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    title : str
        The title of the violin plot, usually the name of
        ``categorical_array``.
    xlabel : str
        The label for the x axis (i.e., categories) of the violin plot. If
        ``None`` and ``categorical_array`` is a pandas Series, use the 'name'
        attribute of ``categorical_array`` as xlabel.
    ylabel : str
        The label for the y axis (i.e., average ``continuous_array`` values)
        of the violin plot. If ``None`` and ``continuous_array`` is a pandas
        Series, use the 'name' attribute of ``continuous_array`` as ylabel.
    rot : float
        The rotation (in degrees) of the x axis labels.
        NOTE(review): this parameter is currently not forwarded to the
        underlying plotting helpers -- confirm whether it should take effect.
    dropna : bool
        Whether or not to exclude N/A records in the data.
    show_stats : bool
        Whether or not to show the statistical test results (F statistics
        and p-value) on the figure.
    sort_by : {'name', 'mean', 'median', None}
        Option to arrange the different categories in ``categorical_array`` in
        the violin plot. ``None`` means no sorting, i.e., using the hashed
        order of the category names; 'mean' and 'median' mean sorting the
        violins according to the mean/median values of each category; 'name'
        means sorting the violins according to the category names.
    vert : bool
        Whether to show the violins as vertical.
    plot_violins : bool
        If ``True``, use violin plots to illustrate the distribution of
        groups. Otherwise, use multi-histogram (hist_multi()).
    **extra_kwargs :
        Keyword arguments to be passed to plt.violinplot() or hist_multi().
        (https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.violinplot.html)
        Note that this subroutine overrides the default behavior of
        violinplot: showmeans is overriden to True and showextrema to False.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    mean_values : dict
        A dictionary whose keys are the categories in ``categorical_array``,
        and whose values are the corresponding mean values in
        ``continuous_array``.
    F_test_result : tuple<float>
        A tuple in the order of (F_stat, p_value), where F_stat is the
        computed F-value of the one-way ANOVA test, and p_value is the
        associated p-value from the F-distribution.
    '''
    x = categorical_array
    y = continuous_array
    if not isinstance(x, hlp._array_like):
        raise TypeError(
            '`categorical_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if not isinstance(y, hlp._array_like):
        raise TypeError(
            '`continuous_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `categorical_array` and `continuous_array` must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`categorical_array` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`continuous_array` must be a 1D numpy array.')
    # Infer missing axis labels from pandas Series names. When the violins
    # are horizontal (vert=False) the two axes swap roles, hence the swapped
    # names. getattr() guards against the companion array not being a pandas
    # Series (plain lists/ndarrays have no "name" attribute).
    if not xlabel and isinstance(x, pd.Series):
        xlabel = x.name if vert else getattr(y, 'name', None)
    if not ylabel and isinstance(y, pd.Series):
        ylabel = y.name if vert else getattr(x, 'name', None)
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    if not dropna:
        x = x.fillna('N/A')  # input arrays are unchanged
    x_classes = x.unique()
    x_classes_copy = list(x_classes.copy())
    y_values = []  # each element in y_values represents the values of a category
    mean_values = {}  # each entry in the dict corresponds to a category
    for cat in x_classes:
        cat_index = (x == cat)
        y_cat = y[cat_index]
        mean_values[cat] = y_cat.mean(skipna=True)
        if not y_cat.isnull().all():
            # convert to list to avoid 'reshape' deprecation warning
            y_values.append(list(y_cat[np.isfinite(y_cat)]))
        else:  # all the y values in the current category are NaN
            print('*****WARNING: category %s contains only NaN values.*****' % str(cat))
            x_classes_copy.remove(cat)
    F_stat, p_value = stats.f_oneway(*y_values)  # pass every group into f_oneway()
    if plot_violins:
        # Override default behavior of violinplot: hide extrema, show means.
        extra_kwargs.setdefault('showextrema', False)
        extra_kwargs.setdefault('showmeans', True)
    data_names = [str(_) for _ in x_classes_copy]
    if plot_violins:
        fig, ax = mc.violin_plot(
            y_values, fig=fig, ax=ax, figsize=figsize,
            dpi=dpi, data_names=data_names,
            sort_by=sort_by, vert=vert, **extra_kwargs,
        )
    else:
        fig, ax = mc.hist_multi(
            y_values, bins='auto',
            fig=fig, ax=ax, figsize=figsize, dpi=dpi,
            data_names=data_names, sort_by=sort_by, vert=vert,
            show_legend=False, **extra_kwargs,
        )
    if show_stats:
        # Anchor the annotation top-left for vertical violins, top-right
        # for horizontal ones, so it stays clear of the plotted data.
        ha = 'left' if vert else 'right'
        xy = (0.05, 0.92) if vert else (0.95, 0.92)
        ax.annotate(
            'F=%.2f, p_val=%.2g' % (F_stat, p_value), ha=ha,
            xy=xy, xycoords='axes fraction',
        )
    if title:
        ax.set_title(title)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    return fig, ax, mean_values, (F_stat, p_value)
#%%============================================================================
def positive_rate(
        categorical_array, two_classes_array, fig=None, ax=None,
        figsize=None, dpi=100, barh=True, top_n=None, dropna=False,
        xlabel=None, ylabel=None, show_stats=True,
):
    '''
    Calculate the proportions of the different categories in
    ``categorical_array`` that fall into class "1" (or ``True``) in
    ``two_classes_array``, and optionally show a figure.

    Also, a Pearson's chi-squared test is performed to test the independence
    between ``categorical_array`` and ``two_classes_array``. The chi-squared
    statistics, p-value, and degree-of-freedom are returned.

    Parameters
    ----------
    categorical_array : list, numpy.ndarray, or pandas.Series
        An array of categorical values.
    two_classes_array : list, numpy.ndarray, or pandas.Series
        The target variable containing two classes. Each value in this
        parameter corresponds to a value in ``categorical_array`` (at the
        same index). It must have the same length as ``categorical_array``.
        The second unique value in this parameter will be considered as the
        positive class (for example, "True" in [True, False, True], or "3" in
        [1, 1, 3, 3, 1]).
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    barh : bool
        Whether or not to show the bars as horizontal (otherwise, vertical).
    top_n : int
        Only shows ``top_n`` categories (ranked by their positive rate) in
        the figure. Useful when there are too many categories. If ``None``,
        show all categories.
    dropna : bool
        If ``True``, ignore entries (in both arrays) where there are missing
        values in at least one array. If ``False``, the missing values are
        treated as a new category: "N/A".
    xlabel : str
        X axes label.
    ylabel : str
        Y axes label.
    show_stats : bool
        Whether or not to show the statistical test results (chi2 statistics
        and p-value) on the figure.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    pos_rate : pandas.Series
        The positive rate of each category in ``categorical_array``.
    chi2_results : tuple<float>
        A tuple in the order of (chi2, p_value, degree_of_freedom).
    '''
    import collections
    x = categorical_array
    y = two_classes_array
    if not isinstance(categorical_array, hlp._array_like):
        raise TypeError(
            '`categorical_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if not isinstance(two_classes_array, hlp._array_like):
        raise TypeError(
            '`two_classes_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `categorical_array` and `two_classes_array` must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`categorical_array` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`two_classes_array` must be a 1D numpy array.')
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    y = hlp._upcast_dtype(y)
    if dropna:
        # Compute the validity mask ONCE before filtering either array.
        # (Filtering x first and then re-evaluating the mask would compare
        # the already-shortened x against the full-length y, misaligning
        # the two filtered arrays.)
        valid = pd.notnull(x) & pd.notnull(y)  # input arrays are not changed
        x = x[valid]
        y = y[valid]
    else:
        x = x.fillna('N/A')  # input arrays are not changed
        y = y.fillna('N/A')
    if len(np.unique(y)) != 2:
        raise ValueError('`two_classes_array` should have only two unique values.')
    nr_classes = len(x.unique())  # this is not sorted
    y_classes = list(np.unique(y))  # use numpy's unique() to get sorted classes
    y_pos_index = (y == y_classes[1])  # treat the last class as the positive class
    count_all_classes = collections.Counter(x)
    count_pos_class = collections.Counter(x[y_pos_index])
    pos_rate = pd.Series(count_pos_class) / pd.Series(count_all_classes)
    pos_rate = pos_rate.fillna(0.0)  # keys not in count_pos_class show up as NaN
    observed = pd.crosstab(y, x)
    chi2, p_val, dof, expected = stats.chi2_contingency(observed)
    if not figsize:
        if barh:
            figsize = (5, nr_classes * 0.26)  # 0.26 inch = height for each category
        else:
            figsize = (nr_classes * 0.26, 5)
    if xlabel is None and isinstance(x, pd.Series):
        xlabel = x.name
    if ylabel is None and isinstance(y, pd.Series):
        # Wrap the label onto two lines when the figure is too short.
        char = '\n' if (not barh and figsize[1] <= 1.5) else ' '
        ylabel = 'Positive rate%sof "%s"' % (char, y.name)
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    fig, ax = misc.plot_ranking(
        pos_rate, fig=fig, ax=ax, top_n=top_n, barh=barh,
        score_ax_label=ylabel, name_ax_label=xlabel,
    )
    if show_stats:
        ax.annotate(
            'chi^2=%.2f, p_val=%.2g' % (chi2, p_val), ha='right',
            xy=(0.99, 1.05), xycoords='axes fraction', va='bottom',
        )
    return fig, ax, pos_rate, (chi2, p_val, dof)
#%%============================================================================
def _crosstab_to_arrays(cross_tab):
'''
Helper function. Convert a contingency table to two arrays, which is the
reversed operation of pandas.crosstab().
Parameter
---------
cross_tab : numpy.ndarray or pandas.DataFrame
A contingency table to be converted to two arrays
Returns
-------
x : list<float>
The first output array
y : list<float>
The second output array
'''
if isinstance(cross_tab, (list, pd.Series)):
raise hlp.DimensionError('Please pass a 2D data structure.')
if isinstance(cross_tab,(np.ndarray,pd.DataFrame)) and min(cross_tab.shape)==1:
raise hlp.DimensionError('Please pass a 2D data structure.')
if isinstance(cross_tab, np.ndarray): cross_tab = pd.DataFrame(cross_tab)
factor_1 = list(cross_tab.columns)
factor_2 = list(cross_tab.index)
combinations = itertools.product(factor_1, factor_2)
result = []
for fact_1 ,fact_2 in combinations:
lst = [[fact_2, fact_1]] * cross_tab.loc[fact_2,fact_1]
result.extend(lst)
x, y = list(zip(*result))
return list(x), list(y) # convert tuple into list
#%%============================================================================
def contingency_table(
        array_horizontal, array_vertical, fig=None, ax=None,
        figsize='auto', dpi=100, color_map='auto', xlabel=None,
        ylabel=None, dropna=False, rot=45, normalize=True,
        symm_cbar=True, show_stats=True,
):
    '''
    Calculate and visualize the contingency table from two categorical arrays.
    Also perform a Pearson's chi-squared test to evaluate whether the two
    arrays are independent.

    Parameters
    ----------
    array_horizontal : list, numpy.ndarray, or pandas.Series
        Array to show as the horizontal margin in the contigency table (i.e.,
        its categories are the column headers).
    array_vertical : list, numpy.ndarray, or pandas.Series
        Array to show as the vertical margin in the contigency table (i.e.,
        its categories are the row names).
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    color_map : str or matplotlib.colors.Colormap
        The color scheme specifications. Valid names are listed in
        https://matplotlib.org/users/colormaps.html.
        If relative_color is True, use diverging color maps (e.g., PiYG,
        PRGn, BrBG, PuOr, RdGy, RdBu, RdYlBu, RdYlGn, Spectral, coolwarm,
        bwr, seismic). Otherwise, use sequential color maps (e.g., viridis,
        jet).
    xlabel : str
        The label for the horizontal axis. If ``None`` and
        ``array_horizontal`` is a pandas Series, use the 'name' attribute of
        ``array_horizontal`` as xlabel.
    ylabel : str
        The label for the vertical axis. If ``None`` and ``array_vertical``
        is a pandas Series, use the 'name' attribute of ``array_vertical`` as
        ylabel.
    dropna : bool
        If ``True``, ignore entries (in both arrays) where there are missing
        values in at least one array. If ``False``, the missing values are
        treated as a new category: "N/A".
    rot : float or 'vertical' or 'horizontal'
        The rotation of the x axis labels (in degrees).
    normalize : bool
        If ``True``, plot the contingency table as the relative difference
        between the observed and the expected (i.e., (obs. - exp.)/exp. ).
        If ``False``, plot the original "observed frequency".
    symm_cbar : bool
        If ``True``, the limits of the color bar are symmetric. Otherwise,
        the limits are the natural minimum/maximum of the table to be
        plotted. It has no effect if "normalize" is set to ``False``.
    show_stats : bool
        Whether or not to show the statistical test results (chi2 statistics
        and p-value) on the figure.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    tables : tuple<pandas.DataFrame>
        A tuple in the order of (observed, expected, relative_diff).
    chi2_results : tuple<float>
        A tuple in the order of (chi2, p_value, degree_of_freedom).
    correlation_metrics : tuple<float>
        A tuple in the order of (phi coef., coeff. of contingency,
        Cramer's V).
    '''
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    x = array_horizontal
    y = array_vertical
    if not isinstance(x, hlp._array_like):
        raise TypeError(
            'The input `array_horizontal` must be pandas.Series, '
            'numpy.ndarray, or list.'
        )
    if not isinstance(y, hlp._array_like):
        raise TypeError(
            'The input `array_vertical` must be pandas.Series, '
            'numpy.array, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `array_horizontal` and `array_vertical` '
            'must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`array_horizontal` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`array_vertical` must be a 1D numpy array.')
    if xlabel is None and isinstance(x, pd.Series):
        xlabel = x.name
    if ylabel is None and isinstance(y, pd.Series):
        ylabel = y.name
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    y = hlp._upcast_dtype(y)
    if not dropna:  # keep missing values: replace them with actual string "N/A"
        x = x.fillna('N/A')  # this is to avoid changing the input arrays
        y = y.fillna('N/A')
    # use at least one numpy array to avoid possible index matching errors
    observed = pd.crosstab(np.array(y), x)
    chi2, p_val, dof, expected = stats.chi2_contingency(observed)
    expected = pd.DataFrame(
        expected, index=observed.index, columns=observed.columns,
    )
    relative_diff = (observed - expected) / expected
    if figsize == 'auto':
        # NOTE(review): observed.shape is (n_rows, n_cols) while matplotlib
        # figsize is (width, height) -- confirm this ordering is intended.
        figsize = observed.shape
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    table = relative_diff if normalize else observed
    peak = max(abs(table.min().min()), abs(table.max().max()))
    max_val = table.max().max()
    min_val = table.min().min()
    if color_map == 'auto':
        # diverging map for signed relative differences; sequential otherwise
        color_map = 'RdBu_r' if normalize else 'viridis'
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="8%", pad=0.08)
    if normalize:
        if symm_cbar:
            if peak <= 1:
                peak = 1.0  # still set color bar limits to [-1.0, 1.0]
            norm = hlp._MidpointNormalize(midpoint=0.0, vmin=-peak, vmax=peak)
        else:  # limits of color bar are the natural minimum/maximum of "table"
            norm = hlp._MidpointNormalize(midpoint=0.0, vmin=min_val, vmax=max_val)
    else:
        norm = None  # no need to set midpoint of color bar
    im = ax.matshow(table, cmap=color_map, norm=norm)
    cb = fig.colorbar(im, cax=cax)  # 'cb' is a Colorbar instance
    if normalize:
        cb.set_label('(Obs$-$Exp)/Exp')
    else:
        cb.set_label('Observed freq.')
    ax.set_xticks(range(table.shape[1]))
    ax.set_yticks(range(table.shape[0]))
    ha = 'center' if (0 <= rot < 30 or rot == 90) else 'left'
    ax.set_xticklabels(table.columns, rotation=rot, ha=ha)
    ax.set_yticklabels(table.index)
    fmt = '.2f' if normalize else 'd'
    if normalize:
        # White text on strongly colored cells, black otherwise.
        def text_color(val):
            return 'white' if abs(val) > peak / 2.0 else 'black'
    else:
        lo_3 = min_val + (max_val - min_val) / 3.0  # lower-third boundary
        up_3 = max_val - (max_val - min_val) / 3.0  # upper-third boundary

        def text_color(val):
            return 'k' if val > up_3 else ('y' if val > lo_3 else 'w')
    for i, j in itertools.product(range(table.shape[0]), range(table.shape[1])):
        ax.text(
            j, i, format(table.iloc[i, j], fmt), ha="center", va='center',
            fontsize=9, color=text_color(table.iloc[i, j]),
        )
    if xlabel:
        ax.xaxis.set_label_position('top')
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.yaxis.set_label_position('left')
        ax.set_ylabel(ylabel)
    tables = (observed, expected, relative_diff)
    chi2_results = (chi2, p_val, dof)
    phi = np.sqrt(chi2 / len(x))  # https://en.wikipedia.org/wiki/Phi_coefficient
    cc = np.sqrt(chi2 / (chi2 + len(x)))  # http://www.statisticshowto.com/contingency-coefficient/
    R, C = table.shape
    V = np.sqrt(phi**2. / min(C - 1, R - 1))  # https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    correlation_metrics = (phi, cc, V)
    if show_stats:
        # Raw strings: \chi, \phi are LaTeX commands, not string escapes.
        ax.annotate(
            r'$\chi^2$=%.2f, p_val=%.2g' '\n'
            r'$\phi$=%.2g, CoC=%.2g, V=%.2g' % (chi2, p_val, phi, cc, V),
            ha='center', xy=(0.5, -0.09), xycoords='axes fraction',
            va='top',
        )
    return fig, ax, tables, chi2_results, correlation_metrics
#%%============================================================================
def scatter_plot_two_cols(
        X, two_columns, fig=None, ax=None,
        figsize=(3, 3), dpi=100, alpha=0.5, color=None,
        grid_on=True, logx=False, logy=False,
):
    '''
    Produce scatter plots of two of the columns in ``X`` (the data matrix).
    The correlation between the two columns is shown on top of the plot.

    Parameters
    ----------
    X : pandas.DataFrame
        The dataset. Currently only supports pandas dataframe.
    two_columns : [str, str] or [int, int]
        The names or indices of the two columns within ``X``. Must be a list
        of length 2. The elements must either be both integers, or both
        strings.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    alpha : float
        Opacity of the scatter points.
    color : str, list<float>, tuple<float>, or ``None``
        Color of the scatter points. If ``None``, default matplotlib color
        palette will be used.
    grid_on : bool
        Whether or not to show grids on the plot.
    logx : bool
        Whether or not to show the X axis in log scale.
    logy : bool
        Whether or not to show the Y axis in log scale.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    '''
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    if not isinstance(X, pd.DataFrame):
        raise TypeError('`X` must be a pandas DataFrame.')
    if not isinstance(two_columns, list):
        raise TypeError('`two_columns` must be a list of length 2.')
    if len(two_columns) != 2:
        raise hlp.LengthError('Length of `two_columns` must be 2.')
    if isinstance(two_columns[0], str):
        x = X[two_columns[0]]
        xlabel = two_columns[0]
    elif isinstance(two_columns[0], (int, np.integer)):
        x = X.iloc[:, two_columns[0]]
        xlabel = X.columns[two_columns[0]]
    else:
        raise TypeError('`two_columns` must be a list of str or int.')
    if isinstance(two_columns[1], str):
        y = X[two_columns[1]]
        ylabel = two_columns[1]
    elif isinstance(two_columns[1], (int, np.integer)):
        y = X.iloc[:, two_columns[1]]
        ylabel = X.columns[two_columns[1]]
    else:
        raise TypeError('`two_columns` must be a list of str or int.')
    x = np.array(x)  # convert to numpy array so that x[ind] runs correctly
    y = np.array(y)
    # np.isnan() raises TypeError on non-numeric dtypes -- surface that as a
    # clearer error about which column could not be cast.
    try:
        nan_index_in_x = np.where(np.isnan(x))[0]
    except TypeError:
        raise TypeError('Cannot cast the first column safely into numerical types.')
    try:
        nan_index_in_y = np.where(np.isnan(y))[0]
    except TypeError:
        raise TypeError('Cannot cast the second column safely into numerical types.')
    # Compute the correlation only on rows where both columns are non-NaN.
    nan_index = set(nan_index_in_x) | set(nan_index_in_y)
    not_nan_index = list(set(range(len(x))) - nan_index)
    _, _, r_value, _, _ = stats.linregress(x[not_nan_index], y[not_nan_index])
    ax.scatter(x, y, alpha=alpha, color=color)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    ax.set_title('$r$ = %.2f' % r_value)
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    if grid_on:
        ax.grid(ls=':', lw=0.5)
        ax.set_axisbelow(True)
    return fig, ax
#%%============================================================================
def bin_and_mean(
        xdata, ydata, bins=10, distribution='normal', show_fig=True,
        fig=None, ax=None, figsize=None, dpi=100, show_bins=True,
        raw_data_label='raw data', mean_data_label='average',
        xlabel=None, ylabel=None, logx=False, logy=False, grid_on=True,
        error_bounds=True, err_bound_type='shade', legend_on=True,
        subsamp_thres=None, show_stats=True, show_SE=False,
        err_bound_shade_opacity=0.5,
):
    '''
    Calculate the "bin-and-mean" results and optionally show the
    "bin-and-mean" plot.

    A "bin-and-mean" plot is a more salient way to show the dependency of
    ``ydata`` on ``xdata``. The data points (``xdata``, ``ydata``) are
    divided into different bins according to the values in ``xdata`` (via
    ``bins``), and within each bin, the mean values of x and y are
    calculated, and treated as the representative x and y values.

    "Bin-and-mean" is preferred when data points are highly skewed (e.g.,
    a lot of data points for when x is small, but very few for large x). The
    data points when x is large are usually not noises, and could be even
    more valuable (think of the case where x is earthquake magnitude and y is
    the related economic loss). If we want to study the relationship between
    economic loss and earthquake magnitude, we need to bin-and-mean raw data
    and draw conclusions from the mean data points.

    The theory that enables this method is the assumption that the data
    points with similar x values follow the same distribution. Naively, we
    assume the data points are normally distributed, then y_mean is the
    arithmetic mean of the data points within a bin. We also often assume
    the data points follow log-normal distribution (if we want to assert
    that y values are all positive), then y_mean is the expected value of
    the log-normal distribution, while x_mean for any bins are still just
    the arithmetic mean.

    Notes:
      (1) For log-normal distribution, the expected value of y is:
              E(Y) = exp(mu + (1/2)*sigma^2)
          and the variance is:
              Var(Y) = [exp(sigma^2) - 1] * exp(2*mu + sigma^2)
          where mu and sigma are the two parameters of the distribution.
      (2) Knowing E(Y) and Var(Y), mu and sigma can be back-calculated:
              mu = ln[ E(Y) / sqrt(1 + Var(Y)/E^2(Y)) ]
              sigma = sqrt( ln[ 1 + Var(Y)/E^2(Y) ] )
          (Reference: https://en.wikipedia.org/wiki/Log-normal_distribution)

    Parameters
    ----------
    xdata : list, numpy.ndarray, or pandas.Series
        X data.
    ydata : list, numpy.ndarray, or pandas.Series
        Y data.
    bins : int, list, numpy.ndarray, or pandas.Series
        Number of bins (an integer), or an array representing the actual bin
        edges. If ``bins`` means bin edges, the edges are inclusive on the
        lower bound, e.g., a value 2 shall fall into the bin [2, 3), but not
        the bin [1, 2). Note that the binning is done according to the X
        values.
    distribution : {'normal', 'lognormal'}
        Specifies which distribution the Y values within a bin follow. Use
        'lognormal' if you want to assert all positive Y values. Only
        supports normal and log-normal distributions at this time.
    show_fig : bool
        Whether or not to show a bin-and-mean plot.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    show_bins : bool
        Whether or not to show the bin edges as vertical lines on the plots.
    raw_data_label : str
        The label name of the raw data to be shown in the legend (such as
        "raw data"). It has no effects if ``show_legend`` is ``False``.
    mean_data_label : str
        The label name of the mean data to be shown in the legend (such as
        "averaged data"). It has no effects if ``show_legend`` is ``False``.
    xlabel : str or ``None``
        X axis label. If ``None`` and ``xdata`` is a pandas Series, use
        ``xdata``'s "name" attribute as ``xlabel``.
    ylabel : str or ``None``
        Y axis label. If ``None`` and ``ydata`` is a pandas Series, use
        ``ydata``'s "name" attribute as ``ylabel``.
    logx : bool
        Whether or not to show the X axis in log scale.
    logy : bool
        Whether or not to show the Y axis in log scale.
    grid_on : bool
        Whether or not to show grids on the plot.
    error_bounds : bool
        Whether or not to show error bounds of each bin.
    err_bound_type : {'shade', 'bar'}
        Type of error bound: shaded area or error bars. It has no effects if
        error_bounds is set to ``False``.
    legend_on : bool
        Whether or not to show a legend.
    subsamp_thres : int
        A positive integer that defines the number of data points in each bin
        to show in the scatter plot. The smaller this number, the faster the
        plotting process. If larger than the number of data points in a bin,
        then all data points from that bin are plotted. If ``None``, then all
        data points from all bins are plotted.
    show_stats : bool
        Whether or not to show R^2 scores, correlation coefficients of the
        raw data and the binned averages on the plot.
    show_SE : bool
        If ``True``, show the standard error of y_mean (orange dots) of each
        bin as the shaded area beneath the mean value lines. If ``False``,
        show the standard deviation of raw Y values (gray dots) within each
        bin.
    err_bound_shade_opacity : float
        The opacity of the shaded area representing the error bound. 0 means
        completely transparent, and 1 means completely opaque. It has no
        effect if ``err_bound_type`` is ``'bar'``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
        ``None``, if ``show_fig`` is set to ``False``.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
        ``None``, if ``show_fig`` is set to ``False``.
    x_mean : numpy.ndarray
        Mean X values of each data bin (in terms of X values).
    y_mean : numpy.ndarray
        Mean Y values of each data bin (in terms of X values).
    y_std : numpy.ndarray
        Standard deviation of Y values of each data bin (in terms of X
        values).
    y_SE : numpy.ndarray
        Standard error of ``y_mean``. It describes how far ``y_mean`` is from
        the population mean (or the "true mean value") within each bin, which
        is a different concept from ``y_std``.
        See https://en.wikipedia.org/wiki/Standard_error#Standard_error_of_mean_versus_standard_deviation
        for further information.
    stats_ : tuple<float>
        A tuple in the order of (r2_score_raw, corr_coeff_raw,
        r2_score_binned, corr_coeff_binned), which are the R^2 score and
        correlation coefficient of the raw data (``xdata`` and ``ydata``)
        and the binned averages (``x_mean`` and ``y_mean``).
    '''
    if not isinstance(xdata, hlp._array_like) or not isinstance(ydata, hlp._array_like):
        raise TypeError(
            '`xdata` and `ydata` must be lists, numpy arrays, or pandas Series.'
        )
    if len(xdata) != len(ydata):
        raise hlp.LengthError('`xdata` and `ydata` must have the same length.')
    if isinstance(xdata, list):
        xdata = np.array(xdata)  # otherwise boolean indexing won't work
    if isinstance(ydata, list):
        ydata = np.array(ydata)
    #------------Pre-process "bins"--------------------------------------------
    if isinstance(bins, (int, np.integer)):  # user specifies number of bins
        if bins <= 0:
            raise ValueError('`bins` must be a positive integer.')
        nr = bins + 1  # number of bin edges
        # Place bin edges at evenly spaced percentiles of the unique x values.
        x_uni = np.unique(xdata)
        bins = [np.nanpercentile(x_uni, (j + 0.) / (nr - 1) * 100) for j in range(nr)]
        # https://stackoverflow.com/a/4983359/8892243
        if not all(lo <= hi for lo, hi in zip(bins, bins[1:])):
            print(
                '\nWARNING: Resulting "bins" array is not monotonically '
                'increasing. Please use a smaller "bins" to avoid potential '
                'issues.\n'
            )
    elif isinstance(bins, (list, np.ndarray)):  # user specifies bin edges
        nr = len(bins)
    else:
        raise TypeError('`bins` must be either an integer or an array.')
    #-----------Pre-process xlabel and ylabel----------------------------------
    if not xlabel and isinstance(xdata, pd.Series):  # xdata has 'name' attr
        xlabel = xdata.name
    if not ylabel and isinstance(ydata, pd.Series):  # ydata has 'name' attr
        ylabel = ydata.name
    #-----------Validate subsamp_thres (hoisted out of the bin loop)-----------
    if subsamp_thres is not None and show_fig:
        if not isinstance(subsamp_thres, (int, np.integer)) or subsamp_thres <= 0:
            raise TypeError('`subsamp_thres` must be a positive integer or None.')
    #-----------Group data into bins-------------------------------------------
    inds = np.digitize(xdata, bins)
    x_mean = np.zeros(nr - 1)
    y_mean = np.zeros(nr - 1)
    y_std = np.zeros(nr - 1)
    y_SE = np.zeros(nr - 1)
    x_subs = []  # subsampled x data (for faster scatter plots)
    y_subs = []
    for j in range(nr - 1):  # loop over every bin
        x_in_bin = xdata[inds == j + 1]
        y_in_bin = ydata[inds == j + 1]
        #------------Calculate mean and std------------------------------------
        if len(x_in_bin) == 0:  # no point falls into current bin
            x_mean[j] = np.nan  # this is to prevent numpy from throwing...
            y_mean[j] = np.nan  # ...confusing warning messages
            y_std[j] = np.nan
            y_SE[j] = np.nan
        else:
            x_mean[j] = np.nanmean(x_in_bin)
            if distribution == 'normal':
                y_mean[j] = np.nanmean(y_in_bin)
                y_std[j] = np.nanstd(y_in_bin)
                y_SE[j] = stats.sem(y_in_bin)
            elif distribution == 'lognormal':
                s, loc, scale = stats.lognorm.fit(y_in_bin, floc=0)
                estimated_mu = np.log(scale)
                estimated_sigma = s
                y_mean[j] = np.exp(estimated_mu + estimated_sigma**2.0 / 2.0)
                y_std[j] = np.sqrt(
                    np.exp(2. * estimated_mu + estimated_sigma**2.)
                    * (np.exp(estimated_sigma**2.) - 1)
                )
                y_SE[j] = y_std[j] / np.sqrt(len(y_in_bin))
            else:
                raise ValueError(
                    "Valid values of `distribution` are "
                    "{'normal', 'lognormal'}. Not '%s'." % distribution
                )
        #------------Pick subsets of data, for faster plotting-----------------
        #------------Note that this does not affect mean and std---------------
        if subsamp_thres is not None and show_fig:
            if len(x_in_bin) > subsamp_thres:
                # Draw ONE set of random indices so the subsampled x and y
                # values stay paired. (Sampling x and y with two independent
                # np.random.choice() calls would scatter mismatched pairs.)
                rand_idx = np.random.choice(
                    len(x_in_bin), subsamp_thres, replace=False,
                )
                x_subs.extend(np.asarray(x_in_bin)[rand_idx])
                y_subs.extend(np.asarray(y_in_bin)[rand_idx])
            else:
                x_subs.extend(x_in_bin)
                y_subs.extend(y_in_bin)
    #-------------Calculate R^2 and corr. coeff.-------------------------------
    non_nan_indices = ~np.isnan(xdata) & ~np.isnan(ydata)
    xdata_without_nan = xdata[non_nan_indices]
    ydata_without_nan = ydata[non_nan_indices]
    r2_score_raw = hlp._calc_r2_score(ydata_without_nan, xdata_without_nan)  # treat "xdata" as "y_pred"
    corr_coeff_raw = np.corrcoef(xdata_without_nan, ydata_without_nan)[0, 1]
    r2_score_binned = hlp._calc_r2_score(y_mean, x_mean)
    corr_coeff_binned = np.corrcoef(x_mean, y_mean)[0, 1]
    stats_ = (r2_score_raw, corr_coeff_raw, r2_score_binned, corr_coeff_binned)
    #-------------Plot data on figure------------------------------------------
    if show_fig:
        fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
        if subsamp_thres:
            xdata, ydata = x_subs, y_subs
        ax.scatter(xdata, ydata, c='gray', alpha=0.3, label=raw_data_label, zorder=1)
        if error_bounds:
            if err_bound_type == 'shade':
                ax.plot(
                    x_mean, y_mean, '-o', c='orange', lw=2,
                    label=mean_data_label, zorder=3,
                )
                if show_SE:
                    ax.fill_between(
                        x_mean, y_mean + y_SE, y_mean - y_SE,
                        label=r'$\pm$ S.E.', facecolor='orange',
                        alpha=err_bound_shade_opacity, zorder=2.5,
                    )
                else:
                    ax.fill_between(
                        x_mean, y_mean + y_std, y_mean - y_std,
                        label=r'$\pm$ std', facecolor='orange',
                        alpha=err_bound_shade_opacity, zorder=2.5,
                    )
            elif err_bound_type == 'bar':
                if show_SE:
                    mean_data_label += r'$\pm$ S.E.'
                    ax.errorbar(
                        x_mean, y_mean, yerr=y_SE, ls='-', marker='o',
                        c='orange', lw=2, elinewidth=1, capsize=2,
                        label=mean_data_label, zorder=3,
                    )
                else:
                    mean_data_label += r'$\pm$ std'
                    ax.errorbar(
                        x_mean, y_mean, yerr=y_std, ls='-', marker='o',
                        c='orange', lw=2, elinewidth=1, capsize=2,
                        label=mean_data_label, zorder=3,
                    )
            else:
                raise ValueError(
                    'Valid "err_bound_type" names are {"shade", '
                    '"bar"}, not "%s".' % err_bound_type
                )
        else:
            ax.plot(
                x_mean, y_mean, '-o', c='orange', lw=2, label=mean_data_label,
                zorder=3,
            )
        ax.set_axisbelow(True)
        if xlabel:
            ax.set_xlabel(xlabel)
        if ylabel:
            ax.set_ylabel(ylabel)
        if logx:
            ax.set_xscale('log')
        if logy:
            ax.set_yscale('log')
        if grid_on:
            ax.grid(ls=':')
            ax.set_axisbelow(True)
        if show_bins:
            ylims = ax.get_ylim()
            for k, edge in enumerate(bins):
                lab_ = 'bin edges' if k == 0 else None  # only label 1st edge
                ec = cl.get_colors(N=1)[0]
                ax.plot([edge] * 2, ylims, '--', c=ec, lw=1.0, zorder=2, label=lab_)
        if legend_on:
            ax.legend(loc='best')
        if show_stats:
            # Raw strings: \mathrm is a LaTeX command, not a string escape.
            stats_text = (
                r"$R^2_{\mathrm{raw}}$=%.2f, $r_{\mathrm{raw}}$=%.2f, "
                r"$R^2_{\mathrm{avg}}$=%.2f, "
                r"$r_{\mathrm{avg}}$=%.2f" % stats_
            )
            ax.set_title(stats_text)
        return fig, ax, x_mean, y_mean, y_std, y_SE, stats_
    else:
        return None, None, x_mean, y_mean, y_std, y_SE, stats_