# -*- coding: utf-8 -*-
import itertools
import numpy as np
import pandas as pd
from scipy import stats
from . import misc
from . import helper as hlp
from . import multiple_columns as mc
from . import colors_and_lines as cl
#%%============================================================================
def category_means(
        categorical_array, continuous_array, fig=None, ax=None,
        figsize=None, dpi=100, title=None, xlabel=None, ylabel=None,
        rot=0, dropna=False, show_stats=True, sort_by='name',
        vert=True, plot_violins=True, **extra_kwargs,
):
    '''
    Summarize the mean values of entries of ``continuous_array`` corresponding
    to each distinct category in ``categorical_array``, and show a violin plot
    to visualize it. The violin plot will show the distribution of values in
    ``continuous_array`` corresponding to each category in
    ``categorical_array``.

    Also, a one-way ANOVA test (H0: different categories in
    ``categorical_array`` yield the same average values in
    ``continuous_array``) is performed, and F statistics and p-value are
    returned.

    Parameters
    ----------
    categorical_array : list, numpy.ndarray, or pandas.Series
        A vector of categorical values.
    continuous_array : list, numpy.ndarray, or pandas.Series
        The target variable whose values correspond to the values in
        ``categorical_array``. Must have the same length as
        ``categorical_array``. It is natural that it contains continuous
        values, but if it contains categorical values (expressed as integers,
        not strings), this function should also work.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    title : str
        The title of the violin plot, usually the name of
        ``categorical_array``.
    xlabel : str
        The label for the x axis (i.e., categories) of the violin plot. If
        ``None`` and ``categorical_array`` is a pandas Series, use the 'name'
        attribute of ``categorical_array`` as xlabel.
    ylabel : str
        The label for the y axis (i.e., average ``continuous_array`` values)
        of the violin plot. If ``None`` and ``continuous_array`` is a pandas
        Series, use the 'name' attribute of ``continuous_array`` as ylabel.
    rot : float
        The rotation (in degrees) of the x axis labels.
        NOTE(review): this parameter is currently not forwarded to the
        underlying plotting helpers -- confirm whether it should take effect.
    dropna : bool
        Whether or not to exclude N/A records in the data.
    show_stats : bool
        Whether or not to show the statistical test results (F statistics
        and p-value) on the figure.
    sort_by : {'name', 'mean', 'median', None}
        Option to arrange the different categories in ``categorical_array`` in
        the violin plot. ``None`` means no sorting, i.e., using the hashed
        order of the category names; 'mean' and 'median' mean sorting the
        violins according to the mean/median values of each category; 'name'
        means sorting the violins according to the category names.
    vert : bool
        Whether to show the violins as vertical.
    plot_violins : bool
        If ``True``, use violin plots to illustrate the distribution of
        groups. Otherwise, use multi-histogram (hist_multi()).
    **extra_kwargs :
        Keyword arguments to be passed to plt.violinplot() or hist_multi().
        (https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.violinplot.html)
        Note that this subroutine overrides the default behavior of
        violinplot: showmeans is overriden to True and showextrema to False.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    mean_values : dict
        A dictionary whose keys are the categories in ``categorical_array``,
        and whose values are the corresponding mean values in
        ``continuous_array``.
    F_test_result : tuple<float>
        A tuple in the order of (F_stat, p_value), where F_stat is the
        computed F-value of the one-way ANOVA test, and p_value is the
        associated p-value from the F-distribution.
    '''
    x = categorical_array
    y = continuous_array
    if not isinstance(x, hlp._array_like):
        raise TypeError(
            '`categorical_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if not isinstance(y, hlp._array_like):
        raise TypeError(
            '`continuous_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `categorical_array` and `continuous_array` must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`categorical_array` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`continuous_array` must be a 1D numpy array.')
    # Infer missing axis labels from pandas Series names. When the violins
    # are horizontal (vert=False) the two axes swap roles, hence the swapped
    # names. getattr() guards against the companion array not being a pandas
    # Series (plain lists/ndarrays have no "name" attribute).
    if not xlabel and isinstance(x, pd.Series):
        xlabel = x.name if vert else getattr(y, 'name', None)
    if not ylabel and isinstance(y, pd.Series):
        ylabel = y.name if vert else getattr(x, 'name', None)
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    if not dropna:
        x = x.fillna('N/A')  # input arrays are unchanged
    x_classes = x.unique()
    x_classes_copy = list(x_classes.copy())
    y_values = []  # each element in y_values represents the values of a category
    mean_values = {}  # each entry in the dict corresponds to a category
    for cat in x_classes:
        cat_index = (x == cat)
        y_cat = y[cat_index]
        mean_values[cat] = y_cat.mean(skipna=True)
        if not y_cat.isnull().all():
            # convert to list to avoid 'reshape' deprecation warning
            y_values.append(list(y_cat[np.isfinite(y_cat)]))
        else:  # all the y values in the current category are NaN
            print('*****WARNING: category %s contains only NaN values.*****' % str(cat))
            x_classes_copy.remove(cat)
    F_stat, p_value = stats.f_oneway(*y_values)  # pass every group into f_oneway()
    if plot_violins:
        # Override default behavior of violinplot: hide extrema, show means.
        extra_kwargs.setdefault('showextrema', False)
        extra_kwargs.setdefault('showmeans', True)
    data_names = [str(_) for _ in x_classes_copy]
    if plot_violins:
        fig, ax = mc.violin_plot(
            y_values, fig=fig, ax=ax, figsize=figsize,
            dpi=dpi, data_names=data_names,
            sort_by=sort_by, vert=vert, **extra_kwargs,
        )
    else:
        fig, ax = mc.hist_multi(
            y_values, bins='auto',
            fig=fig, ax=ax, figsize=figsize, dpi=dpi,
            data_names=data_names, sort_by=sort_by, vert=vert,
            show_legend=False, **extra_kwargs,
        )
    if show_stats:
        # Anchor the annotation top-left for vertical violins, top-right
        # for horizontal ones, so it stays clear of the plotted data.
        ha = 'left' if vert else 'right'
        xy = (0.05, 0.92) if vert else (0.95, 0.92)
        ax.annotate(
            'F=%.2f, p_val=%.2g' % (F_stat, p_value), ha=ha,
            xy=xy, xycoords='axes fraction',
        )
    if title:
        ax.set_title(title)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    return fig, ax, mean_values, (F_stat, p_value)
#%%============================================================================
def positive_rate(
        categorical_array, two_classes_array, fig=None, ax=None,
        figsize=None, dpi=100, barh=True, top_n=None, dropna=False,
        xlabel=None, ylabel=None, show_stats=True,
):
    '''
    Calculate the proportions of the different categories in
    ``categorical_array`` that fall into class "1" (or ``True``) in
    ``two_classes_array``, and optionally show a figure.

    Also, a Pearson's chi-squared test is performed to test the independence
    between ``categorical_array`` and ``two_classes_array``. The chi-squared
    statistics, p-value, and degree-of-freedom are returned.

    Parameters
    ----------
    categorical_array : list, numpy.ndarray, or pandas.Series
        An array of categorical values.
    two_classes_array : list, numpy.ndarray, or pandas.Series
        The target variable containing two classes. Each value in this
        parameter corresponds to a value in ``categorical_array`` (at the
        same index). It must have the same length as ``categorical_array``.
        The second unique value in this parameter will be considered as the
        positive class (for example, "True" in [True, False, True], or "3" in
        [1, 1, 3, 3, 1]).
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    barh : bool
        Whether or not to show the bars as horizontal (otherwise, vertical).
    top_n : int
        Only shows ``top_n`` categories (ranked by their positive rate) in
        the figure. Useful when there are too many categories. If ``None``,
        show all categories.
    dropna : bool
        If ``True``, ignore entries (in both arrays) where there are missing
        values in at least one array. If ``False``, the missing values are
        treated as a new category: "N/A".
    xlabel : str
        X axes label.
    ylabel : str
        Y axes label.
    show_stats : bool
        Whether or not to show the statistical test results (chi2 statistics
        and p-value) on the figure.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    pos_rate : pandas.Series
        The positive rate of each category in ``categorical_array``.
    chi2_results : tuple<float>
        A tuple in the order of (chi2, p_value, degree_of_freedom).
    '''
    import collections
    x = categorical_array
    y = two_classes_array
    if not isinstance(categorical_array, hlp._array_like):
        raise TypeError(
            '`categorical_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if not isinstance(two_classes_array, hlp._array_like):
        raise TypeError(
            '`two_classes_array` must be pandas.Series, numpy.ndarray, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `categorical_array` and `two_classes_array` must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`categorical_array` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`two_classes_array` must be a 1D numpy array.')
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    y = hlp._upcast_dtype(y)
    if dropna:
        # Compute the validity mask ONCE before filtering either array.
        # (Filtering x first and then re-evaluating the mask would compare
        # the already-shortened x against the full-length y, misaligning
        # the two filtered arrays.)
        valid = pd.notnull(x) & pd.notnull(y)  # input arrays are not changed
        x = x[valid]
        y = y[valid]
    else:
        x = x.fillna('N/A')  # input arrays are not changed
        y = y.fillna('N/A')
    if len(np.unique(y)) != 2:
        raise ValueError('`two_classes_array` should have only two unique values.')
    nr_classes = len(x.unique())  # this is not sorted
    y_classes = list(np.unique(y))  # use numpy's unique() to get sorted classes
    y_pos_index = (y == y_classes[1])  # treat the last class as the positive class
    count_all_classes = collections.Counter(x)
    count_pos_class = collections.Counter(x[y_pos_index])
    pos_rate = pd.Series(count_pos_class) / pd.Series(count_all_classes)
    pos_rate = pos_rate.fillna(0.0)  # keys not in count_pos_class show up as NaN
    observed = pd.crosstab(y, x)
    chi2, p_val, dof, expected = stats.chi2_contingency(observed)
    if not figsize:
        if barh:
            figsize = (5, nr_classes * 0.26)  # 0.26 inch = height for each category
        else:
            figsize = (nr_classes * 0.26, 5)
    if xlabel is None and isinstance(x, pd.Series):
        xlabel = x.name
    if ylabel is None and isinstance(y, pd.Series):
        # Wrap the label onto two lines when the figure is too short.
        char = '\n' if (not barh and figsize[1] <= 1.5) else ' '
        ylabel = 'Positive rate%sof "%s"' % (char, y.name)
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    fig, ax = misc.plot_ranking(
        pos_rate, fig=fig, ax=ax, top_n=top_n, barh=barh,
        score_ax_label=ylabel, name_ax_label=xlabel,
    )
    if show_stats:
        ax.annotate(
            'chi^2=%.2f, p_val=%.2g' % (chi2, p_val), ha='right',
            xy=(0.99, 1.05), xycoords='axes fraction', va='bottom',
        )
    return fig, ax, pos_rate, (chi2, p_val, dof)
#%%============================================================================
def _crosstab_to_arrays(cross_tab):
'''
Helper function. Convert a contingency table to two arrays, which is the
reversed operation of pandas.crosstab().
Parameter
---------
cross_tab : numpy.ndarray or pandas.DataFrame
A contingency table to be converted to two arrays
Returns
-------
x : list<float>
The first output array
y : list<float>
The second output array
'''
if isinstance(cross_tab, (list, pd.Series)):
raise hlp.DimensionError('Please pass a 2D data structure.')
if isinstance(cross_tab,(np.ndarray,pd.DataFrame)) and min(cross_tab.shape)==1:
raise hlp.DimensionError('Please pass a 2D data structure.')
if isinstance(cross_tab, np.ndarray): cross_tab = pd.DataFrame(cross_tab)
factor_1 = list(cross_tab.columns)
factor_2 = list(cross_tab.index)
combinations = itertools.product(factor_1, factor_2)
result = []
for fact_1 ,fact_2 in combinations:
lst = [[fact_2, fact_1]] * cross_tab.loc[fact_2,fact_1]
result.extend(lst)
x, y = list(zip(*result))
return list(x), list(y) # convert tuple into list
#%%============================================================================
def contingency_table(
        array_horizontal, array_vertical, fig=None, ax=None,
        figsize='auto', dpi=100, color_map='auto', xlabel=None,
        ylabel=None, dropna=False, rot=45, normalize=True,
        symm_cbar=True, show_stats=True,
):
    '''
    Calculate and visualize the contingency table from two categorical arrays.
    Also perform a Pearson's chi-squared test to evaluate whether the two
    arrays are independent.

    Parameters
    ----------
    array_horizontal : list, numpy.ndarray, or pandas.Series
        Array to show as the horizontal margin in the contigency table (i.e.,
        its categories are the column headers).
    array_vertical : list, numpy.ndarray, or pandas.Series
        Array to show as the vertical margin in the contigency table (i.e.,
        its categories are the row names).
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    color_map : str or matplotlib.colors.Colormap
        The color scheme specifications. Valid names are listed in
        https://matplotlib.org/users/colormaps.html.
        If relative_color is True, use diverging color maps (e.g., PiYG,
        PRGn, BrBG, PuOr, RdGy, RdBu, RdYlBu, RdYlGn, Spectral, coolwarm,
        bwr, seismic). Otherwise, use sequential color maps (e.g., viridis,
        jet).
    xlabel : str
        The label for the horizontal axis. If ``None`` and
        ``array_horizontal`` is a pandas Series, use the 'name' attribute of
        ``array_horizontal`` as xlabel.
    ylabel : str
        The label for the vertical axis. If ``None`` and ``array_vertical``
        is a pandas Series, use the 'name' attribute of ``array_vertical`` as
        ylabel.
    dropna : bool
        If ``True``, ignore entries (in both arrays) where there are missing
        values in at least one array. If ``False``, the missing values are
        treated as a new category: "N/A".
    rot : float or 'vertical' or 'horizontal'
        The rotation of the x axis labels (in degrees).
    normalize : bool
        If ``True``, plot the contingency table as the relative difference
        between the observed and the expected (i.e., (obs. - exp.)/exp. ).
        If ``False``, plot the original "observed frequency".
    symm_cbar : bool
        If ``True``, the limits of the color bar are symmetric. Otherwise,
        the limits are the natural minimum/maximum of the table to be
        plotted. It has no effect if "normalize" is set to ``False``.
    show_stats : bool
        Whether or not to show the statistical test results (chi2 statistics
        and p-value) on the figure.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    tables : tuple<pandas.DataFrame>
        A tuple in the order of (observed, expected, relative_diff).
    chi2_results : tuple<float>
        A tuple in the order of (chi2, p_value, degree_of_freedom).
    correlation_metrics : tuple<float>
        A tuple in the order of (phi coef., coeff. of contingency,
        Cramer's V).
    '''
    from mpl_toolkits.axes_grid1 import make_axes_locatable
    x = array_horizontal
    y = array_vertical
    if not isinstance(x, hlp._array_like):
        raise TypeError(
            'The input `array_horizontal` must be pandas.Series, '
            'numpy.ndarray, or list.'
        )
    if not isinstance(y, hlp._array_like):
        raise TypeError(
            'The input `array_vertical` must be pandas.Series, '
            'numpy.array, or list.'
        )
    if len(x) != len(y):
        raise hlp.LengthError(
            'Lengths of `array_horizontal` and `array_vertical` '
            'must be the same.'
        )
    if isinstance(x, np.ndarray) and x.ndim > 1:
        raise hlp.DimensionError('`array_horizontal` must be a 1D numpy array.')
    if isinstance(y, np.ndarray) and y.ndim > 1:
        raise hlp.DimensionError('`array_vertical` must be a 1D numpy array.')
    if xlabel is None and isinstance(x, pd.Series):
        xlabel = x.name
    if ylabel is None and isinstance(y, pd.Series):
        ylabel = y.name
    if isinstance(x, (list, np.ndarray)):
        x = pd.Series(x)
    if isinstance(y, (list, np.ndarray)):
        y = pd.Series(y)
    x = hlp._upcast_dtype(x)
    y = hlp._upcast_dtype(y)
    if not dropna:  # keep missing values: replace them with actual string "N/A"
        x = x.fillna('N/A')  # this is to avoid changing the input arrays
        y = y.fillna('N/A')
    # use at least one numpy array to avoid possible index matching errors
    observed = pd.crosstab(np.array(y), x)
    chi2, p_val, dof, expected = stats.chi2_contingency(observed)
    expected = pd.DataFrame(
        expected, index=observed.index, columns=observed.columns,
    )
    relative_diff = (observed - expected) / expected
    if figsize == 'auto':
        # NOTE(review): observed.shape is (n_rows, n_cols) while matplotlib
        # figsize is (width, height) -- confirm this ordering is intended.
        figsize = observed.shape
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    table = relative_diff if normalize else observed
    peak = max(abs(table.min().min()), abs(table.max().max()))
    max_val = table.max().max()
    min_val = table.min().min()
    if color_map == 'auto':
        # diverging map for signed relative differences; sequential otherwise
        color_map = 'RdBu_r' if normalize else 'viridis'
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="8%", pad=0.08)
    if normalize:
        if symm_cbar:
            if peak <= 1:
                peak = 1.0  # still set color bar limits to [-1.0, 1.0]
            norm = hlp._MidpointNormalize(midpoint=0.0, vmin=-peak, vmax=peak)
        else:  # limits of color bar are the natural minimum/maximum of "table"
            norm = hlp._MidpointNormalize(midpoint=0.0, vmin=min_val, vmax=max_val)
    else:
        norm = None  # no need to set midpoint of color bar
    im = ax.matshow(table, cmap=color_map, norm=norm)
    cb = fig.colorbar(im, cax=cax)  # 'cb' is a Colorbar instance
    if normalize:
        cb.set_label('(Obs$-$Exp)/Exp')
    else:
        cb.set_label('Observed freq.')
    ax.set_xticks(range(table.shape[1]))
    ax.set_yticks(range(table.shape[0]))
    ha = 'center' if (0 <= rot < 30 or rot == 90) else 'left'
    ax.set_xticklabels(table.columns, rotation=rot, ha=ha)
    ax.set_yticklabels(table.index)
    fmt = '.2f' if normalize else 'd'
    if normalize:
        # White text on strongly colored cells, black otherwise.
        def text_color(val):
            return 'white' if abs(val) > peak / 2.0 else 'black'
    else:
        lo_3 = min_val + (max_val - min_val) / 3.0  # lower-third boundary
        up_3 = max_val - (max_val - min_val) / 3.0  # upper-third boundary

        def text_color(val):
            return 'k' if val > up_3 else ('y' if val > lo_3 else 'w')
    for i, j in itertools.product(range(table.shape[0]), range(table.shape[1])):
        ax.text(
            j, i, format(table.iloc[i, j], fmt), ha="center", va='center',
            fontsize=9, color=text_color(table.iloc[i, j]),
        )
    if xlabel:
        ax.xaxis.set_label_position('top')
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.yaxis.set_label_position('left')
        ax.set_ylabel(ylabel)
    tables = (observed, expected, relative_diff)
    chi2_results = (chi2, p_val, dof)
    phi = np.sqrt(chi2 / len(x))  # https://en.wikipedia.org/wiki/Phi_coefficient
    cc = np.sqrt(chi2 / (chi2 + len(x)))  # http://www.statisticshowto.com/contingency-coefficient/
    R, C = table.shape
    V = np.sqrt(phi**2. / min(C - 1, R - 1))  # https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
    correlation_metrics = (phi, cc, V)
    if show_stats:
        # Raw strings: \chi, \phi are LaTeX commands, not string escapes.
        ax.annotate(
            r'$\chi^2$=%.2f, p_val=%.2g' '\n'
            r'$\phi$=%.2g, CoC=%.2g, V=%.2g' % (chi2, p_val, phi, cc, V),
            ha='center', xy=(0.5, -0.09), xycoords='axes fraction',
            va='top',
        )
    return fig, ax, tables, chi2_results, correlation_metrics
#%%============================================================================
def scatter_plot_two_cols(
        X, two_columns, fig=None, ax=None,
        figsize=(3, 3), dpi=100, alpha=0.5, color=None,
        grid_on=True, logx=False, logy=False,
):
    '''
    Produce scatter plots of two of the columns in ``X`` (the data matrix).
    The correlation between the two columns is shown on top of the plot.

    Parameters
    ----------
    X : pandas.DataFrame
        The dataset. Currently only supports pandas dataframe.
    two_columns : [str, str] or [int, int]
        The names or indices of the two columns within ``X``. Must be a list
        of length 2. The elements must either be both integers, or both
        strings.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    alpha : float
        Opacity of the scatter points.
    color : str, list<float>, tuple<float>, or ``None``
        Color of the scatter points. If ``None``, default matplotlib color
        palette will be used.
    grid_on : bool
        Whether or not to show grids on the plot.
    logx : bool
        Whether or not to show the X axis in log scale.
    logy : bool
        Whether or not to show the Y axis in log scale.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
    '''
    fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
    if not isinstance(X, pd.DataFrame):
        raise TypeError('`X` must be a pandas DataFrame.')
    if not isinstance(two_columns, list):
        raise TypeError('`two_columns` must be a list of length 2.')
    if len(two_columns) != 2:
        raise hlp.LengthError('Length of `two_columns` must be 2.')
    if isinstance(two_columns[0], str):
        x = X[two_columns[0]]
        xlabel = two_columns[0]
    elif isinstance(two_columns[0], (int, np.integer)):
        x = X.iloc[:, two_columns[0]]
        xlabel = X.columns[two_columns[0]]
    else:
        raise TypeError('`two_columns` must be a list of str or int.')
    if isinstance(two_columns[1], str):
        y = X[two_columns[1]]
        ylabel = two_columns[1]
    elif isinstance(two_columns[1], (int, np.integer)):
        y = X.iloc[:, two_columns[1]]
        ylabel = X.columns[two_columns[1]]
    else:
        raise TypeError('`two_columns` must be a list of str or int.')
    x = np.array(x)  # convert to numpy array so that x[ind] runs correctly
    y = np.array(y)
    # np.isnan() raises TypeError on non-numeric dtypes -- surface that as a
    # clearer error about which column could not be cast.
    try:
        nan_index_in_x = np.where(np.isnan(x))[0]
    except TypeError:
        raise TypeError('Cannot cast the first column safely into numerical types.')
    try:
        nan_index_in_y = np.where(np.isnan(y))[0]
    except TypeError:
        raise TypeError('Cannot cast the second column safely into numerical types.')
    # Compute the correlation only on rows where both columns are non-NaN.
    nan_index = set(nan_index_in_x) | set(nan_index_in_y)
    not_nan_index = list(set(range(len(x))) - nan_index)
    _, _, r_value, _, _ = stats.linregress(x[not_nan_index], y[not_nan_index])
    ax.scatter(x, y, alpha=alpha, color=color)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    ax.set_title('$r$ = %.2f' % r_value)
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    if grid_on:
        ax.grid(ls=':', lw=0.5)
        ax.set_axisbelow(True)
    return fig, ax
#%%============================================================================
def bin_and_mean(
        xdata, ydata, bins=10, distribution='normal', show_fig=True,
        fig=None, ax=None, figsize=None, dpi=100, show_bins=True,
        raw_data_label='raw data', mean_data_label='average',
        xlabel=None, ylabel=None, logx=False, logy=False, grid_on=True,
        error_bounds=True, err_bound_type='shade', legend_on=True,
        subsamp_thres=None, show_stats=True, show_SE=False,
        err_bound_shade_opacity=0.5,
):
    '''
    Calculate the "bin-and-mean" results and optionally show the
    "bin-and-mean" plot.

    A "bin-and-mean" plot is a more salient way to show the dependency of
    ``ydata`` on ``xdata``. The data points (``xdata``, ``ydata``) are
    divided into different bins according to the values in ``xdata`` (via
    ``bins``), and within each bin, the mean values of x and y are
    calculated, and treated as the representative x and y values.

    "Bin-and-mean" is preferred when data points are highly skewed (e.g.,
    a lot of data points for when x is small, but very few for large x). The
    data points when x is large are usually not noises, and could be even
    more valuable (think of the case where x is earthquake magnitude and y is
    the related economic loss). If we want to study the relationship between
    economic loss and earthquake magnitude, we need to bin-and-mean raw data
    and draw conclusions from the mean data points.

    The theory that enables this method is the assumption that the data
    points with similar x values follow the same distribution. Naively, we
    assume the data points are normally distributed, then y_mean is the
    arithmetic mean of the data points within a bin. We also often assume
    the data points follow log-normal distribution (if we want to assert
    that y values are all positive), then y_mean is the expected value of
    the log-normal distribution, while x_mean for any bins are still just
    the arithmetic mean.

    Notes:
      (1) For log-normal distribution, the expected value of y is:
              E(Y) = exp(mu + (1/2)*sigma^2)
          and the variance is:
              Var(Y) = [exp(sigma^2) - 1] * exp(2*mu + sigma^2)
          where mu and sigma are the two parameters of the distribution.
      (2) Knowing E(Y) and Var(Y), mu and sigma can be back-calculated:
              mu = ln[ E(Y) / sqrt(1 + Var(Y)/E^2(Y)) ]
              sigma = sqrt( ln[ 1 + Var(Y)/E^2(Y) ] )
          (Reference: https://en.wikipedia.org/wiki/Log-normal_distribution)

    Parameters
    ----------
    xdata : list, numpy.ndarray, or pandas.Series
        X data.
    ydata : list, numpy.ndarray, or pandas.Series
        Y data.
    bins : int, list, numpy.ndarray, or pandas.Series
        Number of bins (an integer), or an array representing the actual bin
        edges. If ``bins`` means bin edges, the edges are inclusive on the
        lower bound, e.g., a value 2 shall fall into the bin [2, 3), but not
        the bin [1, 2). Note that the binning is done according to the X
        values.
    distribution : {'normal', 'lognormal'}
        Specifies which distribution the Y values within a bin follow. Use
        'lognormal' if you want to assert all positive Y values. Only
        supports normal and log-normal distributions at this time.
    show_fig : bool
        Whether or not to show a bin-and-mean plot.
    fig : matplotlib.figure.Figure or ``None``
        Figure object. If None, a new figure will be created.
    ax : matplotlib.axes._subplots.AxesSubplot or ``None``
        Axes object. If None, a new axes will be created.
    figsize : (float, float)
        Figure size in inches, as a tuple of two numbers. The figure
        size of ``fig`` (if not ``None``) will override this parameter.
    dpi : float
        Figure resolution. The dpi of ``fig`` (if not ``None``) will override
        this parameter.
    show_bins : bool
        Whether or not to show the bin edges as vertical lines on the plots.
    raw_data_label : str
        The label name of the raw data to be shown in the legend (such as
        "raw data"). It has no effects if ``show_legend`` is ``False``.
    mean_data_label : str
        The label name of the mean data to be shown in the legend (such as
        "averaged data"). It has no effects if ``show_legend`` is ``False``.
    xlabel : str or ``None``
        X axis label. If ``None`` and ``xdata`` is a pandas Series, use
        ``xdata``'s "name" attribute as ``xlabel``.
    ylabel : str or ``None``
        Y axis label. If ``None`` and ``ydata`` is a pandas Series, use
        ``ydata``'s "name" attribute as ``ylabel``.
    logx : bool
        Whether or not to show the X axis in log scale.
    logy : bool
        Whether or not to show the Y axis in log scale.
    grid_on : bool
        Whether or not to show grids on the plot.
    error_bounds : bool
        Whether or not to show error bounds of each bin.
    err_bound_type : {'shade', 'bar'}
        Type of error bound: shaded area or error bars. It has no effects if
        error_bounds is set to ``False``.
    legend_on : bool
        Whether or not to show a legend.
    subsamp_thres : int
        A positive integer that defines the number of data points in each bin
        to show in the scatter plot. The smaller this number, the faster the
        plotting process. If larger than the number of data points in a bin,
        then all data points from that bin are plotted. If ``None``, then all
        data points from all bins are plotted.
    show_stats : bool
        Whether or not to show R^2 scores, correlation coefficients of the
        raw data and the binned averages on the plot.
    show_SE : bool
        If ``True``, show the standard error of y_mean (orange dots) of each
        bin as the shaded area beneath the mean value lines. If ``False``,
        show the standard deviation of raw Y values (gray dots) within each
        bin.
    err_bound_shade_opacity : float
        The opacity of the shaded area representing the error bound. 0 means
        completely transparent, and 1 means completely opaque. It has no
        effect if ``err_bound_type`` is ``'bar'``.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure object being created or being passed into this function.
        ``None``, if ``show_fig`` is set to ``False``.
    ax : matplotlib.axes._subplots.AxesSubplot
        The axes object being created or being passed into this function.
        ``None``, if ``show_fig`` is set to ``False``.
    x_mean : numpy.ndarray
        Mean X values of each data bin (in terms of X values).
    y_mean : numpy.ndarray
        Mean Y values of each data bin (in terms of X values).
    y_std : numpy.ndarray
        Standard deviation of Y values of each data bin (in terms of X
        values).
    y_SE : numpy.ndarray
        Standard error of ``y_mean``. It describes how far ``y_mean`` is from
        the population mean (or the "true mean value") within each bin, which
        is a different concept from ``y_std``.
        See https://en.wikipedia.org/wiki/Standard_error#Standard_error_of_mean_versus_standard_deviation
        for further information.
    stats_ : tuple<float>
        A tuple in the order of (r2_score_raw, corr_coeff_raw,
        r2_score_binned, corr_coeff_binned), which are the R^2 score and
        correlation coefficient of the raw data (``xdata`` and ``ydata``)
        and the binned averages (``x_mean`` and ``y_mean``).
    '''
    if not isinstance(xdata, hlp._array_like) or not isinstance(ydata, hlp._array_like):
        raise TypeError(
            '`xdata` and `ydata` must be lists, numpy arrays, or pandas Series.'
        )
    if len(xdata) != len(ydata):
        raise hlp.LengthError('`xdata` and `ydata` must have the same length.')
    if isinstance(xdata, list):
        xdata = np.array(xdata)  # otherwise boolean indexing won't work
    if isinstance(ydata, list):
        ydata = np.array(ydata)
    #------------Pre-process "bins"--------------------------------------------
    if isinstance(bins, (int, np.integer)):  # user specifies number of bins
        if bins <= 0:
            raise ValueError('`bins` must be a positive integer.')
        nr = bins + 1  # number of bin edges
        # Place bin edges at evenly spaced percentiles of the unique x values.
        x_uni = np.unique(xdata)
        bins = [np.nanpercentile(x_uni, (j + 0.) / (nr - 1) * 100) for j in range(nr)]
        # https://stackoverflow.com/a/4983359/8892243
        if not all(lo <= hi for lo, hi in zip(bins, bins[1:])):
            print(
                '\nWARNING: Resulting "bins" array is not monotonically '
                'increasing. Please use a smaller "bins" to avoid potential '
                'issues.\n'
            )
    elif isinstance(bins, (list, np.ndarray)):  # user specifies bin edges
        nr = len(bins)
    else:
        raise TypeError('`bins` must be either an integer or an array.')
    #-----------Pre-process xlabel and ylabel----------------------------------
    if not xlabel and isinstance(xdata, pd.Series):  # xdata has 'name' attr
        xlabel = xdata.name
    if not ylabel and isinstance(ydata, pd.Series):  # ydata has 'name' attr
        ylabel = ydata.name
    #-----------Validate subsamp_thres (hoisted out of the bin loop)-----------
    if subsamp_thres is not None and show_fig:
        if not isinstance(subsamp_thres, (int, np.integer)) or subsamp_thres <= 0:
            raise TypeError('`subsamp_thres` must be a positive integer or None.')
    #-----------Group data into bins-------------------------------------------
    inds = np.digitize(xdata, bins)
    x_mean = np.zeros(nr - 1)
    y_mean = np.zeros(nr - 1)
    y_std = np.zeros(nr - 1)
    y_SE = np.zeros(nr - 1)
    x_subs = []  # subsampled x data (for faster scatter plots)
    y_subs = []
    for j in range(nr - 1):  # loop over every bin
        x_in_bin = xdata[inds == j + 1]
        y_in_bin = ydata[inds == j + 1]
        #------------Calculate mean and std------------------------------------
        if len(x_in_bin) == 0:  # no point falls into current bin
            x_mean[j] = np.nan  # this is to prevent numpy from throwing...
            y_mean[j] = np.nan  # ...confusing warning messages
            y_std[j] = np.nan
            y_SE[j] = np.nan
        else:
            x_mean[j] = np.nanmean(x_in_bin)
            if distribution == 'normal':
                y_mean[j] = np.nanmean(y_in_bin)
                y_std[j] = np.nanstd(y_in_bin)
                y_SE[j] = stats.sem(y_in_bin)
            elif distribution == 'lognormal':
                s, loc, scale = stats.lognorm.fit(y_in_bin, floc=0)
                estimated_mu = np.log(scale)
                estimated_sigma = s
                y_mean[j] = np.exp(estimated_mu + estimated_sigma**2.0 / 2.0)
                y_std[j] = np.sqrt(
                    np.exp(2. * estimated_mu + estimated_sigma**2.)
                    * (np.exp(estimated_sigma**2.) - 1)
                )
                y_SE[j] = y_std[j] / np.sqrt(len(y_in_bin))
            else:
                raise ValueError(
                    "Valid values of `distribution` are "
                    "{'normal', 'lognormal'}. Not '%s'." % distribution
                )
        #------------Pick subsets of data, for faster plotting-----------------
        #------------Note that this does not affect mean and std---------------
        if subsamp_thres is not None and show_fig:
            if len(x_in_bin) > subsamp_thres:
                # Draw ONE set of random indices so the subsampled x and y
                # values stay paired. (Sampling x and y with two independent
                # np.random.choice() calls would scatter mismatched pairs.)
                rand_idx = np.random.choice(
                    len(x_in_bin), subsamp_thres, replace=False,
                )
                x_subs.extend(np.asarray(x_in_bin)[rand_idx])
                y_subs.extend(np.asarray(y_in_bin)[rand_idx])
            else:
                x_subs.extend(x_in_bin)
                y_subs.extend(y_in_bin)
    #-------------Calculate R^2 and corr. coeff.-------------------------------
    non_nan_indices = ~np.isnan(xdata) & ~np.isnan(ydata)
    xdata_without_nan = xdata[non_nan_indices]
    ydata_without_nan = ydata[non_nan_indices]
    r2_score_raw = hlp._calc_r2_score(ydata_without_nan, xdata_without_nan)  # treat "xdata" as "y_pred"
    corr_coeff_raw = np.corrcoef(xdata_without_nan, ydata_without_nan)[0, 1]
    r2_score_binned = hlp._calc_r2_score(y_mean, x_mean)
    corr_coeff_binned = np.corrcoef(x_mean, y_mean)[0, 1]
    stats_ = (r2_score_raw, corr_coeff_raw, r2_score_binned, corr_coeff_binned)
    #-------------Plot data on figure------------------------------------------
    if show_fig:
        fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi)
        if subsamp_thres:
            xdata, ydata = x_subs, y_subs
        ax.scatter(xdata, ydata, c='gray', alpha=0.3, label=raw_data_label, zorder=1)
        if error_bounds:
            if err_bound_type == 'shade':
                ax.plot(
                    x_mean, y_mean, '-o', c='orange', lw=2,
                    label=mean_data_label, zorder=3,
                )
                if show_SE:
                    ax.fill_between(
                        x_mean, y_mean + y_SE, y_mean - y_SE,
                        label=r'$\pm$ S.E.', facecolor='orange',
                        alpha=err_bound_shade_opacity, zorder=2.5,
                    )
                else:
                    ax.fill_between(
                        x_mean, y_mean + y_std, y_mean - y_std,
                        label=r'$\pm$ std', facecolor='orange',
                        alpha=err_bound_shade_opacity, zorder=2.5,
                    )
            elif err_bound_type == 'bar':
                if show_SE:
                    mean_data_label += r'$\pm$ S.E.'
                    ax.errorbar(
                        x_mean, y_mean, yerr=y_SE, ls='-', marker='o',
                        c='orange', lw=2, elinewidth=1, capsize=2,
                        label=mean_data_label, zorder=3,
                    )
                else:
                    mean_data_label += r'$\pm$ std'
                    ax.errorbar(
                        x_mean, y_mean, yerr=y_std, ls='-', marker='o',
                        c='orange', lw=2, elinewidth=1, capsize=2,
                        label=mean_data_label, zorder=3,
                    )
            else:
                raise ValueError(
                    'Valid "err_bound_type" names are {"shade", '
                    '"bar"}, not "%s".' % err_bound_type
                )
        else:
            ax.plot(
                x_mean, y_mean, '-o', c='orange', lw=2, label=mean_data_label,
                zorder=3,
            )
        ax.set_axisbelow(True)
        if xlabel:
            ax.set_xlabel(xlabel)
        if ylabel:
            ax.set_ylabel(ylabel)
        if logx:
            ax.set_xscale('log')
        if logy:
            ax.set_yscale('log')
        if grid_on:
            ax.grid(ls=':')
            ax.set_axisbelow(True)
        if show_bins:
            ylims = ax.get_ylim()
            for k, edge in enumerate(bins):
                lab_ = 'bin edges' if k == 0 else None  # only label 1st edge
                ec = cl.get_colors(N=1)[0]
                ax.plot([edge] * 2, ylims, '--', c=ec, lw=1.0, zorder=2, label=lab_)
        if legend_on:
            ax.legend(loc='best')
        if show_stats:
            # Raw strings: \mathrm is a LaTeX command, not a string escape.
            stats_text = (
                r"$R^2_{\mathrm{raw}}$=%.2f, $r_{\mathrm{raw}}$=%.2f, "
                r"$R^2_{\mathrm{avg}}$=%.2f, "
                r"$r_{\mathrm{avg}}$=%.2f" % stats_
            )
            ax.set_title(stats_text)
        return fig, ax, x_mean, y_mean, y_std, y_SE, stats_
    else:
        return None, None, x_mean, y_mean, y_std, y_SE, stats_