Source code for xhistogram.xarray

"""
Xarray API for xhistogram.
"""

import xarray as xr
from collections import OrderedDict
from .core import histogram as _histogram

# range is a keyword so save the builtin so they can use it.
_range = range


[docs]def histogram( *args, bins=None, range=None, dim=None, weights=None, density=False, block_size="auto", keep_coords=False, bin_dim_suffix="_bin", ): """Histogram applied along specified dimensions. Parameters ---------- args : xarray.DataArray objects Input data. The number of input arguments determines the dimensonality of the histogram. For example, two arguments prodocue a 2D histogram. All args must be aligned and have the same dimensions. bins : int, str or numpy array or a list of ints, strs and/or arrays, optional If a list, there should be one entry for each item in ``args``. The bin specifications are as follows: * If int; the number of bins for all arguments in ``args``. * If str; the method used to automatically calculate the optimal bin width for all arguments in ``args``, as defined by numpy `histogram_bin_edges`. * If numpy array; the bin edges for all arguments in ``args``. * If a list of ints, strs and/or arrays; the bin specification as above for every argument in ``args``. When bin edges are specified, all but the last (righthand-most) bin include the left edge and exclude the right edge. The last bin includes both edges. A TypeError will be raised if args or weights contains dask arrays and bins are not specified explicitly as an array or list of arrays. This is because other bin specifications trigger computation. range : (float, float) or a list of (float, float), optional If a list, there should be one entry for each item in ``args``. The range specifications are as follows: * If (float, float); the lower and upper range(s) of the bins for all arguments in ``args``. Values outside the range are ignored. The first element of the range must be less than or equal to the second. `range` affects the automatic bin computation as well. In this case, while bin width is computed to be optimal based on the actual data within `range`, the bin count will fill the entire range including portions containing no data. * If a list of (float, float); the ranges as above for every argument in ``args``. * If not provided, range is simply ``(arg.min(), arg.max())`` for each arg. dim : tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count (instead of 1). If `density` is True, the weights are normalized, so that the integral of the density over the range remains 1. NaNs in the weights input will fill the entire bin with NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` before running ``histogram()``. density : bool, optional If ``False``, the result will contain the number of samples in each bin. If ``True``, the result is the value of the probability *density* function at the bin, normalized such that the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unity width are chosen; it is not a probability *mass* function. block_size : int or 'auto', optional A parameter which governs the algorithm used to compute the histogram. Using a nonzero value splits the histogram calculation over the non-histogram axes into blocks of size ``block_size``, iterating over them with a loop (numpy inputs) or in parallel (dask inputs). If ``'auto'``, blocks will be determined either by the underlying dask chunks (dask inputs) or an experimental built-in heuristic (numpy inputs). keep_coords : bool, optional If ``True``, keep all coordinates. Default: ``False`` bin_dim_suffix : str, optional Suffix to append to input arg names to define names of output bin dimensions Returns ------- hist : xarray.DataArray The values of the histogram. For each bin, the midpoint of the bin edges is given along the bin coordinates. """ args = list(args) N_args = len(args) # TODO: allow list of weights as well N_weights = 1 if weights is not None else 0 for a in args: if not isinstance(a, xr.DataArray): raise TypeError( "xhistogram.xarray.histogram accepts only xarray.DataArray " + f"objects but a {type(a).__name__} was provided" ) for a in args: assert a.name is not None, "all arrays must have a name" # we drop coords to simplify alignment if not keep_coords: args = [da.reset_coords(drop=True) for da in args] if N_weights: args += [weights.reset_coords(drop=True)] # explicitly broadcast so we understand what is going into apply_ufunc # (apply_ufunc might be doing this by itself again) args = list(xr.align(*args, join="exact")) # what happens if we skip this? # args = list(xr.broadcast(*args)) a0 = args[0] a_coords = a0.coords # roll our own broadcasting # now manually expand the arrays all_dims = [d for a in args for d in a.dims] all_dims_ordered = list(OrderedDict.fromkeys(all_dims)) args_expanded = [] for a in args: expand_keys = [d for d in all_dims_ordered if d not in a.dims] a_expanded = a.expand_dims({k: 1 for k in expand_keys}) args_expanded.append(a_expanded) # only transpose if necessary, to avoid creating unnecessary dask tasks args_transposed = [] for a in args_expanded: if a.dims != all_dims_ordered: args_transposed.append(a.transpose(*all_dims_ordered)) else: args.transposed.append(a) args_data = [a.data for a in args_transposed] if N_weights: weights_data = args_data.pop() else: weights_data = None if dim is not None: dims_to_keep = [d for d in all_dims_ordered if d not in dim] axis = [args_transposed[0].get_axis_num(d) for d in dim] else: dims_to_keep = [] axis = None h_data, bins = _histogram( *args_data, weights=weights_data, bins=bins, range=range, axis=axis, density=density, block_size=block_size, ) # create output dims new_dims = [a.name + bin_dim_suffix for a in args[:N_args]] output_dims = dims_to_keep + new_dims # create new coords bin_centers = [0.5 * (bin[:-1] + bin[1:]) for bin in bins] new_coords = { name: ((name,), bin_center, a.attrs) for name, bin_center, a in zip(new_dims, bin_centers, args) } # old coords associated with dims old_dim_coords = {name: a0[name] for name in dims_to_keep if name in a_coords} all_coords = {} all_coords.update(old_dim_coords) all_coords.update(new_coords) # add compatible coords if keep_coords: for c in a_coords: if c not in all_coords and set(a0[c].dims).issubset(output_dims): all_coords[c] = a0[c] output_name = "_".join(["histogram"] + [a.name for a in args[:N_args]]) da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords, name=output_name) return da_out
# we need weights to be passed through apply_func's alignment algorithm, # so we include it as an arg, so we create a wrapper function to do so # this feels like a hack # def _histogram_wrapped(*args, **kwargs): # alist = list(args) # weights = [alist.pop() for n in _range(N_weights)] # if N_weights == 0: # weights = None # elif N_weights == 1: # weights = weights[0] # squeeze # return _histogram(*alist, weights=weights, **kwargs)