- """
- Histogram-related functions
- """
- import contextlib
- import functools
- import operator
- import warnings
- import numpy as np
- from numpy.core import overrides
- __all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
- array_function_dispatch = functools.partial(
- overrides.array_function_dispatch, module='numpy')
- # range is a keyword argument to many functions, so save the builtin so they can
- # use it.
- _range = range
- def _ptp(x):
- """Peak-to-peak value of x.
- This implementation avoids the problem of signed integer arrays having a
- peak-to-peak value that cannot be represented with the array's data type.
- This function returns an unsigned value for signed integer arrays.
- """
- return _unsigned_subtract(x.max(), x.min())
- def _hist_bin_sqrt(x, range):
- """
- Square root histogram bin estimator.
- Bin width is inversely proportional to the square root of the data size.
- Used by many programs for its simplicity.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
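- Examples
- --------
- A rough sketch of the rule (illustrative numbers only):
- >>> x = np.arange(1000, dtype=float)   # ptp is 999, size is 1000
- >>> width = _hist_bin_sqrt(x, None)    # 999 / sqrt(1000), about 31.6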
- """
- del range # unused
- return _ptp(x) / np.sqrt(x.size)
- def _hist_bin_sturges(x, range):
- """
- Sturges histogram bin estimator.
- A very simplistic estimator based on the assumption of normality of
- the data. This estimator has poor performance for non-normal data,
- which becomes especially obvious for large data sets. The estimate
- depends only on the size of the data.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
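- Examples
- --------
- A rough sketch of the rule (illustrative numbers only):
- >>> x = np.arange(1024, dtype=float)    # ptp is 1023, size is 1024
- >>> width = _hist_bin_sturges(x, None)  # 1023 / (log2(1024) + 1) = 93.0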
- """
- del range # unused
- return _ptp(x) / (np.log2(x.size) + 1.0)
- def _hist_bin_rice(x, range):
- """
- Rice histogram bin estimator.
- Another simple estimator with no normality assumption. It has better
- performance for large data than Sturges, but tends to overestimate
- the number of bins. The number of bins is proportional to the cube
- root of data size (asymptotically optimal). The estimate depends
- only on the size of the data.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
- def _hist_bin_scott(x, range):
- """
- Scott histogram bin estimator.
- The binwidth is proportional to the standard deviation of the data
- and inversely proportional to the cube root of data size
- (asymptotically optimal).
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
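- Examples
- --------
- The constant ``(24 * sqrt(pi))**(1/3)`` is roughly 3.49, so the width is
- approximately ``3.49 * std(x) * x.size**(-1/3)`` (a rough sketch):
- >>> x = np.random.default_rng(0).normal(size=1000)
- >>> width = _hist_bin_scott(x, None)    # roughly 3.49 * std(x) / 10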
- """
- del range # unused
- return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
- def _hist_bin_stone(x, range):
- """
- Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
- The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
- The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
- https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
- This rule appears to originate in the following paper by Stone:
- http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- range : (float, float)
- The lower and upper range of the bins.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- n = x.size
- ptp_x = _ptp(x)
- if n <= 1 or ptp_x == 0:
- return 0
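- # jhat(nbins) below evaluates the leave-one-out cross-validation estimate
- # of the ISE for the bin width h = ptp_x / nbins, dropping factors that do
- # not depend on the number of bins:
- #     J(h) ~ (2 - (n + 1) * sum(p_k ** 2)) / h
- # where p_k is the fraction of the samples falling in bin k.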
- def jhat(nbins):
- hh = ptp_x / nbins
- p_k = np.histogram(x, bins=nbins, range=range)[0] / n
- return (2 - (n + 1) * p_k.dot(p_k)) / hh
- nbins_upper_bound = max(100, int(np.sqrt(n)))
- nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
- if nbins == nbins_upper_bound:
- warnings.warn("The number of bins estimated may be suboptimal.",
- RuntimeWarning, stacklevel=3)
- return ptp_x / nbins
- def _hist_bin_doane(x, range):
- """
- Doane's histogram bin estimator.
- Improved version of Sturges' formula which works better for
- non-normal data. See
- stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- if x.size > 2:
- sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
- sigma = np.std(x)
- if sigma > 0.0:
- # These three operations add up to
- # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
- # but use only one temp array instead of three
- temp = x - np.mean(x)
- np.true_divide(temp, sigma, temp)
- np.power(temp, 3, temp)
- g1 = np.mean(temp)
- return _ptp(x) / (1.0 + np.log2(x.size) +
- np.log2(1.0 + np.absolute(g1) / sg1))
- return 0.0
- def _hist_bin_fd(x, range):
- """
- The Freedman-Diaconis histogram bin estimator.
- The Freedman-Diaconis rule uses interquartile range (IQR) to
- estimate binwidth. It is considered a variation of the Scott rule
- with more robustness as the IQR is less affected by outliers than
- the standard deviation. However, the IQR depends on fewer points
- than the standard deviation, so it is less accurate, especially for
- long tailed distributions.
- If the IQR is 0, this function returns 0 for the bin width.
- Binwidth is inversely proportional to the cube root of data size
- (asymptotically optimal).
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
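- Examples
- --------
- A rough sketch (for a standard normal sample the IQR is about 1.35, so
- the width is roughly ``2 * 1.35 * n**(-1/3)``):
- >>> x = np.random.default_rng(0).normal(size=1000)
- >>> width = _hist_bin_fd(x, None)    # about 2 * 1.35 / 10 = 0.27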
- """
- del range # unused
- iqr = np.subtract(*np.percentile(x, [75, 25]))
- return 2.0 * iqr * x.size ** (-1.0 / 3.0)
- def _hist_bin_auto(x, range):
- """
- Histogram bin estimator that uses the minimum width of the
- Freedman-Diaconis and Sturges estimators if the FD bin width is non-zero.
- If the bin width from the FD estimator is 0, the Sturges estimator is used.
- The FD estimator is usually the most robust method, but its width
- estimate tends to be too large for small `x` and unreliable for data with
- limited variance. The Sturges estimator is quite good for small (<1000) datasets
- and is the default in the R language. This method gives good off-the-shelf
- behaviour.
- .. versionchanged:: 1.15.0
- If there is limited variance the IQR can be 0, which results in the
- FD bin width being 0 too. This is not a valid bin width, so
- ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
- If the IQR is 0, it's unlikely any variance-based estimators will be of
- use, so we revert to the Sturges estimator, which only uses the size of the
- dataset in its calculation.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- See Also
- --------
- _hist_bin_fd, _hist_bin_sturges
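- Examples
- --------
- A sketch of the fallback behaviour: when the IQR (and hence the FD
- width) is zero, the Sturges width is returned instead.
- >>> x = np.append(np.zeros(99), 1.0)    # IQR is 0, but ptp is 1
- >>> bool(_hist_bin_auto(x, None) == _hist_bin_sturges(x, None))
- True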
- """
- fd_bw = _hist_bin_fd(x, range)
- sturges_bw = _hist_bin_sturges(x, range)
- del range # unused
- if fd_bw:
- return min(fd_bw, sturges_bw)
- else:
- # limited variance, so we return a length-dependent bin width estimator
- return sturges_bw
- # Private dict initialized at module load time
- _hist_bin_selectors = {'stone': _hist_bin_stone,
- 'auto': _hist_bin_auto,
- 'doane': _hist_bin_doane,
- 'fd': _hist_bin_fd,
- 'rice': _hist_bin_rice,
- 'scott': _hist_bin_scott,
- 'sqrt': _hist_bin_sqrt,
- 'sturges': _hist_bin_sturges}
- def _ravel_and_check_weights(a, weights):
- """ Check a and weights have matching shapes, and ravel both """
- a = np.asarray(a)
- # Ensure that the array is a "subtractable" dtype
- if a.dtype == np.bool_:
- warnings.warn("Converting input from {} to {} for compatibility."
- .format(a.dtype, np.uint8),
- RuntimeWarning, stacklevel=3)
- a = a.astype(np.uint8)
- if weights is not None:
- weights = np.asarray(weights)
- if weights.shape != a.shape:
- raise ValueError(
- 'weights should have the same shape as a.')
- weights = weights.ravel()
- a = a.ravel()
- return a, weights
- def _get_outer_edges(a, range):
- """
- Determine the outer bin edges to use, from either the data or the range
- argument
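- For example (a sketch), constant data gets its range padded by 0.5 on
- each side so the bin width is nonzero:
- >>> lo, hi = _get_outer_edges(np.array([2., 2., 2.]), None)   # (1.5, 2.5)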
- """
- if range is not None:
- first_edge, last_edge = range
- if first_edge > last_edge:
- raise ValueError(
- 'max must be larger than or equal to min in range parameter.')
- if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
- raise ValueError(
- "supplied range of [{}, {}] is not finite".format(first_edge, last_edge))
- elif a.size == 0:
- # handle empty arrays. Can't determine range, so use 0-1.
- first_edge, last_edge = 0, 1
- else:
- first_edge, last_edge = a.min(), a.max()
- if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
- raise ValueError(
- "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
- # expand empty range to avoid divide by zero
- if first_edge == last_edge:
- first_edge = first_edge - 0.5
- last_edge = last_edge + 0.5
- return first_edge, last_edge
- def _unsigned_subtract(a, b):
- """
- Subtract two values where a >= b, and produce an unsigned result.
- This is needed when finding the difference between the upper and lower
- bound of an int16 histogram
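- For example (a sketch), the int8 difference 127 - (-128) overflows the
- signed type but fits in the unsigned result:
- >>> d = _unsigned_subtract(np.int8(127), np.int8(-128))   # uint8 value 255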
- """
- # coerce to a single type
- signed_to_unsigned = {
- np.byte: np.ubyte,
- np.short: np.ushort,
- np.intc: np.uintc,
- np.int_: np.uint,
- np.longlong: np.ulonglong
- }
- dt = np.result_type(a, b)
- try:
- dt = signed_to_unsigned[dt.type]
- except KeyError:
- return np.subtract(a, b, dtype=dt)
- else:
- # we know the inputs are integers, and we are deliberately casting
- # signed to unsigned
- return np.subtract(a, b, casting='unsafe', dtype=dt)
- def _get_bin_edges(a, bins, range, weights):
- """
- Computes the bins used internally by `histogram`.
- Parameters
- ==========
- a : ndarray
- Ravelled data array
- bins, range
- Forwarded arguments from `histogram`.
- weights : ndarray, optional
- Ravelled weights array, or None
- Returns
- =======
- bin_edges : ndarray
- Array of bin edges
- uniform_bins : (Number, Number, int):
- The lower bound, upper bound, and number of bins, used in the optimized
- implementation of `histogram` that works on uniform bins.
- """
- # parse the overloaded bins argument
- n_equal_bins = None
- bin_edges = None
- if isinstance(bins, str):
- bin_name = bins
- # if `bins` is a string for an automatic method,
- # this will replace it with the number of bins calculated
- if bin_name not in _hist_bin_selectors:
- raise ValueError(
- "{!r} is not a valid estimator for `bins`".format(bin_name))
- if weights is not None:
- raise TypeError("Automated estimation of the number of "
- "bins is not supported for weighted data")
- first_edge, last_edge = _get_outer_edges(a, range)
- # truncate the range if needed
- if range is not None:
- keep = (a >= first_edge)
- keep &= (a <= last_edge)
- if not np.logical_and.reduce(keep):
- a = a[keep]
- if a.size == 0:
- n_equal_bins = 1
- else:
- # Do not call selectors on empty arrays
- width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
- if width:
- n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
- else:
- # Width can be zero for some estimators, e.g. FD when
- # the IQR of the data is zero.
- n_equal_bins = 1
- elif np.ndim(bins) == 0:
- try:
- n_equal_bins = operator.index(bins)
- except TypeError as e:
- raise TypeError(
- '`bins` must be an integer, a string, or an array') from e
- if n_equal_bins < 1:
- raise ValueError('`bins` must be positive, when an integer')
- first_edge, last_edge = _get_outer_edges(a, range)
- elif np.ndim(bins) == 1:
- bin_edges = np.asarray(bins)
- if np.any(bin_edges[:-1] > bin_edges[1:]):
- raise ValueError(
- '`bins` must increase monotonically, when an array')
- else:
- raise ValueError('`bins` must be 1d, when an array')
- if n_equal_bins is not None:
- # gh-10322 means that type resolution rules are dependent on array
- # shapes. To avoid this causing problems, we pick a type now and stick
- # with it throughout.
- bin_type = np.result_type(first_edge, last_edge, a)
- if np.issubdtype(bin_type, np.integer):
- bin_type = np.result_type(bin_type, float)
- # bin edges must be computed
- bin_edges = np.linspace(
- first_edge, last_edge, n_equal_bins + 1,
- endpoint=True, dtype=bin_type)
- return bin_edges, (first_edge, last_edge, n_equal_bins)
- else:
- return bin_edges, None
- def _search_sorted_inclusive(a, v):
- """
- Like `searchsorted`, but where the last item in `v` is placed on the right.
- In the context of a histogram, this makes the last bin edge inclusive
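- For example (a sketch), the final value of `v` is matched with 'right',
- so an entry of `a` equal to it lands in the last bin:
- >>> a = np.array([0., 1., 1., 2.])
- >>> idx = _search_sorted_inclusive(a, np.array([0., 1., 2.]))   # [0, 1, 4]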
- """
- return np.concatenate((
- a.searchsorted(v[:-1], 'left'),
- a.searchsorted(v[-1:], 'right')
- ))
- def _histogram_bin_edges_dispatcher(a, bins=None, range=None, weights=None):
- return (a, bins, weights)
- @array_function_dispatch(_histogram_bin_edges_dispatcher)
- def histogram_bin_edges(a, bins=10, range=None, weights=None):
- r"""
- Function to calculate only the edges of the bins used by the `histogram`
- function.
- Parameters
- ----------
- a : array_like
- Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars or str, optional
- If `bins` is an int, it defines the number of equal-width
- bins in the given range (10, by default). If `bins` is a
- sequence, it defines the bin edges, including the rightmost
- edge, allowing for non-uniform bin widths.
- If `bins` is a string from the list below, `histogram_bin_edges` will use
- the method chosen to calculate the optimal bin width and
- consequently the number of bins (see `Notes` for more detail on
- the estimators) from the data that falls within the requested
- range. While the bin width will be optimal for the actual data
- in the range, the number of bins will be computed to fill the
- entire range, including the empty portions. For visualisation,
- using the 'auto' option is suggested. Weighted data is not
- supported for automated bin size selection.
- 'auto'
- Maximum of the 'sturges' and 'fd' estimators. Provides good
- all around performance.
- 'fd' (Freedman Diaconis Estimator)
- Robust (resilient to outliers) estimator that takes into
- account data variability and data size.
- 'doane'
- An improved version of Sturges' estimator that works better
- with non-normal datasets.
- 'scott'
- Less robust estimator that takes into account data
- variability and data size.
- 'stone'
- Estimator based on leave-one-out cross-validation estimate of
- the integrated squared error. Can be regarded as a generalization
- of Scott's rule.
- 'rice'
- Estimator does not take variability into account, only data
- size. Commonly overestimates number of bins required.
- 'sturges'
- R's default method, only accounts for data size. Only
- optimal for gaussian data and underestimates number of bins
- for large non-gaussian datasets.
- 'sqrt'
- Square root (of data size) estimator, used by Excel and
- other programs for its speed and simplicity.
- range : (float, float), optional
- The lower and upper range of the bins. If not provided, range
- is simply ``(a.min(), a.max())``. Values outside the range are
- ignored. The first element of the range must be less than or
- equal to the second. `range` affects the automatic bin
- computation as well. While bin width is computed to be optimal
- based on the actual data within `range`, the bin count will fill
- the entire range including portions containing no data.
- weights : array_like, optional
- An array of weights, of the same shape as `a`. Each value in
- `a` only contributes its associated weight towards the bin count
- (instead of 1). This is currently not used by any of the bin estimators,
- but may be in the future.
- Returns
- -------
- bin_edges : array of dtype float
- The edges to pass into `histogram`
- See Also
- --------
- histogram
- Notes
- -----
- The methods to estimate the optimal number of bins are well founded
- in literature, and are inspired by the choices R provides for
- histogram visualisation. Note that having the number of bins
- proportional to :math:`n^{1/3}` is asymptotically optimal, which is
- why it appears in most estimators. These are simply plug-in methods
- that give good starting points for number of bins. In the equations
- below, :math:`h` is the binwidth and :math:`n_h` is the number of
- bins. All estimators that compute bin counts are recast to bin width
- using the `ptp` of the data. The final bin count is obtained from
- ``np.ceil(range / h)``. The final bin width is often less
- than what is returned by the estimators below.
- 'auto' (maximum of the 'sturges' and 'fd' estimators)
- A compromise to get a good value. For small datasets the Sturges
- value will usually be chosen, while larger datasets will usually
- default to FD. Avoids the overly conservative behaviour of FD
- and Sturges for small and large datasets respectively.
- Switchover point is usually :math:`a.size \approx 1000`.
- 'fd' (Freedman Diaconis Estimator)
- .. math:: h = 2 \frac{IQR}{n^{1/3}}
- The binwidth is proportional to the interquartile range (IQR)
- and inversely proportional to cube root of a.size. Can be too
- conservative for small datasets, but is quite good for large
- datasets. The IQR is very robust to outliers.
- 'scott'
- .. math:: h = \sigma \sqrt[3]{\frac{24\sqrt{\pi}}{n}}
- The binwidth is proportional to the standard deviation of the
- data and inversely proportional to cube root of ``x.size``. Can
- be too conservative for small datasets, but is quite good for
- large datasets. The standard deviation is not very robust to
- outliers. Values are very similar to the Freedman-Diaconis
- estimator in the absence of outliers.
- 'rice'
- .. math:: n_h = 2n^{1/3}
- The number of bins is only proportional to cube root of
- ``a.size``. It tends to overestimate the number of bins and it
- does not take into account data variability.
- 'sturges'
- .. math:: n_h = \log _{2}n+1
- The number of bins is the base 2 log of ``a.size``. This
- estimator assumes normality of data and is too conservative for
- larger, non-normal datasets. This is the default method in R's
- ``hist`` method.
- 'doane'
- .. math:: n_h = 1 + \log_{2}(n) +
- \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}})
- g_1 = mean[(\frac{x - \mu}{\sigma})^3]
- \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
- An improved version of Sturges' formula that produces better
- estimates for non-normal datasets. This estimator attempts to
- account for the skew of the data.
- 'sqrt'
- .. math:: n_h = \sqrt n
- The simplest and fastest estimator. Only takes into account the
- data size.
- Examples
- --------
- >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
- >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1))
- array([0. , 0.25, 0.5 , 0.75, 1. ])
- >>> np.histogram_bin_edges(arr, bins=2)
- array([0. , 2.5, 5. ])
- For consistency with histogram, an array of pre-computed bins is
- passed through unmodified:
- >>> np.histogram_bin_edges(arr, [1, 2])
- array([1, 2])
- This function allows one set of bins to be computed, and reused across
- multiple histograms:
- >>> shared_bins = np.histogram_bin_edges(arr, bins='auto')
- >>> shared_bins
- array([0., 1., 2., 3., 4., 5.])
- >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
- >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins)
- >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins)
- >>> hist_0; hist_1
- array([1, 1, 0, 1, 0])
- array([2, 0, 1, 1, 2])
- Which gives more easily comparable results than using separate bins for
- each histogram:
- >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
- >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
- >>> hist_0; hist_1
- array([1, 1, 1])
- array([2, 1, 1, 2])
- >>> bins_0; bins_1
- array([0., 1., 2., 3.])
- array([0. , 1.25, 2.5 , 3.75, 5. ])
- """
- a, weights = _ravel_and_check_weights(a, weights)
- bin_edges, _ = _get_bin_edges(a, bins, range, weights)
- return bin_edges
- def _histogram_dispatcher(
- a, bins=None, range=None, normed=None, weights=None, density=None):
- return (a, bins, weights)
- @array_function_dispatch(_histogram_dispatcher)
- def histogram(a, bins=10, range=None, normed=None, weights=None,
- density=None):
- r"""
- Compute the histogram of a dataset.
- Parameters
- ----------
- a : array_like
- Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars or str, optional
- If `bins` is an int, it defines the number of equal-width
- bins in the given range (10, by default). If `bins` is a
- sequence, it defines a monotonically increasing array of bin edges,
- including the rightmost edge, allowing for non-uniform bin widths.
- .. versionadded:: 1.11.0
- If `bins` is a string, it defines the method used to calculate the
- optimal bin width, as defined by `histogram_bin_edges`.
- range : (float, float), optional
- The lower and upper range of the bins. If not provided, range
- is simply ``(a.min(), a.max())``. Values outside the range are
- ignored. The first element of the range must be less than or
- equal to the second. `range` affects the automatic bin
- computation as well. While bin width is computed to be optimal
- based on the actual data within `range`, the bin count will fill
- the entire range including portions containing no data.
- normed : bool, optional
- .. deprecated:: 1.6.0
- This is equivalent to the `density` argument, but produces incorrect
- results for unequal bin widths. It should not be used.
- .. versionchanged:: 1.15.0
- DeprecationWarnings are actually emitted.
- weights : array_like, optional
- An array of weights, of the same shape as `a`. Each value in
- `a` only contributes its associated weight towards the bin count
- (instead of 1). If `density` is True, the weights are
- normalized, so that the integral of the density over the range
- remains 1.
- density : bool, optional
- If ``False``, the result will contain the number of samples in
- each bin. If ``True``, the result is the value of the
- probability *density* function at the bin, normalized such that
- the *integral* over the range is 1. Note that the sum of the
- histogram values will not be equal to 1 unless bins of unity
- width are chosen; it is not a probability *mass* function.
- Overrides the ``normed`` keyword if given.
- Returns
- -------
- hist : array
- The values of the histogram. See `density` and `weights` for a
- description of the possible semantics.
- bin_edges : array of dtype float
- Return the bin edges ``(length(hist)+1)``.
- See Also
- --------
- histogramdd, bincount, searchsorted, digitize, histogram_bin_edges
- Notes
- -----
- All but the last (righthand-most) bin is half-open. In other words,
- if `bins` is::
- [1, 2, 3, 4]
- then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
- the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
- *includes* 4.
- Examples
- --------
- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
- (array([0, 2, 1]), array([0, 1, 2, 3]))
- >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
- (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
- >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
- (array([1, 4, 1]), array([0, 1, 2, 3]))
- >>> a = np.arange(5)
- >>> hist, bin_edges = np.histogram(a, density=True)
- >>> hist
- array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
- >>> hist.sum()
- 2.4999999999999996
- >>> np.sum(hist * np.diff(bin_edges))
- 1.0
- .. versionadded:: 1.11.0
- Automated Bin Selection Methods example, using 2 peak random data
- with 2000 points:
- >>> import matplotlib.pyplot as plt
- >>> rng = np.random.RandomState(10) # deterministic random data
- >>> a = np.hstack((rng.normal(size=1000),
- ... rng.normal(loc=5, scale=2, size=1000)))
- >>> _ = plt.hist(a, bins='auto') # arguments are passed to np.histogram
- >>> plt.title("Histogram with 'auto' bins")
- Text(0.5, 1.0, "Histogram with 'auto' bins")
- >>> plt.show()
- """
- a, weights = _ravel_and_check_weights(a, weights)
- bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
- # Histogram is an integer or a float array depending on the weights.
- if weights is None:
- ntype = np.dtype(np.intp)
- else:
- ntype = weights.dtype
- # We set a block size, as this allows us to iterate over chunks when
- # computing histograms, to minimize memory usage.
- BLOCK = 65536
- # The fast path uses bincount, but that only works for certain types
- # of weight
- simple_weights = (
- weights is None or
- np.can_cast(weights.dtype, np.double) or
- np.can_cast(weights.dtype, complex)
- )
- if uniform_bins is not None and simple_weights:
- # Fast algorithm for equal bins
- # We now convert values of a to bin indices, under the assumption of
- # equal bin widths (which is valid here).
- first_edge, last_edge, n_equal_bins = uniform_bins
- # Initialize empty histogram
- n = np.zeros(n_equal_bins, ntype)
- # Pre-compute histogram scaling factor
- norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
- # We iterate over blocks here for two reasons: the first is that for
- # large arrays, it is actually faster (for example for a 10^8 array it
- # is 2x as fast) and it results in a memory footprint 3x lower in the
- # limit of large arrays.
- for i in _range(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- if weights is None:
- tmp_w = None
- else:
- tmp_w = weights[i:i + BLOCK]
- # Only include values in the right range
- keep = (tmp_a >= first_edge)
- keep &= (tmp_a <= last_edge)
- if not np.logical_and.reduce(keep):
- tmp_a = tmp_a[keep]
- if tmp_w is not None:
- tmp_w = tmp_w[keep]
- # This cast ensures no type promotions occur below, which gh-10322
- # makes unpredictable. Getting it wrong leads to precision errors
- # like gh-8123.
- tmp_a = tmp_a.astype(bin_edges.dtype, copy=False)
- # Compute the bin indices, and for values that lie exactly on
- # last_edge we need to subtract one
- f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
- indices = f_indices.astype(np.intp)
- indices[indices == n_equal_bins] -= 1
- # The index computation is not guaranteed to give exactly
- # consistent results within ~1 ULP of the bin edges.
- decrement = tmp_a < bin_edges[indices]
- indices[decrement] -= 1
- # The last bin includes the right edge. The other bins do not.
- increment = ((tmp_a >= bin_edges[indices + 1])
- & (indices != n_equal_bins - 1))
- indices[increment] += 1
- # We now compute the histogram using bincount
- if ntype.kind == 'c':
- n.real += np.bincount(indices, weights=tmp_w.real,
- minlength=n_equal_bins)
- n.imag += np.bincount(indices, weights=tmp_w.imag,
- minlength=n_equal_bins)
- else:
- n += np.bincount(indices, weights=tmp_w,
- minlength=n_equal_bins).astype(ntype)
- else:
- # Compute via cumulative histogram
- cum_n = np.zeros(bin_edges.shape, ntype)
- if weights is None:
- for i in _range(0, len(a), BLOCK):
- sa = np.sort(a[i:i+BLOCK])
- cum_n += _search_sorted_inclusive(sa, bin_edges)
- else:
- zero = np.zeros(1, dtype=ntype)
- for i in _range(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- tmp_w = weights[i:i+BLOCK]
- sorting_index = np.argsort(tmp_a)
- sa = tmp_a[sorting_index]
- sw = tmp_w[sorting_index]
- cw = np.concatenate((zero, sw.cumsum()))
- bin_index = _search_sorted_inclusive(sa, bin_edges)
- cum_n += cw[bin_index]
- n = np.diff(cum_n)
- # density overrides the normed keyword
- if density is not None:
- if normed is not None:
- # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6)
- warnings.warn(
- "The normed argument is ignored when density is provided. "
- "In future passing both will result in an error.",
- DeprecationWarning, stacklevel=3)
- normed = None
- if density:
- db = np.array(np.diff(bin_edges), float)
- return n/db/n.sum(), bin_edges
- elif normed:
- # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6)
- warnings.warn(
- "Passing `normed=True` on non-uniform bins has always been "
- "broken, and computes neither the probability density "
- "function nor the probability mass function. "
- "The result is only correct if the bins are uniform, when "
- "density=True will produce the same result anyway. "
- "The argument will be removed in a future version of "
- "numpy.",
- np.VisibleDeprecationWarning, stacklevel=3)
- # this normalization is incorrect, but
- db = np.array(np.diff(bin_edges), float)
- return n/(n*db).sum(), bin_edges
- else:
- if normed is not None:
- # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6)
- warnings.warn(
- "Passing normed=False is deprecated, and has no effect. "
- "Consider passing the density argument instead.",
- DeprecationWarning, stacklevel=3)
- return n, bin_edges
- def _histogramdd_dispatcher(sample, bins=None, range=None, normed=None,
- weights=None, density=None):
- if hasattr(sample, 'shape'): # same condition as used in histogramdd
- yield sample
- else:
- yield from sample
- with contextlib.suppress(TypeError):
- yield from bins
- yield weights
- @array_function_dispatch(_histogramdd_dispatcher)
- def histogramdd(sample, bins=10, range=None, normed=None, weights=None,
- density=None):
- """
- Compute the multidimensional histogram of some data.
- Parameters
- ----------
- sample : (N, D) array, or (D, N) array_like
- The data to be histogrammed.
- Note the unusual interpretation of sample when an array_like:
- * When an array, each row is a coordinate in a D-dimensional space -
- such as ``histogramdd(np.array([p1, p2, p3]))``.
- * When an array_like, each element is the list of values for single
- coordinate - such as ``histogramdd((X, Y, Z))``.
- The first form should be preferred.
- bins : sequence or int, optional
- The bin specification:
- * A sequence of arrays describing the monotonically increasing bin
- edges along each dimension.
- * The number of bins for each dimension (nx, ny, ... =bins)
- * The number of bins for all dimensions (nx=ny=...=bins).
- range : sequence, optional
- A sequence of length D, each an optional (lower, upper) tuple giving
- the outer bin edges to be used if the edges are not given explicitly in
- `bins`.
- An entry of None in the sequence results in the minimum and maximum
- values being used for the corresponding dimension.
- The default, None, is equivalent to passing a tuple of D None values.
- density : bool, optional
- If False, the default, returns the number of samples in each bin.
- If True, returns the probability *density* function at the bin,
- ``bin_count / sample_count / bin_volume``.
- normed : bool, optional
- An alias for the density argument that behaves identically. To avoid
- confusion with the broken normed argument to `histogram`, `density`
- should be preferred.
- weights : (N,) array_like, optional
- An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
- Weights are normalized to 1 if normed is True. If normed is False,
- the values of the returned histogram are equal to the sum of the
- weights belonging to the samples falling into each bin.
- Returns
- -------
- H : ndarray
- The multidimensional histogram of sample x. See normed and weights
- for the different possible semantics.
- edges : list
- A list of D arrays describing the bin edges for each dimension.
- See Also
- --------
- histogram: 1-D histogram
- histogram2d: 2-D histogram
- Examples
- --------
- >>> r = np.random.randn(100,3)
- >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
- >>> H.shape, edges[0].size, edges[1].size, edges[2].size
- ((5, 8, 4), 6, 9, 5)
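- The array_like form, where each element gives the values along one
- coordinate, produces the same kind of result (a sketch; shapes only,
- since the data are random):
- >>> x, y = np.random.randn(2, 100)
- >>> H, edges = np.histogramdd((x, y), bins=(4, 6))
- >>> H.shape, edges[0].size, edges[1].size
- ((4, 6), 5, 7)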
- """
- try:
- # Sample is an ND-array.
- N, D = sample.shape
- except (AttributeError, ValueError):
- # Sample is a sequence of 1D arrays.
- sample = np.atleast_2d(sample).T
- N, D = sample.shape
- nbin = np.empty(D, int)
- edges = D*[None]
- dedges = D*[None]
- if weights is not None:
- weights = np.asarray(weights)
- try:
- M = len(bins)
- if M != D:
- raise ValueError(
- 'The dimension of bins must be equal to the dimension of the '
- 'sample x.')
- except TypeError:
- # bins is an integer
- bins = D*[bins]
- # normalize the range argument
- if range is None:
- range = (None,) * D
- elif len(range) != D:
- raise ValueError('range argument must have one entry per dimension')
- # Create edge arrays
- for i in _range(D):
- if np.ndim(bins[i]) == 0:
- if bins[i] < 1:
- raise ValueError(
- '`bins[{}]` must be positive, when an integer'.format(i))
- smin, smax = _get_outer_edges(sample[:,i], range[i])
- try:
- n = operator.index(bins[i])
-
- except TypeError as e:
- raise TypeError(
- "`bins[{}]` must be an integer, when a scalar".format(i)
- ) from e
-
- edges[i] = np.linspace(smin, smax, n + 1)
- elif np.ndim(bins[i]) == 1:
- edges[i] = np.asarray(bins[i])
- if np.any(edges[i][:-1] > edges[i][1:]):
- raise ValueError(
- '`bins[{}]` must be monotonically increasing, when an array'
- .format(i))
- else:
- raise ValueError(
- '`bins[{}]` must be a scalar or 1d array'.format(i))
- nbin[i] = len(edges[i]) + 1 # includes an outlier on each end
- dedges[i] = np.diff(edges[i])
- # Compute the bin number each sample falls into.
- Ncount = tuple(
- # avoid np.digitize to work around gh-11022
- np.searchsorted(edges[i], sample[:, i], side='right')
- for i in _range(D)
- )
- # As with digitize, values that fall on an edge are put in the right bin.
- # For the rightmost bin, we want values equal to the right edge to be
- # counted in the last bin, and not as an outlier.
- for i in _range(D):
- # Find which points are on the rightmost edge.
- on_edge = (sample[:, i] == edges[i][-1])
- # Shift these points one bin to the left.
- Ncount[i][on_edge] -= 1
- # Compute the sample indices in the flattened histogram matrix.
- # This raises an error if the array is too large.
- xy = np.ravel_multi_index(Ncount, nbin)
- # Compute the number of repetitions in xy and assign it to the
- # flattened histmat.
- hist = np.bincount(xy, weights, minlength=nbin.prod())
- # Shape into a proper matrix
- hist = hist.reshape(nbin)
- # This preserves the (bad) behavior observed in gh-7845, for now.
- hist = hist.astype(float, casting='safe')
- # Remove outliers (indices 0 and -1 for each dimension).
- core = D*(slice(1, -1),)
- hist = hist[core]
- # handle the aliasing normed argument
- if normed is None:
- if density is None:
- density = False
- elif density is None:
- # an explicit normed argument was passed, alias it to the new name
- density = normed
- else:
- raise TypeError("Cannot specify both 'normed' and 'density'")
- if density:
- # calculate the probability density function
- s = hist.sum()
- for i in _range(D):
- shape = np.ones(D, int)
- shape[i] = nbin[i] - 2
- hist = hist / dedges[i].reshape(shape)
- hist /= s
- if (hist.shape != nbin - 2).any():
- raise RuntimeError(
- "Internal Shape Error")
- return hist, edges