Change histogram implementation, use StatsBase, add new histogram styles

New series recipes for binned data:

* barbins
* scatterbins
* stepbins

New series recipes for histogram:

* barhist (histogram is now an alias for this)
* scatterhist
* stephist

Supports plotting 1D and 2D StatsBase histograms, seriestype can be set to
bar(bins), scatter(bins) or step(bins).

Also adds support for some common auto-binning modes:

* :sturges, :auto - Sturges' formula
* :sqrt - Square-root choice
* :rice - Rice Rule
* :scott - Scott's normal reference rule
* :fd - Freedman–Diaconis rule

Maybe these could be contributed to StatsBase at some point.

Error bars currently don't work correctly for scatterbins and scatterhist,
due to a problem with manipulating error bars in a series recipe, but they do
work for "plot(h::StatsBase.Histogram, seriestype = :scatter)" (this works around
the problem by calling scatter directly; it seems that error bars can be
manipulated correctly in a type recipe).
This commit is contained in:
Oliver Schulz 2017-04-29 17:48:11 +02:00
parent e760b42560
commit 1188230641
7 changed files with 232 additions and 78 deletions

View File

@ -7,3 +7,4 @@ Reexport
FixedSizeArrays
Measures
Showoff
StatsBase 0.14.0

View File

@ -9,6 +9,7 @@ using Base.Meta
@reexport using PlotUtils
@reexport using PlotThemes
import Showoff
import StatsBase
export
grid,
@ -148,6 +149,9 @@ end
@shorthands bar
@shorthands barh
@shorthands histogram
@shorthands barhist
@shorthands stephist
@shorthands scatterhist
@shorthands histogram2d
@shorthands density
@shorthands heatmap

View File

@ -21,7 +21,7 @@ const _arg_desc = KW(
:markerstrokewidth => "Number. Width of the marker stroke (border. in pixels)",
:markerstrokecolor => "Color Type. Color of the marker stroke (border). `:match` will take the value from `:foreground_color_subplot`.",
:markerstrokealpha => "Number in [0,1]. The alpha/opacity override for the marker stroke (border). `nothing` (the default) means it will take the alpha value of markerstrokecolor.",
:bins => "Integer, NTuple{2,Integer}, AbstractVector. For histogram-types, defines the number of bins, or the edges, of the histogram.",
:bins => "Integer, NTuple{2,Integer}, AbstractVector or Symbol. For histogram-types, defines the number of bins, or the edges, of the histogram, or the auto-binning algorithm to use (:sturges, :sqrt, :rice, :scott or :fd)",
:smooth => "Bool. Add a regression line?",
:group => "AbstractVector. Data is split into a separate series, one for each unique value in `group`.",
:x => "Various. Input data. First Dimension",
@ -40,7 +40,7 @@ const _arg_desc = KW(
:ribbon => "Number or AbstractVector. Creates a fillrange around the data points.",
:quiver => "AbstractVector or 2-Tuple of vectors. The directional vectors U,V which specify velocity/gradient vectors for a quiver plot.",
:arrow => "nothing (no arrows), Bool (if true, default arrows), Arrow object, or arg(s) that could be style or head length/widths. Defines arrowheads that should be displayed at the end of path line segments (just before a NaN and the last non-NaN point). Used in quiverplot, streamplot, or similar.",
:normalize => "Bool. Should normalize histogram types? Trying for area == 1.",
:normalize => "Bool or Symbol. Histogram normalization mode. Possible values are: false/:none (no normalization, default), true/:pdf (normalize to a PDF with integral of 1) and :density (only normalize in respect to bin sizes).",
:weights => "AbstractVector. Used in histogram types for weighted counts.",
:contours => "Bool. Add contours to the side-grids of 3D plots? Used in surface/wireframe.",
:match_dimensions => "Bool. For heatmap types... should the first dimension of a matrix (rows) correspond to the first dimension of the plot (x-axis)? The default is false, which matches the behavior of Matplotlib, Plotly, and others. Note: when passing a function for z, the function should still map `(x,y) -> z`.",

View File

@ -35,7 +35,9 @@ const _3dTypes = [
]
const _allTypes = vcat([
:none, :line, :path, :steppre, :steppost, :sticks, :scatter,
:heatmap, :hexbin, :histogram, :histogram2d, :histogram3d, :density, :bar, :hline, :vline,
:heatmap, :hexbin, :barbins, :barhist, :histogram, :scatterbins,
:scatterhist, :stepbins, :stephist, :bins2d, :histogram2d, :histogram3d,
:density, :bar, :hline, :vline,
:contour, :pie, :shape, :image
], _3dTypes)
@ -78,7 +80,7 @@ const _typeAliases = Dict{Symbol,Symbol}(
add_non_underscore_aliases!(_typeAliases)
like_histogram(seriestype::Symbol) = seriestype == :histogram
like_histogram(seriestype::Symbol) = seriestype in (:histogram, :barhist, :barbins)
like_line(seriestype::Symbol) = seriestype in (:line, :path, :steppre, :steppost)
like_surface(seriestype::Symbol) = seriestype in (:contour, :contourf, :contour3d, :heatmap, :surface, :wireframe, :image)
@ -1261,7 +1263,7 @@ function _add_defaults!(d::KW, plt::Plot, sp::Subplot, commandIndex::Int)
end
# scatter plots don't have a line, but must have a shape
if d[:seriestype] in (:scatter, :scatter3d)
if d[:seriestype] in (:scatter, :scatterbins, :scatterhist, :scatter3d)
d[:linewidth] = 0
if d[:markershape] == :none
d[:markershape] = :circle

View File

@ -357,7 +357,7 @@ function _expand_subplot_extrema(sp::Subplot, d::KW, st::Symbol)
expand_extrema!(sp[:xaxis], (0,w))
expand_extrema!(sp[:yaxis], (0,h))
sp[:yaxis].d[:flip] = true
elseif !(st in (:pie, :histogram, :histogram2d))
elseif !(st in (:pie, :histogram, :bins2d, :histogram2d))
expand_extrema!(sp, d)
end
end

View File

@ -378,109 +378,256 @@ end
end
@deps bar shape
# ---------------------------------------------------------------------------
# Histograms
# edges from number of bins
function calc_edges(v, bins::Integer)
vmin, vmax = extrema(v)
linspace(vmin, vmax, bins+1)
# Midpoints between consecutive entries of `v` (bin centers when `v` holds bin
# edges). The result has one element fewer than `v`.
function _bin_centers(v::AVec)
    lower = v[1:end-1]
    upper = v[2:end]
    (lower + upper) / 2
end
# Series recipe `:barbins`: draw already-binned data as a bar chart.
# `x` carries the bin edges and `y` the per-bin weights; `z` is not used.
# The series is rewritten to a `:bar` series positioned at the bin centers.
@recipe function f(::Type{Val{:barbins}}, x, y, z)
    edge, weights = x, y
    # Derive the bar widths from the bin widths only when none was supplied.
    if d[:bar_width] === nothing
        bar_width := diff(edge)
    end
    x := _bin_centers(edge)
    y := weights
    seriestype := :bar
    ()
end
# The recipe lowers to `:bar`, so that is the series type it depends on.
@deps barbins bar
# Series recipe `:scatterbins`: draw already-binned data as scatter points.
# `x` carries the bin edges and `y` the per-bin weights; `z` is not used.
# Points are placed at the bin centers, with half the bin width as x error.
@recipe function f(::Type{Val{:scatterbins}}, x, y, z)
    bin_edges = x
    bin_weights = y
    half_widths = diff(bin_edges)/2

    seriestype := :scatter
    x := _bin_centers(bin_edges)
    y := bin_weights
    xerror := half_widths
    ()
end
@deps scatterbins scatter
# Build the outline of a step histogram as one continuous path.
#
# `edge` holds the bin edges and `weights` the bin contents; `edge` must have
# exactly one element more than `weights`, otherwise an error is raised.
#
# Returns `(x, y)`, two vectors of length `2 * length(weights) + 2`. Every edge
# appears twice in `x`; `y` starts and ends at zero, so the outline rises from
# the baseline at the first edge and drops back to it at the last one.
function _stepbins_path(edge, weights)
    nbins = length(weights)
    if length(edge) != nbins + 1
        error("Edge vector must be 1 longer than weight vector")
    end

    baseline = zero(eltype(weights))
    npathpts = 2 * nbins + 2

    # Pre-fill with values of the right element type; every slot of `x` and all
    # but the first and last slot of `y` are overwritten below.
    x = fill(first(edge), npathpts)
    y = fill(baseline, npathpts)

    # The k-th edge (counting from zero) occupies path points 2k+1 and 2k+2.
    k = 0
    for e in edge
        x[2k + 1] = e
        x[2k + 2] = e
        k += 1
    end

    # The k-th weight (counting from zero) is the level between its two edges,
    # i.e. path points 2k+2 and 2k+3; y[1] and y[end] stay at the baseline.
    k = 0
    for w in weights
        y[2k + 2] = w
        y[2k + 3] = w
        k += 1
    end

    (x, y)
end
# just pass through arrays
calc_edges(v, bins::AVec) = bins
@recipe function f(::Type{Val{:stepbins}}, x, y, z)
edge, weights = x, y
# find the bucket index of this value
function bucket_index(vi, edges)
for (i,e) in enumerate(edges)
if vi <= e
return max(1,i-1)
axis = d[:subplot][Plots.isvertical(d) ? :xaxis : :yaxis]
xpts, ypts = _stepbins_path(edge, weights)
if !Plots.isvertical(d)
xpts, ypts = ypts, xpts
end
# create a secondary series for the markers
if d[:markershape] != :none
@series begin
seriestype := :scatter
x := Plots._bin_centers(edge)
y := weights
fillrange := nothing
label := ""
primary := false
()
end
markershape := :none
xerror := :none
yerror := :none
end
x := xpts
y := ypts
seriestype := :path
ylims --> [0, 1.1 * maximum(weights)]
()
end
Plots.@deps stepbins path
function _auto_binning_nbins{N}(vs::NTuple{N,AbstractVector}, dim::Integer; mode::Symbol = :auto)
_cl(x) = max(ceil(Int, x), 1)
_iqr(v) = quantile(v, 0.75) - quantile(v, 0.25)
_span(v) = maximum(v) - minimum(v)
n_samples = length(linearindices(first(vs)))
# Estimator for number of samples in one row/column of bins along each axis:
n = max(1, n_samples^(1/N))
v = vs[dim]
if mode == :sqrt # Square-root choice
_cl(sqrt(n))
elseif mode == :sturges || mode ==:auto # Sturges' formula
_cl(log2(n)) + 1
elseif mode == :rice # Rice Rule
_cl(2 * n^(1/3))
elseif mode == :scott # Scott's normal reference rule
_cl(_span(v) / (3.5 * std(v) / n^(1/3)))
elseif mode == :fd # FreedmanDiaconis rule
_cl(_span(v) / (2 * _iqr(v) / n^(1/3)))
else
error("Unknown auto-binning mode $mode")
end
return length(edges)-1
end
function my_hist(v, bins; normed = false, weights = nothing)
edges = calc_edges(v, bins)
counts = zeros(length(edges)-1)
# Bin edges along dimension `dim` of the sample vectors `vs`; the method is
# chosen by the kind of `binning` given.

# Integer: passed to `StatsBase.histrange` together with the samples and `:left`.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::Integer)
    samples = vs[dim]
    StatsBase.histrange(samples, binning, :left)
end

# Symbol: name of an auto-binning mode; resolved to a bin count first, then
# handled like the integer case.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::Symbol)
    nbins = _auto_binning_nbins(vs, dim, mode = binning)
    _hist_edge(vs, dim, nbins)
end

# Vector: returned unchanged.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::AbstractVector)
    binning
end
# add a weighted count
for (i,vi) in enumerate(v)
idx = bucket_index(vi, edges)
counts[idx] += (weights == nothing ? 1.0 : weights[i])
end
# Bin edges for every dimension of `vs` when `binning` supplies one entry per
# dimension. Returns an `N`-tuple whose `dim`-th element is
# `_hist_edge(vs, dim, binning[dim])`.
# `ntuple` builds the tuple directly instead of mapping over a range splatted
# into a tuple.
_hist_edges{N}(vs::NTuple{N,AbstractVector}, binning::NTuple{N}) =
    ntuple(dim -> _hist_edge(vs, dim, binning[dim]), N)
counts = isapprox(extrema(diff(edges))...) ? counts : counts ./ diff(edges) # for uneven bins, normalize to area.
# Bin edges for every dimension of `vs` when a single `binning` specification
# (bin count, auto-binning mode or edge vector) is shared by all dimensions.
# Returns an `N`-tuple whose `dim`-th element is `_hist_edge(vs, dim, binning)`.
# `ntuple` builds the tuple directly instead of mapping over a range splatted
# into a tuple.
_hist_edges{N}(vs::NTuple{N,AbstractVector}, binning::Union{Integer, Symbol, AbstractVector}) =
    ntuple(dim -> _hist_edge(vs, dim, binning), N)
# normalize by bar area?
norm_denom = normed ? sum(diff(edges) .* counts) : 1.0
if norm_denom == 0
norm_denom = 1.0
end
# Translate the value of the `normalize` attribute into a normalization mode:
# a Symbol is passed through unchanged, `true` becomes `:pdf` and `false`
# becomes `:none`.
function _hist_norm_mode(mode::Symbol)
    return mode
end

function _hist_norm_mode(mode::Bool)
    if mode
        return :pdf
    end
    return :none
end
edges, counts ./ norm_denom
# Fit an `N`-dimensional `StatsBase.Histogram` to the sample vectors `vs`.
#
# `binning` is turned into bin edges by `_hist_edges`. The histogram is fitted
# with `closed = :left`, using `weights` when they are given, converted with
# `float`, and finally passed to `normalize!` with the mode derived from
# `normed` via `_hist_norm_mode`. The result of `normalize!` is returned.
function _make_hist{N}(vs::NTuple{N,AbstractVector}, binning; normed = false, weights = nothing)
    edges = _hist_edges(vs, binning)
    # `===` is an identity check against the `nothing` singleton; it cannot be
    # redirected by an `==` method defined for the type of `weights`.
    h = float( weights === nothing ?
        StatsBase.fit(StatsBase.Histogram, vs, edges, closed = :left) :
        StatsBase.fit(StatsBase.Histogram, vs, weights, edges, closed = :left)
    )
    normalize!(h, mode = _hist_norm_mode(normed))
end
@recipe function f(::Type{Val{:histogram}}, x, y, z)
edges, counts = my_hist(y, d[:bins],
normed = d[:normalize],
weights = d[:weights])
bar_width := diff(edges)
x := centers(edges)
y := counts
seriestype := :bar
seriestype := :barhist
()
end
@deps histogram bar
@deps histogram barhist
# Series recipe `:barhist`: histogram the `y` samples (using the `bins`,
# `normalize` and `weights` attributes) and hand the resulting bin edges and
# bin weights on to the `:barbins` series type.
@recipe function f(::Type{Val{:barhist}}, x, y, z)
    hist = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
    bin_edges = hist.edges[1]
    bin_weights = hist.weights

    seriestype := :barbins
    x := bin_edges
    y := bin_weights
    ()
end
@deps barhist barbins
# Series recipe `:stephist`: histogram the `y` samples (using the `bins`,
# `normalize` and `weights` attributes) and hand the resulting bin edges and
# bin weights on to the `:stepbins` series type.
@recipe function f(::Type{Val{:stephist}}, x, y, z)
    hist = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
    bin_edges = hist.edges[1]
    bin_weights = hist.weights

    seriestype := :stepbins
    x := bin_edges
    y := bin_weights
    ()
end
@deps stephist stepbins
# Series recipe `:scatterhist`: histogram the `y` samples (using the `bins`,
# `normalize` and `weights` attributes) and hand the resulting bin edges and
# bin weights on to the `:scatterbins` series type.
@recipe function f(::Type{Val{:scatterhist}}, x, y, z)
    hist = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
    bin_edges = hist.edges[1]
    bin_weights = hist.weights

    seriestype := :scatterbins
    x := bin_edges
    y := bin_weights
    ()
end
@deps scatterhist scatterbins
# Type recipe for a one-dimensional `StatsBase.Histogram`.
#
# The series type defaults to `:barbins`. The plain names `:bar`, `:scatter`,
# `:step` and `:steppost` are translated to their `*bins` counterparts; any
# other series type is kept as it is.
# Except in the scatter case, the plot data are the bin edges and bin weights.
@recipe function f{T, E}(h::StatsBase.Histogram{T, 1, E})
# Default only; a series type supplied by the caller takes precedence.
seriestype --> :barbins
st_map = Dict(
:bar => :barbins, :scatter => :scatterbins, :step => :stepbins,
:steppost => :stepbins # :step can be mapped to :steppost in pre-processing
)
# Force the translated series type, falling back to the current one if unmapped.
seriestype := get(st_map, d[:seriestype], d[:seriestype])
if d[:seriestype] == :scatterbins
# Workaround, error bars currently not set correctly by scatterbins
# Instead, emit a plain :scatter series at the bin centers, with half the
# bin width as the default x error.
xerror --> diff(h.edges[1])/2
seriestype := :scatter
(Plots._bin_centers(h.edges[1]), h.weights)
else
(h.edges[1], h.weights)
end
end
# Type recipe for a vector of `StatsBase.Histogram`s: each element becomes a
# series of its own, produced by the recipe that applies to a single histogram.
@recipe function f{H <: StatsBase.Histogram}(hv::AbstractVector{H})
    for hist in hv
        @series begin
            hist
        end
    end
end
# ---------------------------------------------------------------------------
# Histogram 2D
# if tuple, map out bins, otherwise use the same for both
calc_edges_2d(x, y, bins) = calc_edges(x, bins), calc_edges(y, bins)
calc_edges_2d{X,Y}(x, y, bins::Tuple{X,Y}) = calc_edges(x, bins[1]), calc_edges(y, bins[2])
@recipe function f(::Type{Val{:bins2d}}, x, y, z)
edge_x, edge_y, weights = x, y, z.surf
# the 2D version
function my_hist_2d(x, y, bins; normed = false, weights = nothing)
xedges, yedges = calc_edges_2d(x, y, bins)
counts = zeros(length(yedges)-1, length(xedges)-1)
# add a weighted count
for i=1:length(x)
r = bucket_index(y[i], yedges)
c = bucket_index(x[i], xedges)
counts[r,c] += (weights == nothing ? 1.0 : weights[i])
float_weights = float(weights)
if is(float_weights, weights)
float_weights = deepcopy(float_weights)
end
# normalize to cubic area of the imaginary surface towers
norm_denom = normed ? sum((diff(yedges) * diff(xedges)') .* counts) : 1.0
if norm_denom == 0
norm_denom = 1.0
end
xedges, yedges, counts ./ norm_denom
end
centers(v::AVec) = 0.5 * (v[1:end-1] + v[2:end])
@recipe function f(::Type{Val{:histogram2d}}, x, y, z)
xedges, yedges, counts = my_hist_2d(x, y, d[:bins],
normed = d[:normalize],
weights = d[:weights])
for (i,c) in enumerate(counts)
for (i, c) in enumerate(float_weights)
if c == 0
counts[i] = NaN
float_weights[i] = NaN
end
end
x := centers(xedges)
y := centers(yedges)
z := Surface(counts)
linewidth := 0
x := Plots._bin_centers(edge_x)
y := Plots._bin_centers(edge_y)
z := Surface(float_weights)
match_dimensions := true
seriestype := :heatmap
()
end
@deps histogram2d heatmap
Plots.@deps bins2d heatmap
# Series recipe `:histogram2d`: histogram the `(x, y)` sample pairs (using the
# `bins`, `normalize` and `weights` attributes) and hand the two edge vectors
# and the weight matrix, wrapped in a `Surface`, on to the `:bins2d` series type.
@recipe function f(::Type{Val{:histogram2d}}, x, y, z)
    hist = _make_hist((x, y), d[:bins], normed = d[:normalize], weights = d[:weights])
    edges_x = hist.edges[1]
    edges_y = hist.edges[2]
    bin_weights = Surface(hist.weights)

    seriestype := :bins2d
    x := edges_x
    y := edges_y
    z := bin_weights
    ()
end
@deps histogram2d bins2d
# Type recipe for a two-dimensional `StatsBase.Histogram`: defaults the series
# type to `:bins2d` and supplies the two edge vectors plus the weights wrapped
# in a `Surface` as plot data.
@recipe function f{T, E}(h::StatsBase.Histogram{T, 2, E})
    seriestype --> :bins2d
    edges_x = h.edges[1]
    edges_y = h.edges[2]
    (edges_x, edges_y, Surface(h.weights))
end
# ---------------------------------------------------------------------------

View File

@ -39,7 +39,7 @@ series_list(sp::Subplot) = sp.series_list # filter(series -> series.d[:subplot]
function should_add_to_legend(series::Series)
series.d[:primary] && series.d[:label] != "" &&
!(series.d[:seriestype] in (
:hexbin,:histogram2d,:hline,:vline,
:hexbin,:bins2d,:histogram2d,:hline,:vline,
:contour,:contourf,:contour3d,:surface,:wireframe,
:heatmap, :pie, :image
))