Merge pull request #787 from oschulz/new-hist-impl

Rewrite of histogram recipes, using StatsBase.histogram
This commit is contained in:
Michael Krabbe Borregaard 2017-05-04 12:33:42 +02:00 committed by GitHub
commit e41022c7be
7 changed files with 324 additions and 88 deletions

View File

@ -7,3 +7,4 @@ Reexport
FixedSizeArrays
Measures
Showoff
StatsBase 0.14.0

View File

@ -9,6 +9,7 @@ using Base.Meta
@reexport using PlotUtils
@reexport using PlotThemes
import Showoff
import StatsBase
export
grid,
@ -148,6 +149,9 @@ end
@shorthands bar
@shorthands barh
@shorthands histogram
@shorthands barhist
@shorthands stephist
@shorthands scatterhist
@shorthands histogram2d
@shorthands density
@shorthands heatmap

View File

@ -21,7 +21,7 @@ const _arg_desc = KW(
:markerstrokewidth => "Number. Width of the marker stroke (border. in pixels)",
:markerstrokecolor => "Color Type. Color of the marker stroke (border). `:match` will take the value from `:foreground_color_subplot`.",
:markerstrokealpha => "Number in [0,1]. The alpha/opacity override for the marker stroke (border). `nothing` (the default) means it will take the alpha value of markerstrokecolor.",
:bins => "Integer, NTuple{2,Integer}, AbstractVector. For histogram-types, defines the number of bins, or the edges, of the histogram.",
:bins => "Integer, NTuple{2,Integer}, AbstractVector or Symbol. For histogram-types, defines the number of bins, or the edges, of the histogram, or the auto-binning algorithm to use (:sturges, :sqrt, :rice, :scott or :fd)",
:smooth => "Bool. Add a regression line?",
:group => "AbstractVector. Data is split into a separate series, one for each unique value in `group`.",
:x => "Various. Input data. First Dimension",
@ -40,7 +40,7 @@ const _arg_desc = KW(
:ribbon => "Number or AbstractVector. Creates a fillrange around the data points.",
:quiver => "AbstractVector or 2-Tuple of vectors. The directional vectors U,V which specify velocity/gradient vectors for a quiver plot.",
:arrow => "nothing (no arrows), Bool (if true, default arrows), Arrow object, or arg(s) that could be style or head length/widths. Defines arrowheads that should be displayed at the end of path line segments (just before a NaN and the last non-NaN point). Used in quiverplot, streamplot, or similar.",
:normalize => "Bool. Should normalize histogram types? Trying for area == 1.",
:normalize => "Bool or Symbol. Histogram normalization mode. Possible values are: false/:none (no normalization, default), true/:pdf (normalize to a PDF with integral of 1) and :density (only normalize in respect to bin sizes).",
:weights => "AbstractVector. Used in histogram types for weighted counts.",
:contours => "Bool. Add contours to the side-grids of 3D plots? Used in surface/wireframe.",
:match_dimensions => "Bool. For heatmap types... should the first dimension of a matrix (rows) correspond to the first dimension of the plot (x-axis)? The default is false, which matches the behavior of Matplotlib, Plotly, and others. Note: when passing a function for z, the function should still map `(x,y) -> z`.",

View File

@ -35,7 +35,9 @@ const _3dTypes = [
]
const _allTypes = vcat([
:none, :line, :path, :steppre, :steppost, :sticks, :scatter,
:heatmap, :hexbin, :histogram, :histogram2d, :histogram3d, :density, :bar, :hline, :vline,
:heatmap, :hexbin, :barbins, :barhist, :histogram, :scatterbins,
:scatterhist, :stepbins, :stephist, :bins2d, :histogram2d, :histogram3d,
:density, :bar, :hline, :vline,
:contour, :pie, :shape, :image
], _3dTypes)
@ -78,7 +80,7 @@ const _typeAliases = Dict{Symbol,Symbol}(
add_non_underscore_aliases!(_typeAliases)
like_histogram(seriestype::Symbol) = seriestype == :histogram
like_histogram(seriestype::Symbol) = seriestype in (:histogram, :barhist, :barbins)
# True when `seriestype` is one of :line, :path, :steppre or :steppost.
function like_line(seriestype::Symbol)
    return seriestype in (:line, :path, :steppre, :steppost)
end
# True when `seriestype` is one of the series types listed below
# (:contour, :contourf, :contour3d, :heatmap, :surface, :wireframe, :image).
function like_surface(seriestype::Symbol)
    surface_types = (:contour, :contourf, :contour3d, :heatmap, :surface, :wireframe, :image)
    return seriestype in surface_types
end
@ -154,6 +156,8 @@ const _markerAliases = Dict{Symbol,Symbol}(
)
const _allScales = [:identity, :ln, :log2, :log10, :asinh, :sqrt]
const _logScales = [:ln, :log2, :log10]
const _logScaleBases = Dict(:ln => e, :log2 => 2.0, :log10 => 10.0)
const _scaleAliases = Dict{Symbol,Symbol}(
:none => :identity,
:log => :log10,
@ -1261,7 +1265,7 @@ function _add_defaults!(d::KW, plt::Plot, sp::Subplot, commandIndex::Int)
end
# scatter plots don't have a line, but must have a shape
if d[:seriestype] in (:scatter, :scatter3d)
if d[:seriestype] in (:scatter, :scatterbins, :scatterhist, :scatter3d)
d[:linewidth] = 0
if d[:markershape] == :none
d[:markershape] = :circle

View File

@ -277,6 +277,13 @@ function _subplot_setup(plt::Plot, d::KW, kw_list::Vector{KW})
attr[Symbol(letter,k)] = v
end
end
for k in (:scale,), letter in (:x,:y,:z)
# Series recipes may need access to this information
lk = Symbol(letter,k)
if haskey(attr, lk)
kw[lk] = attr[lk]
end
end
end
sp_attrs[sp] = attr
end
@ -357,7 +364,7 @@ function _expand_subplot_extrema(sp::Subplot, d::KW, st::Symbol)
expand_extrema!(sp[:xaxis], (0,w))
expand_extrema!(sp[:yaxis], (0,h))
sp[:yaxis].d[:flip] = true
elseif !(st in (:pie, :histogram, :histogram2d))
elseif !(st in (:pie, :histogram, :bins2d, :histogram2d))
expand_extrema!(sp, d)
end
end

View File

@ -323,10 +323,11 @@ end
# create a bar plot as a filled step function
@recipe function f(::Type{Val{:bar}}, x, y, z)
nx, ny = length(x), length(y)
procx, procy, xscale, yscale, baseline = _preprocess_binlike(d, x, y)
nx, ny = length(procx), length(procy)
axis = d[:subplot][isvertical(d) ? :xaxis : :yaxis]
cv = [discrete_value!(axis, xi)[1] for xi=x]
x = if nx == ny
cv = [discrete_value!(axis, xi)[1] for xi=procx]
procx = if nx == ny
cv
elseif nx == ny + 1
0.5diff(cv) + cv[1:end-1]
@ -337,9 +338,9 @@ end
# compute half-width of bars
bw = d[:bar_width]
hw = if bw == nothing
0.5mean(diff(x))
0.5mean(diff(procx))
else
Float64[0.5cycle(bw,i) for i=1:length(x)]
Float64[0.5cycle(bw,i) for i=1:length(procx)]
end
# make fillto a vector... default fills to 0
@ -347,16 +348,21 @@ end
if fillto == nothing
fillto = 0
end
if (yscale in _logScales) && !all(_is_positive, fillto)
fillto = map(x -> _is_positive(x) ? typeof(baseline)(x) : baseline, fillto)
end
# create the bar shapes by adding x/y segments
xseg, yseg = Segments(), Segments()
for i=1:ny
center = x[i]
hwi = cycle(hw,i)
yi = y[i]
fi = cycle(fillto,i)
push!(xseg, center-hwi, center-hwi, center+hwi, center+hwi, center-hwi)
push!(yseg, yi, fi, fi, yi, yi)
yi = procy[i]
if !isnan(yi)
center = procx[i]
hwi = cycle(hw,i)
fi = cycle(fillto,i)
push!(xseg, center-hwi, center-hwi, center+hwi, center+hwi, center-hwi)
push!(yseg, yi, fi, fi, yi, yi)
end
end
# widen limits out a bit
@ -378,109 +384,323 @@ end
end
@deps bar shape
# ---------------------------------------------------------------------------
# Histograms
# edges from number of bins
function calc_edges(v, bins::Integer)
vmin, vmax = extrema(v)
linspace(vmin, vmax, bins+1)
# Midpoints between consecutive entries of `v`; the result has one element
# fewer than `v`.
function _bin_centers(v::AVec)
    lower = v[1:end-1]
    upper = v[2:end]
    return (lower + upper) / 2
end
# True when `x` is strictly greater than zero and not approximately equal to
# zero (`≈` is `isapprox` with its default tolerances).
_is_positive(x) = (x > 0) && !(x ≈ 0)
# Convert `x` to type `T` when `_is_positive(x)` holds, otherwise return `T(NaN)`.
function _positive_else_nan{T}(::Type{T}, x::Real)
    if _is_positive(x)
        return T(x)
    end
    return T(NaN)
end
# Convert every element of `V` to the float type `T`.
# For the scales listed in `_logScales`, elements that are not positive are
# replaced by `T(NaN)` (see `_positive_else_nan`); for any other scale the
# values are converted unchanged.
function _scale_adjusted_values{T<:AbstractFloat}(::Type{T}, V::AbstractVector, scale::Symbol)
    if !(scale in _logScales)
        return [T(x) for x in V]
    end
    return [_positive_else_nan(T, x) for x in V]
end
# just pass through arrays
calc_edges(v, bins::AVec) = bins
# Lower y-limit for histogram-like series.
# On a scale listed in `_logScales` this is `ymin` divided by
# `base^log10(2)`, where `base` comes from `_logScaleBases`; on every other
# scale it is zero.
function _hist_ylim_lo{T<:Real}(ymin::T, yscale::Symbol)
    is_log = yscale in _logScales
    return is_log ? ymin / T(_logScaleBases[yscale]^log10(2)) : zero(T)
end
# find the bucket index of this value
function bucket_index(vi, edges)
for (i,e) in enumerate(edges)
if vi <= e
return max(1,i-1)
# Upper y-limit for histogram-like series.
# On a scale listed in `_logScales` this is `ymax` multiplied by
# `base^log10(2)`, where `base` comes from `_logScaleBases`; on every other
# scale it is `ymax` scaled by 1.1.
function _hist_ylim_hi{T<:Real}(ymax::T, yscale::Symbol)
    is_log = yscale in _logScales
    return is_log ? ymax * T(_logScaleBases[yscale]^log10(2)) : ymax * T(1.1)
end
# Shared preprocessing for the bin-based series recipes.
#
# Reads `:xscale` / `:yscale` from `d` (defaulting to `:identity`), converts
# `x` and `y` to a common floating-point element type, adjusts the weights
# for the y scale via `_scale_adjusted_values`, and derives a baseline from
# the smallest weight with `_hist_ylim_lo`. When that minimum is NaN, one is
# used in its place.
#
# Returns the tuple `(edge, weights, xscale, yscale, baseline)`.
function _preprocess_binlike(d, x, y)
    xscale, yscale = get(d, :xscale, :identity), get(d, :yscale, :identity)

    T = float(promote_type(eltype(x), eltype(y)))
    edge = map(T, x)
    weights = _scale_adjusted_values(T, y, yscale)

    smallest = minimum(weights)
    if isnan(smallest)
        smallest = one(T)
    end
    baseline = _hist_ylim_lo(smallest, yscale)

    return edge, weights, xscale, yscale, baseline
end
# Series recipe for `:barbins`: `x` is treated as bin edges and `y` as the
# per-bin weights. The data is preprocessed with `_preprocess_binlike`,
# converted to bin centers and handed on to the `:bar` series type.
@recipe function f(::Type{Val{:barbins}}, x, y, z)
edge, weights, xscale, yscale, baseline = _preprocess_binlike(d, x, y)
# Only when no bar width was supplied: make every bar as wide as its bin.
if (d[:bar_width] == nothing)
bar_width := diff(edge)
end
x := _bin_centers(edge)
y := weights
seriestype := :bar
()
end
@deps barbins bar
# Series recipe for `:scatterbins`: `x` is treated as bin edges and `y` as the
# per-bin weights. Points are placed at the bin centers and the series is
# handed on to the `:scatter` series type.
@recipe function f(::Type{Val{:scatterbins}}, x, y, z)
edge, weights, xscale, yscale, baseline = _preprocess_binlike(d, x, y)
# Horizontal error bars are set to half of each bin's width.
xerror := diff(edge)/2
x := _bin_centers(edge)
y := weights
seriestype := :scatter
()
end
@deps scatterbins scatter
# Build the x/y coordinates of a step-style outline for binned data.
#
# Arguments:
#   edge     - bin edges; must contain exactly one more element than `weights`
#   weights  - one value per bin; NaN marks a bin that is left out of the path
#   baseline - y value the outline drops to before and after runs of
#              non-NaN bins
#   xscale   - x axis scale; when it is listed in `_logScales`, a left edge
#              that is approximately zero is replaced by `b / base^3`, where
#              `b` is the right edge of that bin
#   yscale   - accepted for interface compatibility; not used here
#
# Returns the tuple `(x, y)` of coordinate vectors.
# Raises an error when `edge` is not exactly one element longer than `weights`.
function _stepbins_path(edge, weights, baseline::Real, xscale::Symbol, yscale::Symbol)
    log_scale_x = xscale in _logScales

    nbins = length(linearindices(weights))
    if length(linearindices(edge)) != nbins + 1
        error("Edge vector must be 1 longer than weight vector")
    end

    x = eltype(edge)[]
    y = eltype(weights)[]

    it_e, it_w = start(edge), start(weights)
    a, it_e = next(edge, it_e)

    last_w = eltype(weights)(NaN)
    # Stop as soon as either the edges or the weights are exhausted.
    while !done(edge, it_e) && !done(weights, it_w)
        b, it_e = next(edge, it_e)
        w, it_w = next(weights, it_w)

        if (log_scale_x && a ≈ 0)
            a = b/_logScaleBases[xscale]^3
        end

        if isnan(w)
            # Leaving a run of drawn bins: drop back to the baseline.
            if !isnan(last_w)
                push!(x, a)
                push!(y, baseline)
            end
        else
            # Entering a run of drawn bins: start from the baseline.
            if isnan(last_w)
                push!(x, a)
                push!(y, baseline)
            end
            push!(x, a)
            push!(y, w)
            push!(x, b)
            push!(y, w)
        end

        a = b
        last_w = w
    end

    # Close the outline at the last edge unless it already sits on the baseline.
    if (last_w != baseline)
        push!(x, a)
        push!(y, baseline)
    end

    (x, y)
end
function my_hist(v, bins; normed = false, weights = nothing)
edges = calc_edges(v, bins)
counts = zeros(length(edges)-1)
# add a weighted count
for (i,vi) in enumerate(v)
idx = bucket_index(vi, edges)
counts[idx] += (weights == nothing ? 1.0 : weights[i])
# Series recipe for `:stepbins`: `x` is treated as bin edges and `y` as the
# per-bin weights. The outline computed by `_stepbins_path` is handed on to
# the `:path` series type. When a marker shape is set, the markers are drawn
# by a separate, non-primary `:scatter` series at the bin centers.
@recipe function f(::Type{Val{:stepbins}}, x, y, z)
    # NOTE(review): `axis` is not used anywhere below.
    axis = d[:subplot][Plots.isvertical(d) ? :xaxis : :yaxis]

    edge, weights, xscale, yscale, baseline = _preprocess_binlike(d, x, y)

    xpts, ypts = _stepbins_path(edge, weights, baseline, xscale, yscale)
    if !isvertical(d)
        # Swap the coordinates when the series is not vertical.
        xpts, ypts = ypts, xpts
    end

    # create a secondary series for the markers
    if d[:markershape] != :none
        @series begin
            seriestype := :scatter
            x := _bin_centers(edge)
            y := weights
            fillrange := nothing
            label := ""
            primary := false
            ()
        end
        # The path itself carries neither markers nor error bars.
        markershape := :none
        xerror := :none
        yerror := :none
    end

    x := xpts
    y := ypts
    seriestype := :path
    ylims --> [baseline, _hist_ylim_hi(maximum(weights), yscale)]
    ()
end
Plots.@deps stepbins path
# Choose a number of bins along dimension `dim` of the sample vectors `vs`.
#
# `mode` selects the rule: :sqrt, :sturges (also used for :auto), :rice,
# :scott or :fd. Every rule is evaluated on `n`, the N-th root of the sample
# count (at least 1), and the result is rounded up to an integer no smaller
# than one. Any other `mode` raises an error.
function _auto_binning_nbins{N}(vs::NTuple{N,AbstractVector}, dim::Integer; mode::Symbol = :auto)
    at_least_one(x) = max(ceil(Int, x), 1)

    n_samples = length(linearindices(first(vs)))
    # Estimator for number of samples in one row/column of bins along each axis:
    n = max(1, n_samples^(1/N))
    data = vs[dim]

    if mode == :sqrt
        # Square-root choice
        return at_least_one(sqrt(n))
    elseif mode == :sturges || mode == :auto
        # Sturges' formula
        return at_least_one(log2(n)) + 1
    elseif mode == :rice
        # Rice Rule
        return at_least_one(2 * n^(1/3))
    elseif mode == :scott
        # Scott's normal reference rule
        span = maximum(data) - minimum(data)
        return at_least_one(span / (3.5 * std(data) / n^(1/3)))
    elseif mode == :fd
        # Freedman–Diaconis rule
        span = maximum(data) - minimum(data)
        interquartile = quantile(data, 0.75) - quantile(data, 0.25)
        return at_least_one(span / (2 * interquartile / n^(1/3)))
    end
    error("Unknown auto-binning mode $mode")
end
# Bin edges along dimension `dim` for an explicit number of bins, obtained
# from `StatsBase.histrange` with left-closed bins.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::Integer)
    return StatsBase.histrange(vs[dim], binning, :left)
end

# Bin edges along dimension `dim` for a named auto-binning rule: the rule is
# first turned into a bin count by `_auto_binning_nbins`.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::Symbol)
    nbins = _auto_binning_nbins(vs, dim, mode = binning)
    return _hist_edge(vs, dim, nbins)
end

# A vector is taken to be the edges themselves and returned unchanged.
function _hist_edge{N}(vs::NTuple{N,AbstractVector}, dim::Integer, binning::AbstractVector)
    return binning
end

# Edges for all N dimensions, with one binning specification per dimension.
function _hist_edges{N}(vs::NTuple{N,AbstractVector}, binning::NTuple{N})
    return ntuple(dim -> _hist_edge(vs, dim, binning[dim]), N)
end

# Edges for all N dimensions, applying the same binning specification to each.
function _hist_edges{N}(vs::NTuple{N,AbstractVector}, binning::Union{Integer, Symbol, AbstractVector})
    return ntuple(dim -> _hist_edge(vs, dim, binning), N)
end
# A Symbol is already a normalization mode and is returned unchanged.
function _hist_norm_mode(mode::Symbol)
    return mode
end

# A Bool maps `true` to :pdf and `false` to :none.
function _hist_norm_mode(mode::Bool)
    if mode
        return :pdf
    end
    return :none
end
# Fit an N-dimensional `StatsBase.Histogram` to the sample vectors `vs`.
#
# `binning` is resolved to bin edges by `_hist_edges`. When `weights` is
# given it is passed on to `StatsBase.fit`. The histogram is converted with
# `float` and then normalized in place using the mode derived from `normed`
# (see `_hist_norm_mode`). Bins are closed on the left.
function _make_hist{N}(vs::NTuple{N,AbstractVector}, binning; normed = false, weights = nothing)
    edges = _hist_edges(vs, binning)
    fitted = if weights == nothing
        StatsBase.fit(StatsBase.Histogram, vs, edges, closed = :left)
    else
        StatsBase.fit(StatsBase.Histogram, vs, weights, edges, closed = :left)
    end
    h = float(fitted)
    return normalize!(h, mode = _hist_norm_mode(normed))
end
# Series recipe for `:histogram`: passes the series on unchanged to the
# `:barhist` series type.
@recipe function f(::Type{Val{:histogram}}, x, y, z)
    seriestype := :barhist
    ()
end
@deps histogram barhist
# Series recipe for `:barhist`: builds a one-dimensional histogram of `y` with
# `_make_hist`, using the `:bins`, `:normalize` and `:weights` attributes, and
# hands its edges and weights on to the `:barbins` series type.
@recipe function f(::Type{Val{:barhist}}, x, y, z)
h = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
x := h.edges[1]
y := h.weights
seriestype := :barbins
()
end
@deps barhist barbins
# Series recipe for `:stephist`: builds a one-dimensional histogram of `y` with
# `_make_hist`, using the `:bins`, `:normalize` and `:weights` attributes, and
# hands its edges and weights on to the `:stepbins` series type.
@recipe function f(::Type{Val{:stephist}}, x, y, z)
h = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
x := h.edges[1]
y := h.weights
seriestype := :stepbins
()
end
@deps stephist stepbins
# Series recipe for `:scatterhist`: builds a one-dimensional histogram of `y`
# with `_make_hist`, using the `:bins`, `:normalize` and `:weights` attributes,
# and hands its edges and weights on to the `:scatterbins` series type.
@recipe function f(::Type{Val{:scatterhist}}, x, y, z)
h = _make_hist((y,), d[:bins], normed = d[:normalize], weights = d[:weights])
x := h.edges[1]
y := h.weights
seriestype := :scatterbins
()
end
@deps scatterhist scatterbins
# Type recipe for one-dimensional `StatsBase.Histogram` objects.
#
# The series type defaults to :barbins. The generic types :bar, :scatter,
# :step and :steppost are translated to their bin-based counterparts; any
# other series type is kept as is. Apart from the :scatterbins workaround
# below, the recipe returns the histogram's edges and weights.
@recipe function f{T, E}(h::StatsBase.Histogram{T, 1, E})
    seriestype --> :barbins

    st_map = Dict(
        :bar => :barbins, :scatter => :scatterbins, :step => :stepbins,
        :steppost => :stepbins # :step can be mapped to :steppost in pre-processing
    )
    seriestype := get(st_map, d[:seriestype], d[:seriestype])

    if d[:seriestype] == :scatterbins
        # Workaround, error bars currently not set correctly by scatterbins
        edge, weights, xscale, yscale, baseline = _preprocess_binlike(d, h.edges[1], h.weights)
        xerror --> diff(h.edges[1])/2
        seriestype := :scatter
        (Plots._bin_centers(edge), weights)
    else
        (h.edges[1], h.weights)
    end
end
# Type recipe for a vector of `StatsBase.Histogram` objects: emits one series
# per histogram in `hv`, each consisting of the histogram itself.
@recipe function f{H <: StatsBase.Histogram}(hv::AbstractVector{H})
for h in hv
@series begin
h
end
end
end
# ---------------------------------------------------------------------------
# Histogram 2D
# if tuple, map out bins, otherwise use the same for both
calc_edges_2d(x, y, bins) = calc_edges(x, bins), calc_edges(y, bins)
calc_edges_2d{X,Y}(x, y, bins::Tuple{X,Y}) = calc_edges(x, bins[1]), calc_edges(y, bins[2])
@recipe function f(::Type{Val{:bins2d}}, x, y, z)
edge_x, edge_y, weights = x, y, z.surf
# the 2D version
function my_hist_2d(x, y, bins; normed = false, weights = nothing)
xedges, yedges = calc_edges_2d(x, y, bins)
counts = zeros(length(yedges)-1, length(xedges)-1)
# add a weighted count
for i=1:length(x)
r = bucket_index(y[i], yedges)
c = bucket_index(x[i], xedges)
counts[r,c] += (weights == nothing ? 1.0 : weights[i])
float_weights = float(weights)
if is(float_weights, weights)
float_weights = deepcopy(float_weights)
end
# normalize to cubic area of the imaginary surface towers
norm_denom = normed ? sum((diff(yedges) * diff(xedges)') .* counts) : 1.0
if norm_denom == 0
norm_denom = 1.0
end
xedges, yedges, counts ./ norm_denom
end
centers(v::AVec) = 0.5 * (v[1:end-1] + v[2:end])
@recipe function f(::Type{Val{:histogram2d}}, x, y, z)
xedges, yedges, counts = my_hist_2d(x, y, d[:bins],
normed = d[:normalize],
weights = d[:weights])
for (i,c) in enumerate(counts)
for (i, c) in enumerate(float_weights)
if c == 0
counts[i] = NaN
float_weights[i] = NaN
end
end
x := centers(xedges)
y := centers(yedges)
z := Surface(counts)
linewidth := 0
x := Plots._bin_centers(edge_x)
y := Plots._bin_centers(edge_y)
z := Surface(float_weights)
match_dimensions := true
seriestype := :heatmap
()
end
@deps histogram2d heatmap
Plots.@deps bins2d heatmap
# Series recipe for `:histogram2d`: builds a two-dimensional histogram of
# `(x, y)` with `_make_hist`, using the `:bins`, `:normalize` and `:weights`
# attributes, and hands the edges of both dimensions plus the weights
# (wrapped in `Surface`) on to the `:bins2d` series type.
@recipe function f(::Type{Val{:histogram2d}}, x, y, z)
h = _make_hist((x, y), d[:bins], normed = d[:normalize], weights = d[:weights])
x := h.edges[1]
y := h.edges[2]
z := Surface(h.weights)
seriestype := :bins2d
()
end
@deps histogram2d bins2d
# Type recipe for two-dimensional `StatsBase.Histogram` objects: the series
# type defaults to :bins2d, and the recipe returns the edges of both
# dimensions together with the weights wrapped in `Surface`.
@recipe function f{T, E}(h::StatsBase.Histogram{T, 2, E})
seriestype --> :bins2d
(h.edges[1], h.edges[2], Surface(h.weights))
end
# ---------------------------------------------------------------------------

View File

@ -39,7 +39,7 @@ series_list(sp::Subplot) = sp.series_list # filter(series -> series.d[:subplot]
function should_add_to_legend(series::Series)
series.d[:primary] && series.d[:label] != "" &&
!(series.d[:seriestype] in (
:hexbin,:histogram2d,:hline,:vline,
:hexbin,:bins2d,:histogram2d,:hline,:vline,
:contour,:contourf,:contour3d,:surface,:wireframe,
:heatmap, :pie, :image
))