src/hyper.erl

%% @doc `hyper' is the reference implementation of hyperloglog for erlang
%%
%% Hyperloglog is an algorithm based on a probabilistic datastructure to solve
%% the count-distinct problem. It allows an approximation of the distinct
%% element of a large collection, with a limited memory usage and the ability
%% to be distributed and merged losslessly.
%%
%% This module is the interface you use to interact with the hyperloglog. `hyper'
%% accept multiple backend for the implementation of hyperloglog, each with a
%% different set of tradeoff and performance.
%%
%% For more details on how to use it, look at {@link new/3}, {@link insert/2}
%% and {@link card/1}
%%
%% == How does it work ==
%%
%% Without entering in all the details, the hyperloglog datastructure is based
%% on a limited set of elements. Details of how these are implemented will vary
%% between different backend and implementations, but the high level idea stay
%% the same.
%% - A hash function that output a 64bit hash
%% - A set of numbered "bins", controlled by the precision
%% - An estimator function
%%
%% For every element to insert, the element is hashed into a binary N, then
%% the leftmost P bits are used to define which bin the element belong. In this
%% bin, we store the number of zeroes before the first 1 in the leftmost bit of
%% the leftover, after truncating the leftmost N bit. If this number is bigger
%% than the one currently in the bin, we update it. Otherwise we do nothing. Our
%% input is inserted.
%%
%% Later, when we want to get an estimation of the cardinality, we apply the
%% estimator function to the set of bins. The details of the estimator are too
%% complex for this documentation, but suffice to say that they are proven to
%% reduce the estimation error to stay under a tight boundary.
%%
%% == References ==
%% For more details, here is a list of reference papers that have been used to
%% implement this library.
%%
%% <ul>
%% <li><a href="http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/en//pubs/archive/40671.pdf">HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm</a></li>
%% <li><a href="https://arxiv.org/abs/1706.07290"> New Cardinality Estimation Methods for HyperLogLog Sketches]</a></li>
%%</ul>
%%  In particular huge thanks to Omar Ertl for his clear work on this domain.
-module(hyper).

-export([new/1, new/2, new/3, insert/2, insert_many/2]).
-export([union/1, union/2]).
-export([card/1, intersect_card/2]).
-export([to_json/1, from_json/1, from_json/2, precision/1, bytes/1, is_hyper/1]).
-export([compact/1, reduce_precision/2]).
-export([generate_unique/1]).

-type precision() :: 4..16.
-type registers() :: any().
-type version() :: atom().

-record(hyper, {v :: version(), registers :: {module(), registers()}}).

-type value() :: binary().
-type filter() :: #hyper{}.

-export_type([filter/0, precision/0, registers/0]).

-define(DEFAULT_BACKEND, hyper_binary).
-define(DEFAULT_VERSION, 'sha-v1').

% constant for 0.5/ln(2)
-define(HLL_ALPHA_INF, 0.721347520444481703680).

%%
%% API
%%

%% @doc Create a new Hyperloglog structure
%% Equivalent to `hyper:new(P, hyper_binary, 'sha-v1')'
%%
%% see {@link hyper:new/3}
-spec new(precision()) -> filter().
new(P) ->
    new(P, ?DEFAULT_BACKEND, ?DEFAULT_VERSION).

%% @doc Create a new Hyperloglog structure with the specified backend
%% Equivalent to `hyper:new(P, Mod, 'sha-v1')'
%%
%% see {@link hyper:new/3}
-spec new(precision(), module()) -> filter().
new(P, Mod) ->
    new(P, Mod, ?DEFAULT_VERSION).

%% @doc Create a new Hyperloglog structure with the specified backend and the
%% specified version.
%%
%% `hyper' ship with one backend:
%%  - {@link hyper_binary}
%%
%% For implementing your own backend, see {@link hyper_register}
%%
%% %% hyper structure with different version will work transparently for now,
%% but will generate deprecation warning in the future if we change some of the
%% implementation details.
%%
%% You should use {@link new/2} most of the time, so that the versionning is
%% handled by `hyper'. You should only use `new/3' if  you need to do your own
%% versionning for custom backends or for custom versionning problems.
-spec new(precision(), module(), version()) -> filter().
new(P, Mod, Version) when 4 =< P andalso P =< 16 andalso is_atom(Mod) ->
    #hyper{v = Version, registers = {Mod, hyper_register:new(Mod, P)}}.

%% @doc return `true' if the structure passed is a `hyper' structure. `false'
%% otherwise
-spec is_hyper(filter()) -> boolean().
is_hyper(#hyper{}) ->
    true;
is_hyper(_) ->
    false.

%% @doc Insert `Value' inside the Hyper structure passed.
-spec insert(value(), filter()) -> filter().
insert(Value, #hyper{registers = {Mod, Registers}} = Hyper) when
    is_binary(Value)
->
    P = hyper_register:precision(Mod, Registers),
    Hash = crypto:hash(sha, Value),
    <<Index:P, RegisterValue:(64 - P)/bitstring, _/bitstring>> = Hash,

    %% Registers are only allowed to increase, implement by backend
    Hyper#hyper{registers = {Mod, hyper_register:set(Mod, Index, RegisterValue, Registers)}};
insert(_Value, _Hyper) ->
    error(badarg).

%% @doc insert a list of values in the Hyper structure.
-spec insert_many([value()], filter()) -> filter().
insert_many(L, Hyper) ->
    lists:foldl(fun insert/2, Hyper, L).

%% Merge a list of hyperloglog together
%% see {@link union/2} for details and implications.
-spec union([filter()]) -> filter().
union(Filters) when is_list(Filters) ->
    case
        lists:usort(
            lists:map(
                fun(#hyper{registers = {Mod, Registers}}) ->
                    {hyper_register:precision(Mod, Registers), Mod}
                end,
                Filters
            )
        )
    of
        %% same P and backend
        [{_P, Mod}] ->
            Registers = lists:map(
                fun(#hyper{registers = {_, R}}) ->
                    R
                end,
                Filters
            ),
            [First | _] = Filters,
            First#hyper{registers = {Mod, hyper_register:max_merge(Mod, Registers)}};
        %% mixed P, but still must have same backend
        [{MinP, Mod} | _] ->
            FoldedFilters = lists:map(
                fun(#hyper{registers = {M, _}} = F) when M =:= Mod ->
                    hyper:reduce_precision(MinP, F)
                end,
                Filters
            ),
            union(FoldedFilters)
    end.

%% Merge two `hyper' structure together.
%%
%% The two structure need to use the same backend.
%%
%% The precision does not have to be the same but the resulting structure will
%% have the lowest precision of the two.
%%
%% See {@link reduce_precision/2} for details of the implications
union(Small, Big) ->
    union([Small, Big]).

%% @doc Provide the cardinality of the intersection of two `hyper' structure
%% using the inclusion/exclusion principle. Use with caution as the precision
%% of this methods is fairly limited.
-spec intersect_card(filter(), filter()) -> float().
intersect_card(
    Left = #hyper{registers = {ModLeft, RegLeft}}, Right = #hyper{registers = {ModRight, RegRight}}
) ->
    PLeft = hyper_register:precision(ModLeft, RegLeft),
    PRight = hyper_register:precision(ModRight, RegRight),
    case PLeft =:= PRight of
        true -> max(0.0, card(Left) + card(Right) - card(union(Left, Right)));
        false -> error(not_equal_precision)
    end.

%% @doc Estimate the cardinality of a  `hyper' structure
-spec card(filter()) -> float().
card(#hyper{registers = {Mod, Registers0}}) ->
    P = hyper_register:precision(Mod, Registers0),
    M = trunc(pow(2, P)),
    Qp1 = 65 - P,
    Registers = hyper_register:compact(Mod, Registers0),
    RegisterHisto = hyper_register:register_histogram(Mod, Registers),

    Z = M * tau(M - maps:get(Qp1, RegisterHisto, 0) / M),
    %TODO: drop after Q = 64 - P in histo before folding
    Z1 = lists:foldr(
        fun({_K, V}, Acc) -> (Acc + V) * 0.5 end,
        Z,
        lists:keysort(1, maps:to_list(maps:without([0, Qp1], RegisterHisto)))
    ),
    Zf = Z1 + M * sigma(maps:get(0, RegisterHisto, 0) / M),
    ?HLL_ALPHA_INF * M * M / Zf.

%% @doc return the precision of the `hyper' structure passed.
precision(#hyper{registers = {Mod, Registers}}) ->
    hyper_register:precision(Mod, Registers).

%% @doc return the size in bytes of the `hyper' structure in memory
bytes(#hyper{registers = {Mod, Registers}}) ->
    hyper_register:bytes(Mod, Registers).

%% @doc compact the underlying `hyper' structure.
%% Do not use, considered an implementation detail of the backend.
%% Will be removed in the future.
compact(#hyper{registers = {Mod, Registers}} = Hyper) ->
    Hyper#hyper{registers = {Mod, hyper_register:compact(Mod, Registers)}}.

%% @doc Reduce the precision of the `hyper' structure to P.
%% This is lossless in the sense that the old structure hold all the data needed
%% to fill the reduced precision one.
%% That said, reducing precision does grow the error in the estimator, and
%% there is no way to get it back. Do this knowing you are losing precision.
reduce_precision(P, #hyper{registers = {Mod, Registers}} = Hyper) ->
    OldP = hyper_register:precision(Mod, Registers),
    case {P, OldP} of
        {P1, P2} when P1 =:= P2 ->
            Hyper;
        {P1, P2} when P1 < P2 ->
            Hyper#hyper{registers = {Mod, hyper_register:reduce_precision(Mod, P, Registers)}};
        _ ->
            error(new_precision_superior_to_old)
    end.

%%
%% SERIALIZATION
%%

%% @deprecated 0.6.0
%% @doc Serialize the `hyper' structure to a json friendly format that can be
%% decoded with {@link from_json/2}
%%
%% Do not use, this is going to be replaced with better solution in the future
-spec to_json(filter()) -> any().
to_json(#hyper{v = V, registers = {Mod, Registers}}) ->
    P = hyper_register:precision(Mod, Registers),
    Compact = hyper_register:compact(Mod, Registers),
    {[
        {<<"p">>, P},
        {<<"v">>, atom_to_binary(V)},
        {<<"registers">>, base64:encode(zlib:gzip(hyper_register:encode_registers(Mod, Compact)))}
    ]}.

%% @deprecated 0.6.0
%% @doc Deserialize the json friendly format from {@link to_json/1} to a `hyper'
%% structure
%%
%% Equivalent to `from_json(Struct, hyper_binary)'
%%
%% Do not use, this is going to be replaced with better solution in the future
-spec from_json(any()) -> filter().
from_json(Struct) ->
    from_json(Struct, ?DEFAULT_BACKEND).

%% @deprecated 0.6.0
%% @doc Deserialize the json friendly format from {@link to_json/1} to a `hyper'
%% structure, with `Mod' as backend
%%
%% Do not use, this is going to be replaced with better solution in the future
-spec from_json(any(), module()) -> filter().
from_json({Struct}, Mod) ->
    P = proplists:get_value(<<"p">>, Struct),
    V = binary_to_atom(proplists:get_value(<<"v">>, Struct)),
    Bytes = zlib:gunzip(base64:decode(proplists:get_value(<<"registers">>, Struct))),
    Registers = hyper_register:decode_registers(Mod, Bytes, P),

    #hyper{v = V, registers = {Mod, Registers}}.

%%
%% HELPERS
%%

generate_unique(N) ->
    generate_unique(lists:usort(random_bytes(N)), N).

generate_unique(L, N) ->
    case length(L) of
        N ->
            L;
        Less ->
            generate_unique(lists:usort(random_bytes(N - Less) ++ L), N)
    end.

random_bytes(N) ->
    random_bytes([], N).

random_bytes(Acc, 0) ->
    Acc;
random_bytes(Acc, N) ->
    Int = rand:uniform(100000000000000),
    random_bytes([<<Int:64/integer>> | Acc], N - 1).

sigma(1.0) ->
    infinity;
sigma(X) ->
    sigma_sum(X, first, X, 1.0).

sigma_sum(Z, Z, _X, _Y) ->
    Z;
sigma_sum(Z, _Zp, X, Y) ->
    X1 = X * X,
    Z1 = (X1 * Y) + Z,
    sigma_sum(Z1, Z, X1, Y + Y).

tau(0.0) ->
    0.0;
tau(1.0) ->
    0.0;
tau(X) ->
    tau_sum((1 - X), first, X, 1.0) / 3.

tau_sum(Z, Z, _X, _Y) ->
    Z;
tau_sum(Z, _Zp, X, Y) ->
    X1 = math:sqrt(X),
    Y1 = Y * 0.5,
    Z1 = Z - (math:pow(1 - X1, 2) * Y1),
    tau_sum(Z1, Z, X1, Y1).

pow(X, Y) ->
    math:pow(X, Y).