src/support/z_sites_manager.erl

%% @author Marc Worrell <marc@worrell.nl>
%% @copyright 2009-2020 Marc Worrell
%% @doc Server managing all sites running inside Zotonic. Starts the sites
%% according to the config files in the sites subdirectories. Handles scanning
%% of all site directories for config files.

%% Copyright 2009-2020 Marc Worrell
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.

-module(z_sites_manager).
-author('Marc Worrell <marc@worrell.nl>').
-behaviour(gen_server).

%% gen_server exports
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]).
-export([start_link/0]).

%% API exports
-export([
    upgrade/0,
    get_sites/0,
    get_site_status/1,
    set_site_status/2,
    is_sites_running/0,
    get_site_contexts/0,
    get_site_config/1,
    get_fallback_site/0,
    get_builtin_sites/0,
    get_sites_hosts/0,
    module_loaded/1,
    info/0,
    foreach/1,

    stop/1,
    start/1,
    restart/1,
    await_startup/1,

    wait_for_running/1,
    wait_for_running/2,

    get_site_config_overrides/1,
    put_site_config_overrides/2,

    filechanged_observer/2
]).

%% Testing
-export([
    do_scan_sites/0,
    do_scan_sites/1
]).

-include("../../include/zotonic.hrl").
-include_lib("zotonic_filehandler/include/zotonic_filehandler.hrl").
-include_lib("zotonic_notifier/include/zotonic_notifier.hrl").

%% Status values a site can have in the bookkeeping of this server.
%% The same atoms are stored per site in the ?SITES_STATUS_TABLE ets table.
-type site_status() :: new
                     | starting
                     | running
                     | stopping
                     | retrying
                     | failed
                     | stopped
                     | removing.

%% gen_server state.
%%   sites         - map from site name to its #site_status{} record.
%%   site_monitors - map from a site pid to {MonitorRef, Site}, filled by
%%                   ensure_site_monitor/2 and consumed by handle_down/4.
-record(state, {
    sites :: map(),
    site_monitors :: map()
}).

%% Bookkeeping for a single site.
%%   site        - name of the site.
%%   is_enabled  - defaults to true.
%%   status      - current site_status().
%%   pid         - pid returned by z_sites_sup:start_site/1, undefined when down.
%%   start_time  - time of the (scheduled) start.
%%   stop_time   - set when the site process went down.
%%   stop_count  - number of times the site process went down.
%%   crash_time  - set when the site went down with status 'failed'.
%%   crash_count - number of failures, reset by do_cleanup_crash_state/1.
%%   config      - the site configuration.
%% NOTE(review): start_time is assigned z_datetime:timestamp() values in this
%% module and is used in integer arithmetic/comparisons; confirm whether its
%% type should be an integer instead of erlang:timestamp().
-record(site_status, {
    site :: atom(),
    is_enabled = true,
    status = new :: site_status(),
    pid = undefined :: undefined | pid(),
    start_time = undefined :: undefined | erlang:timestamp(),
    stop_time = undefined :: undefined | erlang:timestamp(),
    stop_count = 0 :: integer(),
    crash_time = undefined :: undefined | erlang:timestamp(),
    crash_count = 0 :: integer(),
    config = [] :: list()
}).

-export_type([site_status/0]).


% Backoff periods (in seconds) for restarting failed sites
-define(BACKOFF_SHORT, 2).
-define(BACKOFF_LONG, 60).

% Every minute, check for new sites and removed sites (value in milliseconds)
-define(PERIODIC_UPGRADE, 60000).

% Every second, check if any site needs a (re)start (value in milliseconds)
-define(PERIODIC_START, 1000).

% Number of seconds a site must be running before we declare it as
% non-crashed and clear its crash count (defaults to 5 minutes)
-define(PERIOD_CLEAR_CRASH, 300).

% Number of sites that can be started in parallel
-define(MAX_PARALLEL_START, 5).

% Ets table holding a quick lookup of a site's status
-define(SITES_STATUS_TABLE, sites_manager_sites_status).

% Timeout (in seconds) when waiting for a site to become available
-define(MAX_WAIT_FOR_RUNNING, 30).


%%====================================================================
%% API
%%====================================================================
%% @spec start_link() -> {ok,Pid} | ignore | {error,Error}
%% @doc Start the sites manager as a gen_server that is locally registered
%% under the name of this module.
start_link() ->
    ServerName = {local, ?MODULE},
    gen_server:start_link(ServerName, ?MODULE, [], []).

%% @doc Ask the sites manager (asynchronously) to sync its known sites with
%% the sites found on disk. Deleted sites are removed and stopped, new sites
%% are added but not started.
-spec upgrade() -> ok.
upgrade() ->
    ok = gen_server:cast(?MODULE, upgrade).

%% @doc Return a map of all sites with their current status. The data is read
%%      from the ets status table, so it might lag a bit behind the server state.
-spec get_sites() -> #{ atom() => site_status() }.
get_sites() ->
    maps:from_list(ets:tab2list(?SITES_STATUS_TABLE)).

%% @doc Ask the sites manager for all sites, their status and their hosts configs.
%% Waits without timeout for the answer.
-spec get_sites_hosts() -> {ok, #{ atom() => {site_status(), [ {Host::binary(), Prio :: pos_integer()} ]} }}.
get_sites_hosts() ->
    Request = get_sites_hosts,
    gen_server:call(?MODULE, Request, infinity).

%% @doc Look up the status of a site (given by name or by context) in the
%% ets status table. Returns {error, bad_name} if the site is not in the table.
-spec get_site_status(z:context()|atom()) -> {ok, site_status()} | {error, bad_name}.
get_site_status(#context{} = Context) ->
    get_site_status(Context#context.site);
get_site_status(Site) when is_atom(Site) ->
    case ets:lookup(?SITES_STATUS_TABLE, Site) of
        [{Site, Status}] ->
            {ok, Status};
        [] ->
            {error, bad_name}
    end.

%% @doc Report a new status for a site to the sites manager (asynchronous).
%% Called by the site supervisor.
-spec set_site_status(atom(), site_status()) -> ok.
set_site_status(Site, Status) when is_atom(Site) ->
    Msg = {set_site_status, Site, Status},
    gen_server:cast(?MODULE, Msg).

%% @doc Fetch the full status records of all sites known to the sites manager.
-spec info() -> {ok, #{ atom() => #site_status{}} }.
info() ->
    Request = info,
    gen_server:call(?MODULE, Request).

%% @doc Call Fun with a fresh context for every site that has status 'running'.
%% Any exception raised while creating the context or running Fun is ignored,
%% so that one site can not prevent the others from being visited.
-spec foreach( fun((z:context()) -> any()) ) -> ok.
foreach(Fun) ->
    Visit = fun
        (Site, running, _Acc) ->
            try Fun( z_context:new(Site) ) of
                _Result -> ok
            catch
                _:_ -> ok
            end;
        (_Site, _OtherStatus, _Acc) ->
            ok
    end,
    maps:fold(Visit, ok, get_sites()).


%% @doc Return true iff every known site is either running, stopped or
%% stopping. Any site in another state makes the result false.
-spec is_sites_running() -> boolean().
is_sites_running() ->
    IsSettled = fun
        (running) -> true;
        (stopped) -> true;
        (stopping) -> true;
        (_Other) -> false
    end,
    lists:all(IsSettled, maps:values(get_sites())).

%% @doc Build a context for every site with status 'running'. Sites for which
%% the context creation raises an exception are left out.
-spec get_site_contexts() -> [ z:context() ].
get_site_contexts() ->
    Collect = fun
        (Site, running, Contexts) ->
            try z_context:new(Site) of
                Context -> [ Context | Contexts ]
            catch
                _:_ -> Contexts
            end;
        (_Site, _OtherStatus, Contexts) ->
            Contexts
    end,
    maps:fold(Collect, [], get_sites()).

%% @doc Ask the sites manager for the configuration of a site. Waits without
%% timeout for the answer.
-spec get_site_config(atom()) -> {ok, list()} | {error, bad_name|term()}.
get_site_config(Site) ->
    Request = {get_site_config, Site},
    gen_server:call(?MODULE, Request, infinity).

%% @doc Ask the sites manager which site handles requests for unknown hosts.
-spec get_fallback_site() -> atom() | undefined.
get_fallback_site() ->
    Request = get_fallback_site,
    gen_server:call(?MODULE, Request).

%% @doc Names of the sites that ship with Zotonic itself; they are located in
%% the zotonic/apps/ directory.
-spec get_builtin_sites() -> [ atom() ].
get_builtin_sites() ->
    StatusSite = zotonic_site_status,
    SandboxSite = zotonic_site_testsandbox,
    [ StatusSite, SandboxSite ].

%% @doc Stop a site. With a [Node, Site] list the stop request is forwarded
%% to the given node using rpc, with a site name the local sites manager is asked.
stop([Node, Site]) ->
    rpc:call(Node, ?MODULE, stop, [Site]);
stop(Site) when is_atom(Site) ->
    Request = {stop, Site},
    gen_server:call(?MODULE, Request).

%% @doc Start a site. With a [Node, Site] list the start request is forwarded
%% to the given node using rpc, with a site name the local sites manager is asked.
start([Node, Site]) ->
    rpc:call(Node, ?MODULE, start, [Site]);
start(Site) when is_atom(Site) ->
    Request = {start, Site},
    gen_server:call(?MODULE, Request).

%% @doc Restart a site. With a [Node, Site] list the request is forwarded to
%% the given node using rpc. For a local site the action depends on the
%% current status of the site, see restart_from_status/2.
restart([Node, Site]) ->
    rpc:call(Node, ?MODULE, restart, [Site]);
restart(Site) when is_atom(Site) ->
    restart_from_status(get_site_status(Site), Site).

%% Decide the next restart step from the result of get_site_status/1.
restart_from_status({ok, running}, Site) ->
    % Request a stop and re-evaluate the status
    gen_server:call(?MODULE, {stop, Site}),
    restart(Site);
restart_from_status({ok, stopping}, Site) ->
    % Poll until the site has stopped
    timer:sleep(100),
    restart(Site);
restart_from_status({ok, starting}, Site) ->
    await_startup(Site);
restart_from_status({ok, Startable}, Site)
    when Startable =:= failed; Startable =:= new; Startable =:= stopped ->
    start(Site),
    await_startup(Site);
restart_from_status({ok, OtherStatus}, _Site) ->
    {error, OtherStatus};
restart_from_status({error, _} = Error, _Site) ->
    Error.

%% @doc Notify the sites manager (asynchronously) that a module has been
%%      loaded, so that it can check for changes to observers and schema.
module_loaded(Module) ->
    Msg = {module_loaded, Module},
    gen_server:cast(?MODULE, Msg).


%% @doc Block until a site has completed its startup sequence. While the site
%% is new, starting or retrying its status is polled once per second. When the
%% site is running we wait for the site's modules to be started. Any other
%% status is returned as an error.
-spec await_startup( z:context() | atom() ) -> ok | {error, bad_name | failed | removing | stopped | stopping}.
await_startup(#context{} = Context) ->
    await_startup(z_context:site(Context));
await_startup(Site) when is_atom(Site) ->
    case get_site_status(Site) of
        {ok, running} ->
            % Site is up, now wait for its modules
            z_module_manager:upgrade_await(z_context:new(Site));
        {ok, Pending} when Pending =:= starting; Pending =:= new; Pending =:= retrying ->
            timer:sleep(1000),
            await_startup(Site);
        {ok, OtherStatus} ->
            {error, OtherStatus};
        {error, _} = Error ->
            Error
    end.


%% @doc Wait for a site to be running, using the default maximum wait
%% of ?MAX_WAIT_FOR_RUNNING seconds.
-spec wait_for_running(atom()) -> ok | {error, bad_name | timeout | stopped | removing | term()}.
wait_for_running(Site) when is_atom(Site) ->
    MaxWait = ?MAX_WAIT_FOR_RUNNING,
    wait_for_running(Site, MaxWait).

% How the status found in the ets table is handled:
%
% (not found) -> {error, bad_name}
% running     -> ok
% stopped     -> {error, stopped}
% stopping    -> {error, stopping}
% removing    -> {error, removing}
% any other   -> keep waiting in wait_for_running_1/3 (which requests a start
%                for new sites and honors the restart backoff of failed
%                sites), until the timeout in seconds has passed.

-spec wait_for_running(atom(), Secs::integer()) -> ok | {error, bad_name | timeout | stopped | removing | term()}.
wait_for_running(Site, Timeout) when is_atom(Site) ->
    case ets:lookup(?SITES_STATUS_TABLE, Site) of
        [{Site, running}] ->
            ok;
        [{Site, Final}] when Final =:= stopped; Final =:= stopping; Final =:= removing ->
            {error, Final};
        [{Site, Pending}] ->
            wait_for_running_1(Site, Pending, Timeout);
        [] ->
            {error, bad_name}
    end.

%% Helper for wait_for_running/2, called for a site that is not running and
%% not stopped, stopping or removing. Timeout is the remaining wait in seconds.
%%
%% The timeout check comes first, so that no site is started and no extra
%% sleep is done once the caller's timeout has been used up.
wait_for_running_1(_Site, _Status, Timeout) when Timeout =< 0 ->
    {error, timeout};
wait_for_running_1(Site, new, Timeout) ->
    % Site was not started yet, request a start and check again in a second
    start(Site),
    timer:sleep(1000),
    wait_for_running(Site, Timeout-1);
wait_for_running_1(Site, failed, Timeout) ->
    % A failed site has a scheduled restart time, ask the sites manager for it
    Now = z_datetime:timestamp(),
    case gen_server:call(?MODULE, {get_status_start, Site}) of
        {ok, {running, _RestartTime}} ->
            ok;
        {ok, {failed, RestartTime}} when RestartTime =< Now + Timeout ->
            % The restart is scheduled within our timeout, sleep till just after it
            Sleep = erlang:max(1, RestartTime - Now + 1),
            timer:sleep(Sleep*1000),
            wait_for_running(Site, Timeout - Sleep);
        {ok, {failed, _RestartTime}} ->
            % Backoff - we need to wait longer than our max timeout
            {error, timeout};
        {ok, {OtherState, _RestartTime}} ->
            % Status changed between ets lookup and the gen_server call
            wait_for_running_1(Site, OtherState, Timeout);
        {error, _} = Error ->
            Error
    end;
wait_for_running_1(Site, _State, Timeout) ->
    % All other states: poll again after a second
    timer:sleep(1000),
    wait_for_running(Site, Timeout - 1).



%% @doc Observer for file change events of the zotonic_filehandler. The verb,
%% file, basename and extension of the change are copied into a #filewatcher{}
%% event, which is then synchronously notified to all running sites.
-spec filechanged_observer(#zotonic_filehandler_filechange{}, term()) -> ok.
filechanged_observer(
    #zotonic_filehandler_filechange{
        verb = Verb,
        file = File,
        basename = Basename,
        extension = Extension
    },
    _CallContext) ->
    SiteEvent = #filewatcher{
        verb = Verb,
        file = File,
        basename = Basename,
        extension = Extension
    },
    Relay = fun(Context) -> z_notifier:notify_sync(SiteEvent, Context) end,
    z_sites_manager:foreach(Relay).


%%====================================================================
%% gen_server callbacks
%%====================================================================

%% @spec init(Args) -> {ok, State} |
%%                     {ok, State, Timeout} |
%%                     ignore               |
%%                     {stop, Reason}
%% @doc Initiates the server. Creates the ets tables, schedules the first
%% upgrade and the periodic timers, and subscribes to file change events.
init([]) ->
    z_module_indexer:new_ets(),
    z_mediaclass:new_ets(),
    % Public named table with {Site, Status} tuples, read directly by the API functions
    ets:new(?SITES_STATUS_TABLE, [set, public, named_table, {keypos, 1}]),
    % Do the first sync of the known sites right after init returns
    ok = gen_server:cast(self(), upgrade),
    % The periodic messages are re-armed in their handle_info/2 clauses
    timer:send_after(?PERIODIC_UPGRADE, periodic_upgrade),
    timer:send_after(?PERIODIC_START, periodic_start),
    % File change events are relayed to the sites by filechanged_observer/2
    zotonic_notifier:observe(
        ?SYSTEM_NOTIFIER, zotonic_filehandler_filechange,
        {?MODULE, filechanged_observer},
        self(), 500),
    {ok, #state{
        sites = #{},
        site_monitors = #{}
    }}.


%% @spec handle_call(Request, From, State) -> {reply, Reply, State} |
%%                                      {reply, Reply, State, Timeout} |
%%                                      {noreply, State} |
%%                                      {noreply, State, Timeout} |
%%                                      {stop, Reason, Reply, State} |
%%                                      {stop, Reason, State}
%% @doc Handle the synchronous requests of the API functions.

%% Return a map with the status atom of all sites.
handle_call(get_sites_status, _From, #state{ sites = Sites } = State) ->
    SiteStatus = maps:fold(
        fun(Site, Status, Acc) ->
            Acc#{ Site => Status#site_status.status }
        end,
        #{},
        Sites),
    {reply, SiteStatus, State};

%% Start a site. The site config is reloaded before the site is started.
%% If the start fails then the reloaded config is still kept in the state.
handle_call({start, Site}, _From, #state{ sites = Sites } = State) ->
    case do_reload_site_config(Site, Sites) of
        {ok, Sites1} ->
            State1 = State#state{ sites = Sites1 },
            case do_start(Site, State1) of
                {ok, StateStarting} ->
                    do_sync_status(StateStarting#state.sites),
                    {reply, ok, StateStarting};
                {error, _} = Error ->
                    {reply, Error, State1}
            end;
        {error, _} = Error ->
            {reply, Error, State}
    end;

%% Stop a site. On success the new status is synced to the ets table.
handle_call({stop, Site}, _From, State) ->
    case do_stop(Site, State) of
        {ok, StateStopping} ->
            do_sync_status(StateStopping#state.sites),
            {reply, ok, StateStopping};
        {error, _} = Error ->
            {reply, Error, State}
    end;

%% Return the config stored in the status record of a site.
handle_call({get_site_config, Site}, _From, #state{ sites = Sites } = State) ->
    case maps:find(Site, Sites) of
        {ok, #site_status{ config = Config }} ->
            {reply, {ok, Config}, State};
        error ->
            {reply, {error, bad_name}, State}
    end;

%% Return the status and the start_time of a site, used by wait_for_running_1/3
%% to find the scheduled restart time of a failed site.
handle_call({get_status_start, Site}, _From, #state{ sites = Sites } = State) ->
    Reply = case maps:find(Site, Sites) of
        {ok, #site_status{ status = Status, start_time = StartTime }} ->
            {ok, {Status, StartTime}};
        error ->
            {error, bad_name}
    end,
    {reply, Reply, State};

%% Return the result of do_get_sites_hosts/1 on all sites, wrapped in {ok, _}.
handle_call(get_sites_hosts, _From, #state{ sites = Sites } = State) ->
    {reply, {ok, do_get_sites_hosts(Sites)}, State};

%% Return the result of do_get_fallback_site/1 on all sites.
handle_call(get_fallback_site, _From, #state{ sites = Sites } = State) ->
    {reply, do_get_fallback_site(Sites), State};

%% Return the complete map with all site status records.
handle_call(info, _From, #state{ sites = Sites } = State) ->
    {reply, {ok, Sites}, State};

%% Trap unknown calls, this stops the server without replying to the caller.
handle_call(Message, _From, State) ->
    {stop, {unknown_call, Message}, State}.


%% @spec handle_cast(Msg, State) -> {noreply, State} |
%%                                  {noreply, State, Timeout} |
%%                                  {stop, Reason, State}
%% @doc Handle the asynchronous requests.

%% Rescan the sites on disk and sync the resulting status to the ets table.
handle_cast(scan_sites, State) ->
    State1 = rescan_sites(State),
    do_sync_status(State1#state.sites),
    {noreply, State1};

%% Sync known sites with loaded sites
handle_cast(upgrade, #state{ sites = Sites } = State) ->
    UpgradedSites = do_upgrade(Sites),
    do_sync_status(UpgradedSites),
    {noreply, State#state{ sites = UpgradedSites }};

%% Handle load of a module, check observers and schema
handle_cast({module_loaded, Module}, State) ->
    do_load_module(Module, State),
    {noreply, State};

%% Status report for a site (see set_site_status/2). Only the transitions
%% from 'starting' and 'retrying' to 'running' are accepted, all other
%% reported status changes are logged and ignored.
handle_cast({set_site_status, Site, Status}, #state{ sites = Sites } = State) ->
    Sites1 = case maps:find(Site, Sites) of
        {ok, #site_status{ status = Status }} ->
            % Reported status is the status we already have
            Sites;
        {ok, #site_status{ status = starting } = S} when Status =:= running ->
            z_sites_dispatcher:update_dispatchinfo(),
            S1 = S#site_status{
                status = running,
                start_time = z_datetime:timestamp()
            },
            Sites#{ Site => S1 };
        {ok, #site_status{ status = retrying } = S} when Status =:= running; Status =:= starting ->
            % Note that a retrying site is set to 'running' for both the
            % reported status 'running' and the reported status 'starting'.
            z_sites_dispatcher:update_dispatchinfo(),
            S1 = S#site_status{
                status = running,
                start_time = z_datetime:timestamp()
            },
            Sites#{ Site => S1 };
        {ok, #site_status{ status = CurStatus }} ->
            % Unexpected transition, the stored status is not changed
            ?LOG_NOTICE(#{
                text => <<"Site status change">>,
                in => zotonic_core,
                old_status => CurStatus,
                status => Status
            }, #{ site => Site }),
            Sites;
        error ->
            % Status report for a site we don't know
            ?LOG_NOTICE(#{
                text => <<"Site status change">>,
                in => zotonic_core,
                old_status => unknown,
                status => Status
            }, #{ site => Site }),
            Sites
    end,
    do_sync_status(Sites1),
    {noreply, State#state{ sites = Sites1 }};

%% Trap unknown casts, this stops the server.
handle_cast(Message, State) ->
    {stop, {unknown_cast, Message}, State}.


%% @spec handle_info(Info, State) -> {noreply, State} |
%%                                       {noreply, State, Timeout} |
%%                                       {stop, Reason, State}
%% @doc Handle monitor and timer messages.

%% A monitored process went down, update the status of its site.
handle_info({'DOWN', MRef, process, Pid, Reason}, State) ->
    State1 = handle_down(MRef, Pid, Reason, State),
    do_sync_status(State1#state.sites),
    {noreply, State1};

%% Periodic sync of the known sites. Re-arms its own timer.
handle_info(periodic_upgrade, #state{ sites = Sites } = State) ->
    UpgradedSites = do_upgrade(Sites),
    timer:send_after(?PERIODIC_UPGRADE, periodic_upgrade),
    do_sync_status(UpgradedSites),
    erlang:garbage_collect(),
    {noreply, State#state{ sites = UpgradedSites }};

%% Periodic check for sites that need a (re)start, also clears the crash
%% count of sites that are running long enough. Re-arms its own timer.
handle_info(periodic_start, State) ->
    State1 = do_start_sites(State),
    State2 = do_cleanup_crash_state(State1),
    timer:send_after(?PERIODIC_START, periodic_start),
    do_sync_status(State2#state.sites),
    {noreply, State2};

%% Handling all non call/cast messages.
%% NOTE(review): the 'startup_check' and {remove_site, Site} messages sent by
%% rescan_sites/1 and remove_unknown_sites/2 have no clause of their own here
%% and end up in this catch-all clause - confirm that this is intended.
handle_info(_Info, State) ->
    ?DEBUG({z_sites_manager, _Info}),
    {noreply, State}.


%% @spec terminate(Reason, State) -> void()
%% @doc Called by the gen_server when it is about to terminate. There is
%% nothing to clean up here.
terminate(_Why, _ServerState) ->
    ok.

%% @spec code_change(OldVsn, State, Extra) -> {ok, NewState}
%% @doc Called on a code upgrade. The process state is kept as it is.
code_change(_FromVsn, ServerState, _Extra) ->
    {ok, ServerState}.


%%====================================================================
%% support functions
%%====================================================================


%% @doc Sync the status of all sites to the SITES_STATUS_TABLE ets table.
%% If anything in the table changed then the new status list is notified
%% and the hosts of the sites dispatcher are updated.
do_sync_status(Sites) ->
    % Update existing, remove unknown
    IsChanged = lists:foldl(
        fun({Site, Status}, AccChanged) ->
            case maps:find(Site, Sites) of
                {ok, #site_status{ status = Status }} ->
                    % Table entry is equal to the current status
                    AccChanged;
                {ok, #site_status{ status = NewStatus }} ->
                    ets:insert(?SITES_STATUS_TABLE, {Site, NewStatus}),
                    true;
                error ->
                    % Site is not known anymore
                    ets:delete(?SITES_STATUS_TABLE, Site),
                    true
            end
        end,
        false,
        ets:tab2list(?SITES_STATUS_TABLE)),
    % Insert new
    IsChanged1 = maps:fold(
        fun(Site, #site_status{ status = Status }, AccChanged) ->
            case ets:lookup(?SITES_STATUS_TABLE, Site) of
                [] ->
                    ets:insert(?SITES_STATUS_TABLE, {Site, Status}),
                    true;
                _ ->
                    AccChanged
            end
        end,
        IsChanged,
        Sites),
    % Only notify when the ets table was modified
    case IsChanged1 of
        true ->
            notify_status(),
            z_sites_dispatcher:update_hosts();
        false ->
            ok
    end.

%% Publish the current content of the ets status table, as a list of
%% {Site, Status} tuples, on the 'sites_status' notification.
notify_status() ->
    zotonic_notifier:notify(
        sites_status,
        ets:tab2list(?SITES_STATUS_TABLE),
        undefined).

% status2map(#site_status{ } = S) ->
%     #{
%         site => S#site_status.site,
%         is_enabled => S#site_status.is_enabled,
%         status => S#site_status.status,
%         start_time => tm(S#site_status.start_time),
%         stop_time => tm(S#site_status.stop_time),
%         stop_count => S#site_status.stop_count,
%         crash_time => tm(S#site_status.crash_time),
%         crash_count => S#site_status.crash_count
%     }.

% tm(undefined) ->
%     undefined;
% tm({MSecs, Secs, _USecs}) ->
%     z_datetime:timestamp_to_datetime(MSecs * 1000000 + Secs).

% ----------------------------------------------------------------------------

%% Count the sites that are busy starting (status starting or retrying) and
%% the sites that are due for a start (status new or failed with a start time
%% that has passed). Then let maybe_start_sites/4 start the sites that are due.
do_start_sites(#state{ sites = Sites } = State) ->
    Now = z_datetime:timestamp(),
    Tally = fun
        (_Site, #site_status{ status = S }, {Busy, Due})
            when S =:= starting; S =:= retrying ->
            {Busy + 1, Due};
        (_Site, #site_status{ status = S, start_time = T }, {Busy, Due})
            when (S =:= new orelse S =:= failed), T =< Now ->
            {Busy, Due + 1};
        (_Site, _Other, Counts) ->
            Counts
    end,
    {NStarting, NQueued} = maps:fold(Tally, {0, 0}, Sites),
    maybe_start_sites(NStarting, NQueued, Now, State).

%% Start the sites that are due for a start, limited by the number of sites
%% that may be starting in parallel. The sites with the oldest start time are
%% started first. Nothing is done if no sites are queued or if the maximum
%% number of sites is already starting.
maybe_start_sites(_NStarting, 0, _Now, State) ->
    State;
maybe_start_sites(NStarting, _NQueued, _Now, State) when NStarting >= ?MAX_PARALLEL_START ->
    State;
maybe_start_sites(NStarting, _NQueued, Now, State) ->
    Due = [
        {StartTime, Site}
        || {Site, #site_status{ status = Status, start_time = StartTime }}
                <- maps:to_list(State#state.sites),
           Status =:= new orelse Status =:= failed,
           StartTime =< Now
    ],
    % The guard above ensures there is at least one free slot
    FreeSlots = ?MAX_PARALLEL_START - NStarting,
    Selected = lists:sublist(lists:sort(Due), FreeSlots),
    StartOne = fun({_StartTime, Site}, StateAcc) ->
        case do_start(Site, StateAcc) of
            {ok, StateStarted} -> StateStarted;
            {error, _} -> StateAcc
        end
    end,
    lists:foldl(StartOne, State, Selected).

% ----------------------------------------------------------------------------

%% Start a known site. On success the new site status is stored in the state
%% and the site process is monitored. Unknown sites are logged and give
%% {error, bad_name}, other errors of do_start_site/1 are passed on.
do_start(Site, #state{ sites = Sites } = State) ->
    case maps:find(Site, Sites) of
        error ->
            ?LOG_WARNING(#{
                action => start_request,
                in => zotonic_core,
                result => error,
                reason => bad_name,
                text => <<"Requested to start unknown site">>
            }, #{ site => Site }),
            {error, bad_name};
        {ok, Current} ->
            case do_start_site(Current) of
                {error, _} = Error ->
                    Error;
                {ok, Started} ->
                    WithSite = State#state{ sites = Sites#{ Site => Started } },
                    {ok, ensure_site_monitor(Started, WithSite)}
            end
    end.

%% Start the site at the sites supervisor, if the status of the site allows
%% a start. Returns the updated site status, or an error with either the
%% current (non startable) status or the reason of the supervisor.
do_start_site(#site_status{ site = Site } = SiteStatus) ->
    case site_is_startable(SiteStatus) of
        {true, StartState} ->
            ?LOG_NOTICE(#{
                text => <<"Site starting">>,
                in => zotonic_core,
                action => starting
            }, #{ site => Site }),
            case z_sites_sup:start_site(Site) of
                {ok, Pid} ->
                    {ok, SiteStatus#site_status{
                        status = StartState,
                        pid = Pid
                    }};
                {error, {already_started, Pid}} ->
                    % seems we have a race condition here
                    % Only the pid is stored, the status is left unchanged.
                    ?LOG_ERROR(#{
                        text => <<"Site already started, this shouldn't happen.">>,
                        in => zotonic_core,
                        result => error,
                        reason => already_started
                    }, #{ site => Site }),
                    {ok, SiteStatus#site_status{
                        pid = Pid
                    }};
                {error, Reason} = Error ->
                    ?LOG_ERROR(#{
                        text => "Site start failed",
                        in => zotonic_core,
                        result => error,
                        reason => Reason
                    }, #{ site => Site }),
                    Error
            end;
        false ->
            % Not startable from the current status
            {error, SiteStatus#site_status.status}
    end.

%% Check if a site can be started from its current status. Returns the status
%% the site gets when it is started: new and stopped sites become 'starting',
%% failed sites become 'retrying'. All other sites are not startable.
site_is_startable(#site_status{ status = Status }) ->
    case Status of
        new -> {true, starting};
        stopped -> {true, starting};
        failed -> {true, retrying};
        _ -> false
    end;
site_is_startable(_) ->
    false.


%% Monitor the process of a site, unless there is already a monitor
%% registered for that pid.
ensure_site_monitor(#site_status{ site = Site, pid = Pid }, #state{ site_monitors = Monitors } = State) ->
    case maps:is_key(Pid, Monitors) of
        true ->
            State;
        false ->
            Ref = erlang:monitor(process, Pid),
            State#state{ site_monitors = Monitors#{ Pid => {Ref, Site} } }
    end.

% ----------------------------------------------------------------------------

%% Stop a known site and store its new status in the state. Unknown sites
%% give {error, bad_name}, errors of do_stop_site/1 are passed on.
do_stop(Site, #state{ sites = Sites } = State) ->
    case maps:find(Site, Sites) of
        error ->
            {error, bad_name};
        {ok, Current} ->
            case do_stop_site(Current) of
                {error, _} = Error ->
                    Error;
                {ok, Stopped} ->
                    {ok, State#state{ sites = Sites#{ Site => Stopped } }}
            end
    end.

%% Stop a site if its status allows that. A site with a process is stopped
%% via the sites supervisor and becomes 'stopping', a site without a process
%% is directly marked as 'stopped'. If the site is not stoppable then its
%% current status is returned as the error.
do_stop_site(#site_status{ pid = Pid } = SiteStatus) ->
    case {site_is_stoppable(SiteStatus), Pid} of
        {false, _} ->
            {error, SiteStatus#site_status.status};
        {true, undefined} ->
            {ok, SiteStatus#site_status{ status = stopped }};
        {true, _} when is_pid(Pid) ->
            z_sites_sup:stop_site(Pid),
            {ok, SiteStatus#site_status{ status = stopping }}
    end.

%% A site can be stopped when it is running, starting or retrying.
site_is_stoppable(#site_status{ status = Status }) ->
    lists:member(Status, [ running, starting, retrying ]);
site_is_stoppable(_) ->
    false.

% ----------------------------------------------------------------------------

%% Handle the 'DOWN' message of a monitored process. If the pid belongs to a
%% site then its monitor is dropped and the status of the site is updated.
%% A 'DOWN' for an unknown pid is logged and ignored.
handle_down(MRef, Pid, Reason, #state{ site_monitors = Monitors } = State) ->
    case maps:find(Pid, Monitors) of
        error ->
            ?LOG_WARNING(#{
                text => <<"'DOWN' for unknown site">>,
                in => zotonic_core,
                result => error,
                reason => Reason
            }),
            State;
        {ok, {MRef, Site}} ->
            State#state{
                site_monitors = maps:remove(Pid, Monitors),
                sites = do_site_down(Site, Reason, State#state.sites)
            }
    end.

%% Update the sites map after the process of a site went down. A site that
%% was being removed is deleted from the map. For all other sites the pid is
%% cleared, the new status is derived from the old status and the exit
%% reason, and a restart is scheduled if the site failed.
do_site_down(Site, Reason, Sites) ->
    case maps:find(Site, Sites) of
        {ok, #site_status{ status = removing }} ->
            z_sites_dispatcher:update_dispatchinfo(),
            maps:remove(Site, Sites);
        {ok, Status} ->
            Status1 = Status#site_status{
                pid = undefined,
                status = new_status_after_down(Site, Status#site_status.status, Reason),
                stop_count = Status#site_status.stop_count + 1,
                stop_time = os:timestamp()
            },
            % Sets the start time and crash bookkeeping if the new status is 'failed'
            Status2 = maybe_schedule_restart(Status1),
            z_sites_dispatcher:update_dispatchinfo(),
            Sites#{ Site => Status2 };
        error ->
            ?LOG_WARNING(#{
                text => <<"'DOWN' for site, but no site status found">>,
                in => zotonic_core,
                site => Site,
                result => error,
                reason => Reason
            }),
            Sites
    end.

%% Derive the new status of a site whose process went down. Only a site that
%% was stopping and exited with reason 'shutdown' becomes 'stopped', every
%% other combination is logged as an error and gives the status 'failed'.
new_status_after_down(Site, OldStatus, Reason) ->
    case {OldStatus, Reason} of
        {stopping, shutdown} ->
            stopped;
        _Unexpected ->
            ?LOG_ERROR(#{
                text => <<"Site is down">>,
                in => zotonic_core,
                old_status => OldStatus,
                status => failed,
                reason => Reason
            }, #{ site => Site }),
            failed
    end.

%% Schedule the restart of a failed site: the crash count is incremented, the
%% crash time is set and the start time is set according to the backoff for
%% the new crash count. A stopped site is returned unchanged.
maybe_schedule_restart(#site_status{ status = stopped } = SiteStatus) ->
    SiteStatus;
maybe_schedule_restart(#site_status{ status = failed, crash_count = Crashes } = SiteStatus) ->
    % Non normal failure - site will be restarted
    NewCrashCount = Crashes + 1,
    SiteStatus#site_status{
        start_time = start_backoff(NewCrashCount),
        crash_count = NewCrashCount,
        crash_time = os:timestamp()
    }.

%% Calculate the time at which a failed site may be restarted, N is the
%% crash count. The first crash gives an immediate restart, up to 9 crashes
%% the short backoff is used, after that the long backoff.
start_backoff(N) ->
    Now = z_datetime:timestamp(),
    if
        N < 2 -> Now;
        N < 10 -> Now + ?BACKOFF_SHORT;
        true -> Now + ?BACKOFF_LONG
    end.


% ----------------------------------------------------------------------------

%% @doc Reset the crash count of all sites that are running and were started
%%      more than ?PERIOD_CLEAR_CRASH seconds ago, assuming that the cause of
%%      the earlier crashes is gone.
do_cleanup_crash_state(#state{ sites = Sites } = State) ->
    Threshold = z_datetime:timestamp() - ?PERIOD_CLEAR_CRASH,
    Reset = fun(_Site, SiteStatus) ->
        case SiteStatus of
            #site_status{ status = running, crash_count = Crashes, start_time = Started }
                when Crashes > 0, Started < Threshold ->
                SiteStatus#site_status{ crash_count = 0 };
            _ ->
                SiteStatus
        end
    end,
    State#state{ sites = maps:map(Reset, Sites) }.

% ----------------------------------------------------------------------------

%% @doc Rescan the config of a known site and store it in the site status. If
%% the scan doesn't give a config then the sites are returned unchanged. An
%% unknown site is logged and gives {error, bad_name}.
do_reload_site_config(Site, Sites) ->
    case maps:find(Site, Sites) of
        error ->
            ?LOG_INFO(#{
                text => <<"Requested to reload site config from unknown site">>,
                in => zotonic_core,
                site => Site
            }),
            {error, bad_name};
        {ok, Current} ->
            Reloaded = case scan_app(Site) of
                {true, NewConfig} ->
                    Sites#{ Site => Current#site_status{ config = NewConfig } };
                false ->
                    Sites
            end,
            {ok, Reloaded}
    end.

% ----------------------------------------------------------------------------


%% @doc Scan the sites on disk. Requests the removal of the known sites that
%% were not found, adds the newly found sites to the sites map and sends a
%% 'startup_check' message to this process.
rescan_sites(#state{ sites = Known } = State) ->
    Scanned = do_scan_sites(),
    remove_unknown_sites(Known, Scanned),
    Merged = insert_new_sites(Known, Scanned),
    self() ! startup_check,
    State#state{ sites = Merged }.

%% @doc Compare the known sites with the scanned sites. For every known site
%%      that is not present in the scanned sites a {remove_site, Site}
%%      message is sent to the calling process.
remove_unknown_sites(Sites, ScannedSites) ->
    Gone = maps:fold(
        fun(Site, _SiteStatus, Acc) ->
            % Prepend the site only if it was not found by the scan
            [ Site || not maps:is_key(Site, ScannedSites) ] ++ Acc
        end,
        [],
        Sites),
    _ = [ self() ! {remove_site, Site} || Site <- Gone ],
    ok.

%% @doc Add a fresh site status record for every scanned site that is not
%%      yet in the sites map. Sites that are already known keep their
%%      current status record.
insert_new_sites(Sites, ScannedSites) ->
    Unseen = maps:without(maps:keys(Sites), ScannedSites),
    Fresh = maps:map(
        fun(Site, Cfg) ->
            new_site_status(Site, initial_status(Cfg), Cfg)
        end,
        Unseen),
    maps:merge(Sites, Fresh).

%% @doc Initial status for a newly found site: 'new' if the config has
%%      'enabled' set to true (the default when the key is absent),
%%      'stopped' if it is set to false.
initial_status(Cfg) ->
    IsEnabled = proplists:get_value(enabled, Cfg, true),
    case IsEnabled of
        false -> stopped;
        true -> new
    end.

%% @doc Create the site status record for a site with initial status 'new'
%%      or 'stopped'. Only a 'new' site gets its start time set to the
%%      current timestamp.
new_site_status(Site, new, Cfg) ->
    Stopped = new_site_status(Site, stopped, Cfg),
    Stopped#site_status{
        status = new,
        start_time = z_datetime:timestamp()
    };
new_site_status(Site, stopped, Cfg) ->
    #site_status{ site = Site, status = stopped, config = Cfg }.

%% @doc Scan for site configurations and return them as a map from the site
%%      name (the 'site' entry of the config) to the config proplist.
%%      Which sites are scanned depends on whether this node is the
%%      testsandbox node.
-spec do_scan_sites() -> #{ Site::atom() => proplists:proplist() }.
do_scan_sites() ->
    Configs = do_scan_sites( is_testsandbox_node() ),
    ToPair = fun(Cfg) ->
        {site, Site} = proplists:lookup(site, Cfg),
        {Site, Cfg}
    end,
    % On duplicate site names the last config wins.
    maps:from_list(lists:map(ToPair, Configs)).

%% @doc Scan the build lib directory for site configs. With 'true' only the
%%      configs of the testsandbox site are returned, with 'false' the
%%      configs of all sites except the testsandbox site.
do_scan_sites(IsTestSandbox) when is_boolean(IsTestSandbox) ->
    [
        Cfg
        || Cfg <- scan_lib_dir( z_path:build_lib_dir() ),
           is_testsandbox_site(Cfg) =:= IsTestSandbox
    ].

%% @doc Check if the site config belongs to the 'zotonic_site_testsandbox' site.
is_testsandbox_site(Cfg) ->
    case proplists:get_value(site, Cfg) of
        zotonic_site_testsandbox -> true;
        _ -> false
    end.

%% @doc Scan all entries directly below Directory for site apps. An entry is
%%      considered when its name does not start with a dot, does not end
%%      with '~' or '#', and it is a directory containing a "priv"
%%      directory. Returns the configs of the entries that scan as a site.
scan_lib_dir(Directory) ->
    IsRegularName = fun(Name) ->
        hd(Name) =/= $.
        andalso not lists:member(lists:last(Name), "~#")
    end,
    Candidates = [
        Path
        || Path <- filelib:wildcard( filename:join(Directory, "*") ),
           IsRegularName( filename:basename(Path) ),
           filelib:is_dir(Path),
           filelib:is_dir( filename:join(Path, "priv") )
    ],
    lists:filtermap( fun scan_app_dir/1, Candidates ).

%% @doc Add the "ebin" directory of the app directory to the code path and
%%      scan the app named after the directory for site config files.
scan_app_dir(AppDir) ->
    ensure_code_path(AppDir),
    AppName = filename:basename(AppDir),
    scan_app( z_convert:to_atom(AppName) ).

%% @doc Read the site config files of an app. Returns {true, Config} with
%%      the config as a proplist (including a 'site' entry set to the app
%%      name), or false if the app has no config files or if the config
%%      files could not be read.
scan_app(App) ->
    case z_sites_config:config_files(App) of
        [] ->
            false;
        Files ->
            case z_sites_config:read_configs(Files) of
                {error, Reason} ->
                    ?LOG_ERROR(#{
                        text => <<"Error reading config files">>,
                        in => zotonic_core,
                        app => App,
                        result => error,
                        reason => Reason
                    }),
                    false;
                {ok, Config} ->
                    % The result of the load is ignored on purpose.
                    _ = application:load(App),
                    {true, to_list(Config#{ site => App })}
            end
    end.

%% @doc Convert a map to a list of key/value tuples. Values that are maps
%%      are converted to lists as well (one level deep only).
to_list(Map) ->
    [
        case Value of
            #{} -> {Key, maps:to_list(Value)};
            _ -> {Key, Value}
        end
        || {Key, Value} <- maps:to_list(Map)
    ].

%% @doc Append the "ebin" directory of the given path to the code path,
%%      unless it is already part of the code path. Returns ok if the
%%      directory was already present, otherwise the result of
%%      code:add_pathz/1.
ensure_code_path(SitePath) ->
    Ebin = filename:join(SitePath, "ebin"),
    IsPresent = lists:member(Ebin, code:get_path()),
    case IsPresent of
        true -> ok;
        false -> code:add_pathz(Ebin)
    end.

%% @doc Map every site to its {Status, HostPrioList, IsRedirect} tuple. The
%%      fallback site is determined once and flagged when collecting its
%%      hosts.
do_get_sites_hosts(Sites) ->
    Fallback = do_get_fallback_site(Sites),
    HostsOf = fun(Site, SiteStatus) ->
        do_get_sites_hosts_1(SiteStatus, Site =:= Fallback)
    end,
    maps:map(HostsOf, Sites).

%% @doc Return {Status, HostPrioList, IsRedirect} for a single site. For the
%%      fallback site the wildcard host <<"*">> with priority 99 is appended
%%      to the hosts from the site config.
do_get_sites_hosts_1(#site_status{ status = Status, config = Cfg }, IsFallback) ->
    ConfigHosts = hosts_from_config(Cfg),
    Hosts = case IsFallback of
        false -> ConfigHosts;
        true -> ConfigHosts ++ [{<<"*">>, 99}]
    end,
    {Status, Hosts, do_is_site_redirect(Cfg)}.

%% @doc Collect the hosts of a site from its config as a list of
%%      {Host, Priority} tuples, with the host converted to a binary.
%%      The 'hostname' has priority 1, every 'hostalias' priority 2 and the
%%      'smtphost' priority 3. Hosts that are undefined, 'none' or empty
%%      are dropped.
%%
%%      The 'hostalias' may be a list of aliases, a single string, or a
%%      single binary. Any other value is ignored.
hosts_from_config(Config) ->
    Hostname = proplists:get_value(hostname, Config),
    HostAlias = case proplists:get_value(hostalias, Config, []) of
        List when is_list(List) -> ensure_alias_list(List);
        % A single alias given as a binary instead of a string; without
        % this clause the alias would be silently dropped.
        Alias when is_binary(Alias) -> [Alias];
        _ -> []
    end,
    HostSmtp = proplists:get_value(smtphost, Config),
    Hs = [{Hostname, 1}]
        ++ [ {A, 2} || A <- HostAlias ]
        ++ [ {HostSmtp, 3} ],
    lists:filtermap(
        fun
            ({undefined, _}) -> false;
            ({none, _}) -> false;
            ({"", _}) -> false;
            ({<<"">>, _}) -> false;
            ({Host, Prio}) -> {true, {z_convert:to_binary(Host), Prio}}
        end,
        Hs).

%% @doc Read the 'redirect' flag from the site config. The flag is true
%%      unless the config explicitly sets it to false.
do_is_site_redirect(Cfg) ->
    Redirect = proplists:get_value(redirect, Cfg, true),
    case Redirect of
        false -> false;
        undefined -> true;
        true -> true
    end.

% Handle the case where a user gives just a single hostname (a string)
% instead of a list of hostnames: wrap it in a list.
ensure_alias_list(Alias) ->
    case Alias of
        [First|_] when is_integer(First) -> [Alias];
        _ -> Alias
    end.

%% @doc Determine the fallback site: 'zotonic_site_status' if that site is
%%      available according to has_zotonic_site/1, otherwise the result of
%%      do_get_fallback_site_1/1.
do_get_fallback_site(Sites) ->
    UseStatusSite = has_zotonic_site(Sites),
    case UseStatusSite of
        false -> do_get_fallback_site_1(Sites);
        true -> zotonic_site_status
    end.

%% @doc Return the first site with status 'running' encountered when folding
%%      over the sites map, or 'undefined' if no site is running.
do_get_fallback_site_1(Sites) ->
    PickFirstRunning = fun(Site, SiteStatus, Found) ->
        case {Found, SiteStatus} of
            {undefined, #site_status{ status = running }} -> Site;
            _ -> Found
        end
    end,
    maps:fold(PickFirstRunning, undefined, Sites).

%% @doc Check if the 'zotonic_site_status' site is in the sites map; if so
%%      the value of its 'is_enabled' flag is returned, otherwise false.
has_zotonic_site(Sites) ->
    case Sites of
        #{ zotonic_site_status := #site_status{ is_enabled = IsEnabled } } ->
            IsEnabled;
        _ ->
            false
    end.


%% @doc Bring the sites map in line with a fresh scan: sites that are no
%%      longer found are stopped or removed, and newly found sites are
%%      added with their initial status.
-spec do_upgrade(CurrentSites :: map()) -> NewSites :: map().
do_upgrade(CurrentSites) ->
    Available = do_scan_sites(),
    AvailableNames = maps:keys(Available),
    CurrentNames = maps:keys(CurrentSites),
    Remaining = do_stop_removed_sites(CurrentSites, CurrentNames -- AvailableNames),
    do_add_new_sites(Remaining, AvailableNames -- CurrentNames, Available).

%% @doc Handle the sites that are no longer available. A site without a pid
%%      is deleted from the sites map right away. A site with a pid is
%%      stopped via z_sites_sup:stop_site/1 and stays in the map with
%%      status 'removing'.
do_stop_removed_sites(Sites, []) ->
    Sites;
do_stop_removed_sites(Sites, [Stop|Other]) ->
    #{ Stop := SiteStatus } = Sites,
    Sites1 = case SiteStatus#site_status.pid of
        undefined ->
            maps:remove(Stop, Sites);
        Pid ->
            % Update the existing record instead of creating a fresh one,
            % otherwise the site name, pid and config of the site are lost
            % while it is being removed.
            SiteStatus1 = SiteStatus#site_status{ status = removing },
            z_sites_sup:stop_site(Pid),
            Sites#{ Stop => SiteStatus1 }
    end,
    do_stop_removed_sites(Sites1, Other).

%% @doc Add a site status record to the sites map for every site name in
%%      the list, using the matching config from SiteConfigs. Every listed
%%      site must have a config in SiteConfigs.
do_add_new_sites(Sites, ToStart, SiteConfigs) ->
    lists:foldl(
        fun(Site, Acc) ->
            #{ Site := Config } = SiteConfigs,
            Acc#{ Site => new_site_status(Site, initial_status(Config), Config) }
        end,
        Sites,
        ToStart).


%% @doc Check if the current node is the testsandbox node. This is the case
%%      when the last "_" separated part of the node name (the part before
%%      the "@") is "testsandbox".
is_testsandbox_node() ->
    [Name|_] = string:tokens(atom_to_list(node()), "@"),
    lists:last(string:tokens(Name, "_")) =:= "testsandbox".


%% @doc Handle a module that was (re)loaded by the code server. Depending on
%%      whether the module is a running site or a Zotonic module, the
%%      module managers of the affected sites are notified.
do_load_module(Module, State) ->
    ?LOG_DEBUG(#{
        text => <<"Reloading module">>,
        in => zotonic_core,
        module => Module
    }),
    IsRunningSite = is_running_site(Module, State),
    IsModule = is_module(Module),
    do_load_module(IsRunningSite, IsModule, Module, State).

%% @doc Notify the module manager(s) about a reloaded module.
%%      First argument: the module is a running site. In that case only the
%%      site itself is notified.
%%      Second argument: the module name looks like a Zotonic module. In
%%      that case all running sites are notified.
%%      Anything else is ignored.
%%      Notifying is best effort: a failure is logged and does not crash
%%      the caller.
do_load_module(true, _IsModule, Site, _State) ->
    notify_module_reloaded(Site, Site);
do_load_module(false, true, Module, #state{ sites = Sites }) ->
    Running = maps:fold(
        fun
            (Site, #site_status{ status = running }, Acc) -> [ Site | Acc ];
            (_Site, _SiteStatus, Acc) -> Acc
        end,
        [],
        Sites),
    lists:foreach(
        fun(Site) ->
            notify_module_reloaded(Module, Site)
        end,
        Running);
do_load_module(false, false, _Module, _State) ->
    ok.

%% @doc Tell the module manager of a site that a module has been reloaded.
%%      Exceptions are logged instead of silently dropped; 'ok' is returned
%%      in that case so that a failing site does not affect the caller.
notify_module_reloaded(Module, Site) ->
    try
        z_module_manager:module_reloaded(Module, z_context:new(Site))
    catch
        Class:Reason:Stacktrace ->
            ?LOG_WARNING(#{
                text => <<"Error notifying site of reloaded module">>,
                in => zotonic_core,
                site => Site,
                module => Module,
                result => Class,
                reason => Reason,
                stack => Stacktrace
            }),
            ok
    end.

%% @doc Check if the module name is the name of a site in the sites map
%%      with status 'running'.
is_running_site(Module, #state{ sites = Sites }) ->
    case maps:find(Module, Sites) of
        {ok, #site_status{ status = running }} -> true;
        _ -> false
    end.

%% @doc Check if the name looks like a Zotonic module: it either starts
%%      with "mod_" or contains "_mod_".
is_module(Module) ->
    Name = atom_to_list(Module),
    lists:prefix("mod_", Name) orelse string:str(Name, "_mod_") > 0.

%% @doc Fetch the config overrides for a site. They are stored in the
%%      zotonic_core application env under the key '<site>_config_overrides'.
%%      Returns an empty list if no overrides are set.
get_site_config_overrides(Site) when is_atom(Site) ->
    EnvKey = z_convert:to_atom(z_convert:to_list(Site) ++ "_config_overrides"),
    application:get_env(zotonic_core, EnvKey, []).

%% @doc Set arbitrary key/value pairs that override the config of the given
%%      site. The overrides are stored in the zotonic_core application env
%%      under the key '<site>_config_overrides'. Should be called before the
%%      site is started.
put_site_config_overrides(Site, Overrides) when is_atom(Site), is_list(Overrides) ->
    EnvKey = z_convert:to_atom(z_convert:to_list(Site) ++ "_config_overrides"),
    application:set_env(zotonic_core, EnvKey, Overrides).