From 3dd38375773eb191e2e81234389f4f16a89fe3b4 Mon Sep 17 00:00:00 2001 From: Denys Gonchar Date: Wed, 17 Oct 2018 13:31:38 +0200 Subject: [PATCH] initial watchdog version --- rebar.config.script | 2 +- rel/files/mongooseim.cfg | 7 + src/watchdog/mongoose_watchdog.erl | 236 +++++++++++++++++++++ src/watchdog/mongoose_watchdog_default.erl | 235 ++++++++++++++++++++ 4 files changed, 479 insertions(+), 1 deletion(-) create mode 100644 src/watchdog/mongoose_watchdog.erl create mode 100644 src/watchdog/mongoose_watchdog_default.erl diff --git a/rebar.config.script b/rebar.config.script index 0841b697f1d..5288da2c1a1 100644 --- a/rebar.config.script +++ b/rebar.config.script @@ -54,7 +54,7 @@ fun() -> end end, -RequiredApps = fun() -> [mongooseim, inets, tools] end, +RequiredApps = fun() -> [mongooseim, inets, os_mon, tools] end, EnvApps = GetEnvApps(), SetupIncludedApps = diff --git a/rel/files/mongooseim.cfg b/rel/files/mongooseim.cfg index bbb230979bd..56806066405 100755 --- a/rel/files/mongooseim.cfg +++ b/rel/files/mongooseim.cfg @@ -714,6 +714,13 @@ {services, [ +%% {mongoose_watchdog, [{os_mon_env, [{start_cpu_sup, true}, +%% {start_disksup, true}]}, +%% {memsup_settings, [{helper_timeout, 10}, +%% {procmem_high_watermark, 3}]}, +%% {default_handlers, [ %%list of gen_event modules +%% {mongoose_watchdog_default,[{timeout, 15}]} +%% ]}]}, {service_admin_extra, [{submods, [node, accounts, sessions, vcard, roster, last, private, stanza, stats]}]} ] diff --git a/src/watchdog/mongoose_watchdog.erl b/src/watchdog/mongoose_watchdog.erl new file mode 100644 index 00000000000..d5deaf721c9 --- /dev/null +++ b/src/watchdog/mongoose_watchdog.erl @@ -0,0 +1,236 @@ +%%%------------------------------------------------------------------- +%%% @author denys +%%% @copyright (C) 2018, +%%% @doc +%%% +%%% @end +%%% Created : 08. Oct 2018 16:08 +%%%------------------------------------------------------------------- +-module(mongoose_watchdog). +-author("DenysGonchar"). + +-behaviour(gen_event). +-behaviour(mongoose_service). + + +%% gen_event callbacks +-export([init/1, + handle_event/2, + handle_call/2, + terminate/2, + code_change/3]). + +%% mongoose_service callbacks +-export([start/1, + stop/0]). + +%% API +-export([add_handler/2, + call/2, + get_alarms/0]). + +-define(SERVER, ?MODULE). + +-record(state, {default_handlers = [], + active_alarms = []}). + +%%%=================================================================== +%%% API +%%%=================================================================== +-spec add_handler(Handler :: atom() | {atom(), term()}, + Args :: term()) -> + term(). +add_handler(Handler, Args) -> + safe_add_handler(?SERVER, Handler, Args). + +-spec call(Handler :: atom() | {atom(), term()}, + Args :: term()) -> + term(). +call(Handler, Args) -> + gen_event:call(?SERVER, Handler, Args). + +get_alarms() -> + gen_event:call(alarm_handler, ?MODULE, get_alarms). + + +%%%=================================================================== +%%% mongoose_service callbacks +%%%=================================================================== +-spec start(Opts :: list()) -> any(). +start(Opts) -> + OsMonEnv = proplists:get_value(os_mon_env, Opts, []), + start_os_mon(OsMonEnv), + MemsupSettings = proplists:get_value(memsup_settings, Opts, []), + configure_memsup(MemsupSettings), + DefaultHandlers = proplists:get_value(default_handlers, Opts, + [{mongoose_watchdog_default, []}]), + safe_add_handler(alarm_handler, ?MODULE, DefaultHandlers), + start_event_manager(DefaultHandlers), + ok. + + +-spec stop() -> any(). +stop() -> + gen_event:delete_handler(alarm_handler, ?MODULE, {?MODULE, stop, 0}). + + +%%%=================================================================== +%%% gen_event callbacks +%%%=================================================================== + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever a new event handler is added to an event manager, +%% this function is called to initialize the event handler. +%% +%% @end +%%-------------------------------------------------------------------- +-spec init(InitArgs :: term()) -> + {ok, State :: #state{}} | + {ok, State :: #state{}, hibernate} | + {error, Reason :: term()}. +init(DefaultHandlers) -> + {ok, #state{default_handlers = DefaultHandlers}}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event manager receives an event sent using +%% gen_event:notify/2 or gen_event:sync_notify/2, this function is +%% called for each installed event handler to handle the event. +%% +%% @end +%%-------------------------------------------------------------------- +-spec(handle_event(Event :: term(), State :: #state{}) -> + {ok, NewState :: #state{}} | + {ok, NewState :: #state{}, hibernate} | + {swap_handler, Args1 :: term(), NewState :: #state{}, + Handler2 :: (atom() | {atom(), Id :: term()}), Args2 :: term()} | + remove_handler). +handle_event(Event, State) -> + {ok, safe_handle_event(Event, State)}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event manager receives a request sent using +%% gen_event:call/3,4, this function is called for the specified +%% event handler to handle the request. +%% +%% @end +%%-------------------------------------------------------------------- +-spec(handle_call(Request :: term(), State :: #state{}) -> + {ok, Reply :: term(), NewState :: #state{}} | + {ok, Reply :: term(), NewState :: #state{}, hibernate} | + {swap_handler, Reply :: term(), Args1 :: term(), NewState :: #state{}, + Handler2 :: (atom() | {atom(), Id :: term()}), Args2 :: term()} | + {remove_handler, Reply :: term()}). +handle_call(get_alarms, State) -> + {ok, State#state.active_alarms, State}; +handle_call(_Request, State) -> + Reply = ok, + {ok, Reply, State}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event handler is deleted from an event manager, this +%% function is called. It should be the opposite of Module:init/1 and +%% do any necessary cleaning up. +%% +%% @spec terminate(Reason, State) -> void() +%% @end +%%-------------------------------------------------------------------- +-spec terminate(Args :: (term() | {stop, Reason :: term()} | stop | + {error, {'EXIT', Reason :: term()}} | + remove_handler | {error, term()}), + State :: term()) -> term(). +terminate(_Arg, _State) -> + catch gen_event:stop(?SERVER), + ok. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Convert process state when code is changed +%% +%% @end +%%-------------------------------------------------------------------- +-spec code_change(OldVsn :: term() | {down, term()}, + State :: #state{}, + Extra :: term()) -> + {ok, NewState :: #state{}}. +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +%%%=================================================================== +%%% Internal functions +%%%=================================================================== + +start_os_mon(OsMonEnv) -> + OS = proplists:get_value(start_os_sup, OsMonEnv, false), + CPU = proplists:get_value(start_cpu_sup, OsMonEnv, false), + Disk = proplists:get_value(start_disksup, OsMonEnv, false), + case application:get_env(os_mon,start_memsup) of + undefined -> %application is not loaded + application:load(os_mon), + application:set_env(os_mon, start_cpu_sup, CPU), + application:set_env(os_mon, start_os_sup, OS), + application:set_env(os_mon, start_memsup, true), + application:set_env(os_mon, start_disksup, Disk); + {ok, true} -> ok + %let it crash if start_memsup is set to false + end, + application:ensure_all_started(os_mon). + +configure_memsup(MemsupSettings) -> + case proplists:get_value(helper_timeout, MemsupSettings) of + undefined -> ok; + Timeout -> memsup:set_helper_timeout(Timeout) + end, + case proplists:get_value(procmem_high_watermark, MemsupSettings) of + undefined -> ok; + Watermark when is_integer(Watermark), Watermark > 0, Watermark < 100-> + memsup:set_procmem_high_watermark(Watermark/100) + %% let it crash if watermark has invalid format + end. + +start_event_manager(DefaultHandlers) -> + case erlang:whereis(?SERVER) of + undefined -> + gen_event:start({local, ?SERVER}); + _ -> ok + end, + [safe_add_handler(?SERVER, H, A) || {H, A} <- DefaultHandlers]. + +safe_add_handler(Manager, Handler, Agrs) -> + AnyFun = fun(H) when H =:= Handler -> true; + (_) -> false + end, + case lists:any(AnyFun, gen_event:which_handlers(Manager)) of + false -> gen_event:add_handler(Manager, Handler, Agrs); + _ -> ok + end. + +get_alarm_id({set_alarm, {AlarmId, Description}}) -> {set, AlarmId, Description}; +get_alarm_id({clear_alarm, AlarmId}) -> {clear, AlarmId, no_description}; +get_alarm_id(_) -> invalid_alarm. + +track_alarms({set, AlarmId, Descr}, #state{active_alarms = Alarms} = S) -> + S#state{active_alarms = lists:keystore(AlarmId, 1, Alarms, {AlarmId, Descr})}; +track_alarms({clear, AlarmId, _}, #state{active_alarms = Alarms} = S) -> + S#state{active_alarms = lists:keydelete(AlarmId, 1, Alarms)}. + +safe_handle_event(Event, #state{default_handlers = DefaultHandlers} = State) -> + try + case get_alarm_id(Event) of + {_, process_memory_high_watermark, _} = Alarm -> + start_event_manager(DefaultHandlers), + gen_event:notify(?SERVER, Event), + track_alarms(Alarm, State); + _ -> State + end + catch + _:_ -> State + end. diff --git a/src/watchdog/mongoose_watchdog_default.erl b/src/watchdog/mongoose_watchdog_default.erl new file mode 100644 index 00000000000..029b8eb4b54 --- /dev/null +++ b/src/watchdog/mongoose_watchdog_default.erl @@ -0,0 +1,235 @@ +%%%------------------------------------------------------------------- +%%% @author DenysGonchar +%%% @copyright (C) 2018, +%%% @doc +%%% +%%% @end +%%% Created : 05. Oct 2018 16:08 +%%%------------------------------------------------------------------- +-module(mongoose_watchdog_default). +-author("DenysGonchar"). + +-behaviour(gen_event). + + +%% gen_event callbacks +-export([init/1, + handle_event/2, + handle_call/2, + terminate/2, + code_change/3]). + +%% API +-export([add_handler/2, + verify/0]). + +-define(DEFAULT_FN, fun default_fn/2). +-define(HIGH_MEM_PROCS, [?MODULE, high_mem_procs_no]). + +-record(state, { memory_threshold = 0, + verify_timeout = 0, %seconds + timer_ref = undefined, + fn = ?DEFAULT_FN, + fn_state = []}). + +%%%=================================================================== +%%% API +%%%=================================================================== +-spec add_handler(Handler :: atom() | {atom(), term()}, + Args :: term()) -> + term(). +add_handler(Fn, Args) when is_function(Fn, 2) -> + mongoose_watchdog:call(?MODULE, {set_fn, Fn, Args}). + +verify() -> + mongoose_watchdog:call(?MODULE,verify). + + +%%%=================================================================== +%%% gen_event callbacks +%%%=================================================================== + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever a new event handler is added to an event manager, +%% this function is called to initialize the event handler. +%% +%% @end +%%-------------------------------------------------------------------- +-spec init(InitArgs :: term()) -> + {ok, State :: #state{}} | + {ok, State :: #state{}, hibernate} | + {error, Reason :: term()}. +init(Args) -> + Timeout = proplists:get_value(timeout, Args, 10), + add_metric(?HIGH_MEM_PROCS, histogram), + spawn(fun process_current_alarms/0), + {ok, #state{verify_timeout = Timeout}}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event manager receives an event sent using +%% gen_event:notify/2 or gen_event:sync_notify/2, this function is +%% called for each installed event handler to handle the event. +%% +%% @end +%%-------------------------------------------------------------------- +-spec(handle_event(Event :: term(), State :: #state{}) -> + {ok, NewState :: #state{}} | + {ok, NewState :: #state{}, hibernate} | + {swap_handler, Args1 :: term(), NewState :: #state{}, + Handler2 :: (atom() | {atom(), Id :: term()}), Args2 :: term()} | + remove_handler). +handle_event({set_alarm, {process_memory_high_watermark, _}}, State) -> + NewState = set_memory_threshold(State), + {ok, process_memory_watermark_alarm(NewState)}; +handle_event({clear_alarm, process_memory_high_watermark}, State) -> + {ok, stop_timer(State)}; +handle_event(_Event, State) -> + {ok, State}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event manager receives a request sent using +%% gen_event:call/3,4, this function is called for the specified +%% event handler to handle the request. +%% +%% @end +%%-------------------------------------------------------------------- +-spec(handle_call(Request :: term(), State :: #state{}) -> + {ok, Reply :: term(), NewState :: #state{}} | + {ok, Reply :: term(), NewState :: #state{}, hibernate} | + {swap_handler, Reply :: term(), Args1 :: term(), NewState :: #state{}, + Handler2 :: (atom() | {atom(), Id :: term()}), Args2 :: term()} | + {remove_handler, Reply :: term()}). +handle_call({set_fn, NewFn, Args}, State) -> + safe_run_fn(terminate, State), + NewState = safe_run_fn(init, State#state{fn = NewFn, fn_state = Args}), + {ok, ok, NewState}; +handle_call(verify, State) -> + NewState = State#state{timer_ref = undefined}, + {ok, ok, process_memory_watermark_alarm(NewState)}; +handle_call({alarms, Alarms}, State) -> + case proplists:is_defined(process_memory_high_watermark, Alarms) of + true -> + NewState = set_memory_threshold(State), + {ok, ok, process_memory_watermark_alarm(NewState)}; + _ -> {ok, ok, State} + end; +handle_call(_Request, State) -> + Reply = ok, + {ok, Reply, State}. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Whenever an event handler is deleted from an event manager, this +%% function is called. It should be the opposite of Module:init/1 and +%% do any necessary cleaning up. +%% +%% @spec terminate(Reason, State) -> void() +%% @end +%%-------------------------------------------------------------------- +-spec terminate(Args :: (term() | {stop, Reason :: term()} | stop | + {error, {'EXIT', Reason :: term()}} | + remove_handler | {error, term()}), + State :: term()) -> term(). +terminate(_Arg, State) -> + safe_run_fn(terminate, State), + ok. + +%%-------------------------------------------------------------------- +%% @private +%% @doc +%% Convert process state when code is changed +%% +%% @end +%%-------------------------------------------------------------------- +-spec code_change(OldVsn :: term() | {down, term()}, + State :: #state{}, + Extra :: term()) -> + {ok, NewState :: #state{}}. +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + +%%%=================================================================== +%%% Internal functions +%%%=================================================================== + +add_metric(Metric, Type) -> + case exometer:info(Metric, name) of + undefined -> exometer:new(Metric, Type); + _ -> ok + end. + +update_metric(Metric, Value) -> + exometer:update(Metric, Value). + +safe_run_fn(terminate, #state{fn = Fn, fn_state = FnState} = State) -> + catch Fn(terminate, FnState), + State#state{fn = ?DEFAULT_FN, fn_state = []}; +safe_run_fn(Event, #state{fn = Fn, fn_state = FnState} = State) -> + NewState = (catch Fn(Event, FnState)), + case NewState of + {'EXIT', _} -> + catch Fn(exception, NewState), + safe_run_fn(terminate, State); + _ -> + State#state{fn_state = NewState} + end. + +set_memory_threshold(State) -> + {Total, _, _} = memsup:get_memory_data(), + Watermark = memsup:get_procmem_high_watermark(), + Threshold = trunc(Watermark * Total / 100), + State#state{memory_threshold = Threshold}. + +stop_timer(State) -> + case State#state.timer_ref of + undefined -> State; + TRef -> + timer:cancel(TRef), + update_metric(?HIGH_MEM_PROCS,0), + State#state{timer_ref = undefined, memory_threshold = 0} + end. + +process_memory_watermark_alarm(#state{memory_threshold = 0} = State) -> State; +process_memory_watermark_alarm(#state{memory_threshold = Threshold, + verify_timeout = Timeout} = State) -> + + Processes = [Pid || Pid <- processes(), + {memory, Mem} <- [process_info(Pid, memory)], + Mem > Threshold], + update_metric(?HIGH_MEM_PROCS, length(Processes)), + NewState = process_all_high_mem_processes(Processes, State), + TRef = timer:apply_after(Timeout * 1000, ?MODULE, verify, []), + NewState#state{timer_ref = TRef}. + + +process_all_high_mem_processes(Processes, State) -> + lists:foldl(fun process_high_mem_process/2, State, Processes). + +process_high_mem_process(Pid, State) -> + case process_info(Pid, [memory, + message_queue_len, + initial_call, + dictionary]) of + [{memory, Mem}, + {message_queue_len, MQL}, + {initial_call, InitCall}, + {dictionary, Dict}] -> + InitialCall = proplists:get_value('$initial_call', Dict, InitCall), + MsgQueueLen = MQL + proplists:get_value('$internal_queue_len', Dict, 0), + Event = {Pid, InitialCall, Mem, MsgQueueLen}, + safe_run_fn(Event, State); + _ -> State + end. + +process_current_alarms() -> + Alarms = mongoose_watchdog:get_alarms(), + mongoose_watchdog:call(?MODULE, {alarms, Alarms}). + +default_fn(_Event, State) -> State.