From 5c20f06d63c05638987c91e0e4dce7c510bfed61 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 31 Jul 2015 00:35:52 +0500 Subject: [PATCH 1/2] --cleanuprss option to clear caches when memory reaches a certain limit. See https://github.com/scrapinghub/splash/issues/216#issuecomment-97890450 --- splash/monitors.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++ splash/server.py | 28 ++++----------- splash/utils.py | 30 +++++++++++++++- 3 files changed, 126 insertions(+), 22 deletions(-) create mode 100644 splash/monitors.py diff --git a/splash/monitors.py b/splash/monitors.py new file mode 100644 index 000000000..12a082e19 --- /dev/null +++ b/splash/monitors.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +""" Splash periodic monitoring tasks """ +from __future__ import absolute_import, division +import gc +import time + +from splash.utils import memory_to_absolute, get_ru_maxrss, get_mem_usage, MB +from splash.qtutils import clear_caches + + +def monitor_maxrss(maxrss, check_intreval=60): + from twisted.internet import reactor, task + from twisted.python import log + + maxrss = memory_to_absolute(maxrss) + + def check_maxrss(): + if get_ru_maxrss() > maxrss * MB: + log.msg("maxrss exceeded %d MB, shutting down..." % maxrss) + reactor.stop() + + if maxrss: + log.msg("maxrss limit: %d MB" % maxrss) + t = task.LoopingCall(check_maxrss) + t.start(check_intreval, now=False) + + +def monitor_currss(threshold, verbosity, min_interval=30, check_interval=10): + """ + Monitor current memory usage and try to free memory + if it exceeds a `threshold` (in MB) and at least `min_interval` + seconds passed since last cleanup. + + Memory is measured on event loop ticks. Temporary memory usage + spikes may not be taken in account. 
+ """ + from twisted.internet import task + from twisted.python import log + + objgraph = None + if verbosity >= 3: + try: + import objgraph + objgraph.show_growth() + except ImportError: + pass + + threshold = memory_to_absolute(threshold) + last_cleanup = [-1.0] + + def check_memusage(): + rss = get_mem_usage() + peak = get_ru_maxrss() + + if verbosity >= 2: + log.msg("Memory usage: %0.1fMB (%0.1fMB peak)" % (rss / MB, + peak / MB)) + + if rss > threshold * MB: + now = time.time() + interval = now - last_cleanup[0] + if interval > min_interval: + if verbosity >= 1: + log.msg( + "Splash uses too much memory: %0.1f > %0.1f. " + "Cleaning up WebKit caches.." % (rss / MB, threshold) + ) + + clear_caches() + gc.collect() + + rss_new = get_mem_usage() + if verbosity >= 1: + log.msg("Memory freed: %0.1f MB" % ((rss - rss_new) / MB)) + last_cleanup[0] = time.time() + + if verbosity >= 3 and objgraph: + objgraph.show_growth(limit=100) + else: + if verbosity >= 2: + log.msg( + "Splash uses too much memory (%0.1f > %0.1f.), but " + "the cache was cleared recently (%0.1f seconds ago)" % + (rss / MB, threshold, interval) + ) + + if threshold: + log.msg("cleanup threshold: %d MB" % threshold) + t = task.LoopingCall(check_memusage) + t.start(check_interval, now=False) diff --git a/splash/server.py b/splash/server.py index dc455891c..a178ad440 100644 --- a/splash/server.py +++ b/splash/server.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, division import os import sys import optparse @@ -10,6 +10,8 @@ from splash import defaults, __version__ from splash import xvfb from splash.qtutils import init_qt_app +from splash.monitors import monitor_currss, monitor_maxrss + def install_qtreactor(verbose): init_qt_app(verbose) @@ -24,6 +26,8 @@ def parse_opts(): op.add_option("-f", "--logfile", help="log file") op.add_option("-m", "--maxrss", type=float, default=0, help="exit if max RSS reaches this value (in MB or ratio of physical mem) 
(default: %default)") + op.add_option("--cleanuprss", type=float, default=0, + help="clean WebKit caches if current RSS reaches this value (in MB or ratio of physical mem) (default: %default)") op.add_option("-p", "--port", type="int", default=defaults.SPLASH_PORT, help="port to listen to (default: %default)") op.add_option("-s", "--slots", type="int", default=defaults.SLOTS, @@ -100,6 +104,7 @@ def start_logging(opts): def splash_started(opts, stderr): if opts.logfile: stderr.write("Splash started - logging to: %s\n" % opts.logfile) + print("Splash started") def bump_nofile_limit(): @@ -219,26 +224,6 @@ def splash_server(portnum, slots, network_manager, max_timeout, reactor.listenTCP(proxy_portnum, proxy_server_factory) -def monitor_maxrss(maxrss): - from twisted.internet import reactor, task - from twisted.python import log - from splash.utils import get_ru_maxrss, get_total_phymem - - # Support maxrss as a ratio of total physical memory - if 0.0 < maxrss < 1.0: - maxrss = get_total_phymem() * maxrss / (1024 ** 2) - - def check_maxrss(): - if get_ru_maxrss() > maxrss * (1024 ** 2): - log.msg("maxrss exceeded %d MB, shutting down..." % maxrss) - reactor.stop() - - if maxrss: - log.msg("maxrss limit: %d MB" % maxrss) - t = task.LoopingCall(check_maxrss) - t.start(60, now=False) - - def default_splash_server(portnum, max_timeout, slots=None, cache_enabled=None, cache_path=None, cache_size=None, proxy_profiles_path=None, js_profiles_path=None, @@ -347,6 +332,7 @@ def main(): install_qtreactor(opts.verbosity >= 5) monitor_maxrss(opts.maxrss) + monitor_currss(opts.cleanuprss, opts.verbosity) if opts.manhole: manhole_server() diff --git a/splash/utils.py b/splash/utils.py index 3d1b0bb07..438d142a4 100644 --- a/splash/utils.py +++ b/splash/utils.py @@ -11,6 +11,7 @@ import psutil +MB = 1024*1024 _REQUIRED = object() @@ -80,11 +81,38 @@ def get_ru_maxrss(): return size +def get_mem_usage(): + """ + Return RSS usage of the current process (in bytes). 
+ >>> MB = 1024*1024 + >>> 5*MB < get_mem_usage() < 2048*MB + True + """ + proc = psutil.Process(os.getpid()) + try: + return proc.memory_info().rss + except AttributeError: + # psutil < 2.x + return proc.get_memory_info()[0] + + +def memory_to_absolute(ratio): + """ + Calculate absolute RSS value given a ratio of total physical memory. + If 0 < ratio < 1.0 then ratio is converted to an absolute value in MB; + otherwise it is considered already absolute and returned as-is. + """ + from splash.utils import get_total_phymem + if 0.0 < ratio < 1.0: + return get_total_phymem() * ratio / MB + return ratio + + def get_total_phymem(): """ Return the total amount of physical memory available. """ try: return psutil.virtual_memory().total - except AttributeError: # psutil < 2.0 + except AttributeError: # psutil < 2.0 return psutil.phymem_usage().total From 874bacd6fd9d66103e31df6333a79cf90f1029dc Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 31 Jul 2015 00:58:46 +0500 Subject: [PATCH 2/2] cleanup log message --- splash/server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/splash/server.py b/splash/server.py index a178ad440..2de65ec1e 100644 --- a/splash/server.py +++ b/splash/server.py @@ -104,7 +104,8 @@ def start_logging(opts): def splash_started(opts, stderr): if opts.logfile: stderr.write("Splash started - logging to: %s\n" % opts.logfile) - print("Splash started") + else: + stderr.write("Splash started") def bump_nofile_limit():