|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +from subprocess import Popen, PIPE, TimeoutExpired, check_output |
| 4 | +from random import shuffle |
| 5 | +import time |
| 6 | +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter |
| 7 | +import logging |
| 8 | +import pprint |
| 9 | + |
| 10 | +log = logging.getLogger("repair-tsd") |
| 11 | +log.setLevel(logging.INFO) |
| 12 | +ch = logging.StreamHandler() |
| 13 | +logformat = '%(asctime)s %(name)s %(levelname)s %(message)s' |
| 14 | +formatter = logging.Formatter(logformat) |
| 15 | +ch.setFormatter(formatter) |
| 16 | +log.addHandler(ch) |
| 17 | + |
| 18 | + |
| 19 | +class TSDRepair(object): |
| 20 | + def __init__(self, args): |
| 21 | + self.time_chunk = args.get("time_chunk", 15) |
| 22 | + self.timeout = int(self.time_chunk * 60) |
| 23 | + self.retries = args.get("retries", 1) |
| 24 | + self.multiplier = int(60 / self.time_chunk) |
| 25 | + self.time_range = args.get("time_range", 48) |
| 26 | + self.chunk_count = self.time_range * self.multiplier |
| 27 | + self.tsd_path = args.get("tsd_path", "/usr/share/opentsdb/bin/tsdb") |
| 28 | + self.cfg_path = args.get("cfg_path", "/etc/opentsdb/opentsdb.conf") |
| 29 | + self.use_sudo = args.get("use_sudo", False) |
| 30 | + self.sudo_user = args.get("sudo_user", "opentsdb") |
| 31 | + self.log = logging.getLogger("repair-tsd") |
| 32 | + self.base = "{} fsck --config={}".format(self.tsd_path, self.cfg_path) |
| 33 | + self.check_cmd = "{} uid --config={} metrics".format(self.tsd_path, self.cfg_path) |
| 34 | + if self.use_sudo: |
| 35 | + self.base = "sudo -u {} {}".format(self.sudo_user, self.base) |
| 36 | + self.check_cmd = "sudo -u {} {}".format(self.sudo_user, self.check_cmd) |
| 37 | + |
| 38 | + def _get_metrics(self): |
| 39 | + """ |
| 40 | + Collect all metrics from OpenTSDB |
| 41 | +
|
| 42 | + :returns: all metrics |
| 43 | + :rtype: list |
| 44 | + """ |
| 45 | + try: |
| 46 | + self.store_path = args.get('store_path', '/tmp/opentsdb.list') |
| 47 | + with open(self.store_path, 'r') as f_in: |
| 48 | + finished_metrics = [m for m in f_in.read().split('\n') if m] |
| 49 | + except Exception: |
| 50 | + finished_metrics = [] |
| 51 | + cmd = '{} uid --config={} grep metrics ".*"'.format(self.tsd_path, |
| 52 | + self.cfg_path) |
| 53 | + proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) |
| 54 | + results = proc.communicate() |
| 55 | + metrics = [m.split(" ")[1].strip(":") |
| 56 | + for m in results[0].decode().split("\n") if m] |
| 57 | + metrics = [m for m in metrics if m and m != "\x00" and |
| 58 | + m not in finished_metrics] |
| 59 | + shuffle(metrics) |
| 60 | + self.log.info("There are {} metrics to process".format(len(metrics))) |
| 61 | + return metrics |
| 62 | + |
| 63 | + def _repair_metric_chunk(self, metric, chunk): |
| 64 | + """ |
| 65 | + Repair one 'chunk' of data for a metric |
| 66 | + """ |
| 67 | + self.log.debug("Running chunk {} for {}".format(chunk, metric)) |
| 68 | + if chunk < 2: |
| 69 | + timestr = "{}m-ago".format(self.time_chunk) |
| 70 | + else: |
| 71 | + timestr = "{}m-ago {}m-ago".format((chunk + 1) * self.time_chunk, |
| 72 | + chunk * self.time_chunk) |
| 73 | + cmd = "{} {} sum".format(self.base, timestr) |
| 74 | + """ |
| 75 | + Even though we're chunking, it's worth trying things more than once |
| 76 | + """ |
| 77 | + for x in range(1, self.retries + 2): |
| 78 | + self.log.debug("Repair try {} for {}".format(x, timestr)) |
| 79 | + fullcmd = "{} {} --fix-all --compact".format(cmd, metric) |
| 80 | + self.log.debug("Full command: {}".format(fullcmd)) |
| 81 | + metricproc = Popen(fullcmd, shell=True, stdout=PIPE, stderr=PIPE) |
| 82 | + try: |
| 83 | + results, err = metricproc.communicate(timeout=self.timeout) |
| 84 | + except TimeoutExpired: |
| 85 | + self.log.debug("{} failed to complete in window (run {})".format(metric, x)) |
| 86 | + continue |
| 87 | + except Exception as e: |
| 88 | + self.log.error("{} general exception :: {}".format(metric, |
| 89 | + e)) |
| 90 | + else: |
| 91 | + results = [r for r in results.decode().split("\n") if r][-26:] |
| 92 | + final_results = [] |
| 93 | + """ |
| 94 | + We'll only collect results that are non-0 |
| 95 | + since we're not super interested in stuff that didn't change. |
| 96 | + """ |
| 97 | + for r in results: |
| 98 | + # Strip the timestamp from the log line |
| 99 | + line = r.split(" ")[6:] |
| 100 | + try: |
| 101 | + if int(line[-1]) != 0: |
| 102 | + final_results.append(" ".join(line)) |
| 103 | + except Exception: |
| 104 | + final_results.append(" ".join(line)) |
| 105 | + result_str = "\n".join(final_results) |
| 106 | + self.log.debug("{} results:\n{}".format(metric, result_str)) |
| 107 | + if chunk % 20 == 0: |
| 108 | + self.log.info("Chunk {} of {} finished".format(chunk, self.chunk_count)) |
| 109 | + else: |
| 110 | + self.log.debug("Chunk {} of {} finished".format(chunk, self.chunk_count)) |
| 111 | + try: |
| 112 | + with open(self.store_path, 'a') as f_out: |
| 113 | + f_out.write("{}\n".format(metric)) |
| 114 | + except Exception: |
| 115 | + pass |
| 116 | + return None |
| 117 | + else: |
| 118 | + self.log.error("Failed to completely repair {}".format(metric)) |
| 119 | + return metric |
| 120 | + |
| 121 | + def process_metrics(self): |
| 122 | + """ |
| 123 | + Run fsck on a list of metrics over a time range |
| 124 | + """ |
| 125 | + failed_metrics = [] |
| 126 | + metrics = self._get_metrics() |
| 127 | + for index, metric in enumerate(metrics): |
| 128 | + try: |
| 129 | + check_output("{} {}".format(self.check_cmd, metric), |
| 130 | + shell=True) |
| 131 | + except Exception: |
| 132 | + log.warning("{} doesn't exist! Skipping...".format(metric)) |
| 133 | + continue |
| 134 | + logline = "{} ({} of {})".format(metric, index + 1, len(metrics)) |
| 135 | + logline += " ({} failed) in {} chunks".format(len(failed_metrics), |
| 136 | + self.chunk_count) |
| 137 | + self.log.info(logline) |
| 138 | + start_time = time.time() |
| 139 | + start_time_min = int(start_time//60 * 60) |
| 140 | + failed_metrics = [self._repair_metric_chunk(metric, x) |
| 141 | + for x in range(1, self.chunk_count + 1)] |
| 142 | + failed_metrics = [m for m in failed_metrics if m] |
| 143 | + runtime = time.time() - start_time |
| 144 | + self.log.info("{} repair took {} seconds".format(metric, |
| 145 | + int(runtime))) |
| 146 | + self.log.info("Failed metrics: {}".format(failed_metrics)) |
| 147 | + return failed_metrics |
| 148 | + |
| 149 | + |
| 150 | +def cli_opts(): |
| 151 | + parser = ArgumentParser(description="Repair all OpenTSDB metrics", |
| 152 | + formatter_class=ArgumentDefaultsHelpFormatter) |
| 153 | + parser.add_argument("--debug", action="store_true", default=False, |
| 154 | + help="Show debug information") |
| 155 | + parser.add_argument("--time-range", default="48", |
| 156 | + help="How many hours of time we collect to repair") |
| 157 | + parser.add_argument("--time-chunk", default="15", |
| 158 | + help="How many minutes of data to scan per chunk") |
| 159 | + parser.add_argument("--retries", default="1", |
| 160 | + help="How many times we should try failed metrics") |
| 161 | + parser.add_argument("--tsd-path", default="/usr/share/opentsdb/bin/tsdb", |
| 162 | + help="Path to the OpenTSDB CLI binary") |
| 163 | + parser.add_argument("--cfg-path", default="/etc/opentsdb/opentsdb.conf", |
| 164 | + help="Path to OpenTSDB config") |
| 165 | + parser.add_argument("--store-path", default="/opentsdb-fsck.list", |
| 166 | + help="Path to OpenTSDB config") |
| 167 | + parser.add_argument("--use-sudo", action="store_true", |
| 168 | + default=False, |
| 169 | + help="switch user when running repairs?") |
| 170 | + parser.add_argument("--sudo-user", default="opentsdb", |
| 171 | + help="User to switch to...") |
| 172 | + return parser.parse_args() |
| 173 | + |
| 174 | + |
| 175 | +def main(): |
| 176 | + args = cli_opts() |
| 177 | + if args.debug: |
| 178 | + log.setLevel(logging.DEBUG) |
| 179 | + try: |
| 180 | + time_range = int(args.time_range) |
| 181 | + except Exception as e: |
| 182 | + log.error("Invalid time range {} :: {}".format(args.time_range, e)) |
| 183 | + try: |
| 184 | + retries = int(args.retries) |
| 185 | + except Exception as e: |
| 186 | + log.error("Invalid retry number {} :: {}".format(args.retries, e)) |
| 187 | + try: |
| 188 | + time_chunk = int(args.time_chunk) |
| 189 | + if 60 % time_chunk != 0: |
| 190 | + raise ArithmeticError |
| 191 | + except Exception as e: |
| 192 | + log.error("Invalid time chunk {} :: {}".format(args.retries, e)) |
| 193 | + |
| 194 | + repair_tool = TSDRepair({"time_range": time_range, |
| 195 | + "use_sudo": args.use_sudo, |
| 196 | + "sudo_user": args.sudo_user, |
| 197 | + "time_chunk": time_chunk, |
| 198 | + "tsd_path": args.tsd_path, |
| 199 | + "cfg_path": args.cfg_path, |
| 200 | + "store_path": args.store_path, |
| 201 | + "retries": retries}) |
| 202 | + repair_tool.process_metrics() |
| 203 | + |
| 204 | + |
| 205 | +if __name__ == "__main__": |
| 206 | + main() |
0 commit comments