Commit 15944f6

Merge remote-tracking branch 'OFFICIAL/master'
2 parents: f35f2e8 + e4da2c0

File tree: 6 files changed, +234 -50 lines

NEWS (+28)
@@ -1,5 +1,33 @@
 OpenTSDB - User visible changes.
 
+* Version 2.3.1 (2018-04-21)
+
+Noteworthy Changes:
+  - When setting up aggregators, advance to the first data point equal to or greater
+    than the query start timestamp. This helps with calendar downsampling intervals.
+  - Add support to the Nagios check script for downsampling fill policies.
+
+Bug Fixes:
+  - Fix expression calculation by avoiding double execution and checking both
+    output types for boolean values.
+  - Fix missing tools scripts in builds.
+  - Default to HBase 1.2.5 in the OSX install script.
+  - Upgrade AsyncBigtable to 0.3.1.
+  - Log query stats when a channel is closed unexpectedly.
+  - Add the Java 8 path in the debian init script and remove Java 6.
+  - Pass the column family name to the get requests in the compaction scheduler.
+  - Fix a comparison issue in the UI on group by tags.
+  - Filter annotation queries by the starting timestamp, excluding those in a row that
+    began before the query start time.
+  - Tiny stab at purging backticks from Gnuplot scripts.
+  - Remove the `final` annotation from the meta classes so they can be extended.
+  - Fix the javacc maven plugin version.
+  - Fix the literal_or filter to allow single character filters.
+  - Fix query start stats logging to use ms instead of nano time.
+  - Move Jackson and Netty to newer versions for security reasons.
+  - Upgrade to AsyncHBase 1.8.2 for compatibility with HBase 1.3 and 2.0.
+  - Fix the Highest Current calculation to handle empty time series.
+  - Change the cache hits counters to longs.
 
 * Version 2.3.0 (2016-12-31)
 
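The first noteworthy change above describes a seek rule for aggregator setup: iteration begins at the first data point whose timestamp is equal to or greater than the query start, so points stored earlier in the same row cannot leak into the first (e.g. calendar-aligned) downsampling bucket. A minimal illustrative sketch of that rule in Python, not OpenTSDB's actual Java implementation (function and variable names here are hypothetical):

import bisect

def advance_to_start(timestamps, query_start):
    """Index of the first data point with timestamp >= query_start.

    timestamps must be sorted ascending; bisect_left returns the leftmost
    insertion point for query_start, which is exactly the first element
    equal to or greater than it.
    """
    return bisect.bisect_left(timestamps, query_start)

# Points at 10:58, 11:00 and 11:05 with a query starting at 11:00:
# the 10:58 point is skipped rather than folded into the first bucket.
points = [1525085880, 1525086000, 1525086300]
assert advance_to_start(points, 1525086000) == 1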
src/core/IncomingDataPoint.java (-5)
@@ -128,11 +128,6 @@ public final String getTSUID() {
     return tsuid;
   }
 
-  /** @param moretags the hashmap of kv pair to add */
-  public final void addTags(HashMap<String, String> moretags) {
-    this.tags.putAll(moretags);
-  }
-
   /** @param metric the metric to set */
   public final void setMetric(String metric) {
     this.metric = metric;

src/tsd/AbstractHttpQuery.java (-10)
@@ -165,16 +165,6 @@ public Map<String, String> getHeaders() {
     return headers;
   }
 
-  /**
-   * Return the value of the given HTTP Header
-   * first match wins
-   * @return Header value as string
-   */
-  public String getHeaderValue(final String headerName) {
-    if (headerName == null) { return null; }
-    return request.headers().get(headerName);
-  }
-
   /** @param stats The stats object to mark after writing is complete */
   public void setStats(final QueryStats stats) {
     this.stats = stats;

src/tsd/PutDataPointRpc.java (-18)
@@ -124,7 +124,6 @@ public void execute(final TSDB tsdb, final HttpQuery query)
       throw new BadRequestException("No datapoints found in content");
     }
 
-    final HashMap<String, String> query_tags = new HashMap<String, String>();
     final boolean show_details = query.hasQueryStringParam("details");
     final boolean show_summary = query.hasQueryStringParam("summary");
     final boolean synchronous = query.hasQueryStringParam("sync");
@@ -139,18 +138,6 @@ public void execute(final TSDB tsdb, final HttpQuery query)
     int queued = 0;
     final List<Deferred<Boolean>> deferreds = synchronous ?
         new ArrayList<Deferred<Boolean>>(dps.size()) : null;
-
-    if (tsdb.getConfig().enable_header_tag()) {
-      LOG.debug("Looking for tag header " + tsdb.getConfig().get_name_header_tag());
-      final String header_tag_value = query.getHeaderValue(tsdb.getConfig().get_name_header_tag());
-      if (header_tag_value != null) {
-        LOG.debug(" header found with value:" + header_tag_value);
-        Tags.parse(query_tags, header_tag_value);
-      } else {
-        LOG.debug(" no such header in request");
-      }
-    }
-
     for (final IncomingDataPoint dp : dps) {
 
       /** Handles passing a data point to the storage exception handler if
@@ -183,11 +170,6 @@ public String toString() {
       }
 
       try {
-        /** Add additional tags from HTTP header */
-        if ((query_tags != null) && (query_tags.size() > 0)) {
-          dp.addTags(query_tags);
-        }
-
         if (dp.getMetric() == null || dp.getMetric().isEmpty()) {
           if (show_details) {
             details.add(this.getHttpDetails("Metric name was empty", dp));

src/utils/Config.java (-17)
@@ -97,9 +97,6 @@ public class Config {
   /** tsd.storage.fix_duplicates */
   private boolean fix_duplicates = false;
 
-  /** tsd.http.header_tag */
-  private String http_header_tag = null;
-
   /** tsd.http.request.max_chunk */
   private int max_chunked_requests = 4096;
 
@@ -231,16 +228,6 @@ public int scanner_maxNumRows() {
     return scanner_max_num_rows;
   }
 
-  /** @return whether or not additional http header tag is allowed */
-  public boolean enable_header_tag() {
-    return http_header_tag != null;
-  }
-
-  /** @return the lookup value for additional http header tag */
-  public String get_name_header_tag() {
-    return http_header_tag;
-  }
-
   /** @return whether or not chunked requests are supported */
   public boolean enable_chunked_requests() {
     return enable_chunked_requests;
@@ -548,7 +535,6 @@ protected void setDefaults() {
     default_map.put("tsd.core.stats_with_port", "false");
     default_map.put("tsd.http.show_stack_trace", "true");
     default_map.put("tsd.http.query.allow_delete", "false");
-    default_map.put("tsd.http.header_tag", "");
     default_map.put("tsd.http.request.enable_chunked", "false");
     default_map.put("tsd.http.request.max_chunk", "4096");
     default_map.put("tsd.http.request.cors_domains", "");
@@ -666,9 +652,6 @@ protected void loadStaticVariables() {
     if (this.hasProperty("tsd.http.request.max_chunk")) {
       max_chunked_requests = this.getInt("tsd.http.request.max_chunk");
     }
-    if (this.hasProperty("tsd.http.header_tag")) {
-      http_header_tag = this.getString("tsd.http.header_tag");
-    }
     enable_tree_processing = this.getBoolean("tsd.core.tree.enable_processing");
     fix_duplicates = this.getBoolean("tsd.storage.fix_duplicates");
     scanner_max_num_rows = this.getInt("tsd.storage.hbase.scanner.maxNumRows");

tools/repair-tsd (+206, new file)
@@ -0,0 +1,206 @@
#!/usr/bin/env python3

from subprocess import Popen, PIPE, TimeoutExpired, check_output
from random import shuffle
import time
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging
import pprint

log = logging.getLogger("repair-tsd")
log.setLevel(logging.INFO)
ch = logging.StreamHandler()
logformat = '%(asctime)s %(name)s %(levelname)s %(message)s'
formatter = logging.Formatter(logformat)
ch.setFormatter(formatter)
log.addHandler(ch)


class TSDRepair(object):
    def __init__(self, args):
        self.time_chunk = args.get("time_chunk", 15)
        self.timeout = int(self.time_chunk * 60)
        self.retries = args.get("retries", 1)
        self.multiplier = int(60 / self.time_chunk)
        self.time_range = args.get("time_range", 48)
        self.chunk_count = self.time_range * self.multiplier
        self.tsd_path = args.get("tsd_path", "/usr/share/opentsdb/bin/tsdb")
        self.cfg_path = args.get("cfg_path", "/etc/opentsdb/opentsdb.conf")
        # File tracking metrics that have already been repaired.
        self.store_path = args.get("store_path", "/tmp/opentsdb.list")
        self.use_sudo = args.get("use_sudo", False)
        self.sudo_user = args.get("sudo_user", "opentsdb")
        self.log = logging.getLogger("repair-tsd")
        self.base = "{} fsck --config={}".format(self.tsd_path, self.cfg_path)
        self.check_cmd = "{} uid --config={} metrics".format(self.tsd_path, self.cfg_path)
        if self.use_sudo:
            self.base = "sudo -u {} {}".format(self.sudo_user, self.base)
            self.check_cmd = "sudo -u {} {}".format(self.sudo_user, self.check_cmd)

    def _get_metrics(self):
        """
        Collect all metrics from OpenTSDB

        :returns: all metrics
        :rtype: list
        """
        try:
            with open(self.store_path, 'r') as f_in:
                finished_metrics = [m for m in f_in.read().split('\n') if m]
        except Exception:
            finished_metrics = []
        cmd = '{} uid --config={} grep metrics ".*"'.format(self.tsd_path,
                                                            self.cfg_path)
        proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        results = proc.communicate()
        metrics = [m.split(" ")[1].strip(":")
                   for m in results[0].decode().split("\n") if m]
        metrics = [m for m in metrics if m and m != "\x00" and
                   m not in finished_metrics]
        shuffle(metrics)
        self.log.info("There are {} metrics to process".format(len(metrics)))
        return metrics

    def _repair_metric_chunk(self, metric, chunk):
        """
        Repair one 'chunk' of data for a metric
        """
        self.log.debug("Running chunk {} for {}".format(chunk, metric))
        if chunk < 2:
            timestr = "{}m-ago".format(self.time_chunk)
        else:
            timestr = "{}m-ago {}m-ago".format((chunk + 1) * self.time_chunk,
                                               chunk * self.time_chunk)
        cmd = "{} {} sum".format(self.base, timestr)
        """
        Even though we're chunking, it's worth trying things more than once
        """
        for x in range(1, self.retries + 2):
            self.log.debug("Repair try {} for {}".format(x, timestr))
            fullcmd = "{} {} --fix-all --compact".format(cmd, metric)
            self.log.debug("Full command: {}".format(fullcmd))
            metricproc = Popen(fullcmd, shell=True, stdout=PIPE, stderr=PIPE)
            try:
                results, err = metricproc.communicate(timeout=self.timeout)
            except TimeoutExpired:
                self.log.debug("{} failed to complete in window (run {})".format(metric, x))
                continue
            except Exception as e:
                self.log.error("{} general exception :: {}".format(metric, e))
            else:
                # Keep only the last 26 lines of output (the fsck summary block).
                results = [r for r in results.decode().split("\n") if r][-26:]
                final_results = []
                """
                We'll only collect results that are non-0
                since we're not super interested in stuff that didn't change.
                """
                for r in results:
                    # Strip the timestamp from the log line
                    line = r.split(" ")[6:]
                    try:
                        if int(line[-1]) != 0:
                            final_results.append(" ".join(line))
                    except Exception:
                        final_results.append(" ".join(line))
                result_str = "\n".join(final_results)
                self.log.debug("{} results:\n{}".format(metric, result_str))
                if chunk % 20 == 0:
                    self.log.info("Chunk {} of {} finished".format(chunk, self.chunk_count))
                else:
                    self.log.debug("Chunk {} of {} finished".format(chunk, self.chunk_count))
                try:
                    with open(self.store_path, 'a') as f_out:
                        f_out.write("{}\n".format(metric))
                except Exception:
                    pass
                return None
        else:
            # for/else: no attempt returned, so every try failed or timed out.
            self.log.error("Failed to completely repair {}".format(metric))
            return metric

    def process_metrics(self):
        """
        Run fsck on a list of metrics over a time range
        """
        failed_metrics = []
        metrics = self._get_metrics()
        for index, metric in enumerate(metrics):
            try:
                check_output("{} {}".format(self.check_cmd, metric),
                             shell=True)
            except Exception:
                log.warning("{} doesn't exist! Skipping...".format(metric))
                continue
            logline = "{} ({} of {})".format(metric, index + 1, len(metrics))
            logline += " ({} failed) in {} chunks".format(len(failed_metrics),
                                                          self.chunk_count)
            self.log.info(logline)
            start_time = time.time()
            start_time_min = int(start_time // 60 * 60)
            failed_metrics = [self._repair_metric_chunk(metric, x)
                              for x in range(1, self.chunk_count + 1)]
            failed_metrics = [m for m in failed_metrics if m]
            runtime = time.time() - start_time
            self.log.info("{} repair took {} seconds".format(metric,
                                                             int(runtime)))
        self.log.info("Failed metrics: {}".format(failed_metrics))
        return failed_metrics


def cli_opts():
    parser = ArgumentParser(description="Repair all OpenTSDB metrics",
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--debug", action="store_true", default=False,
                        help="Show debug information")
    parser.add_argument("--time-range", default="48",
                        help="How many hours of time we collect to repair")
    parser.add_argument("--time-chunk", default="15",
                        help="How many minutes of data to scan per chunk")
    parser.add_argument("--retries", default="1",
                        help="How many times we should try failed metrics")
    parser.add_argument("--tsd-path", default="/usr/share/opentsdb/bin/tsdb",
                        help="Path to the OpenTSDB CLI binary")
    parser.add_argument("--cfg-path", default="/etc/opentsdb/opentsdb.conf",
                        help="Path to OpenTSDB config")
    parser.add_argument("--store-path", default="/opentsdb-fsck.list",
                        help="Path to the file tracking already-repaired metrics")
    parser.add_argument("--use-sudo", action="store_true",
                        default=False,
                        help="Switch user when running repairs?")
    parser.add_argument("--sudo-user", default="opentsdb",
                        help="User to switch to when --use-sudo is set")
    return parser.parse_args()


def main():
    args = cli_opts()
    if args.debug:
        log.setLevel(logging.DEBUG)
    try:
        time_range = int(args.time_range)
    except Exception as e:
        log.error("Invalid time range {} :: {}".format(args.time_range, e))
    try:
        retries = int(args.retries)
    except Exception as e:
        log.error("Invalid retry number {} :: {}".format(args.retries, e))
    try:
        time_chunk = int(args.time_chunk)
        if 60 % time_chunk != 0:
            raise ArithmeticError
    except Exception as e:
        log.error("Invalid time chunk {} :: {}".format(args.time_chunk, e))

    repair_tool = TSDRepair({"time_range": time_range,
                             "use_sudo": args.use_sudo,
                             "sudo_user": args.sudo_user,
                             "time_chunk": time_chunk,
                             "tsd_path": args.tsd_path,
                             "cfg_path": args.cfg_path,
                             "store_path": args.store_path,
                             "retries": retries})
    repair_tool.process_metrics()


if __name__ == "__main__":
    main()
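The tool above is normally run through its argparse front end (e.g. --time-range, --time-chunk, --debug), but it can also be driven programmatically by handing TSDRepair the same keys that main() assembles. A minimal sketch of that use, assuming the script sits at tools/repair-tsd relative to the working directory and that the tsdb CLI and config live at the defaults shown; these paths are assumptions, not requirements:

import importlib.machinery
import importlib.util

# The script has no .py extension, so load it by path (assumed location).
loader = importlib.machinery.SourceFileLoader("repair_tsd", "tools/repair-tsd")
spec = importlib.util.spec_from_loader("repair_tsd", loader)
repair_tsd = importlib.util.module_from_spec(spec)
spec.loader.exec_module(repair_tsd)

tool = repair_tsd.TSDRepair({
    "time_range": 48,                            # hours of history to walk
    "time_chunk": 15,                            # minutes of data per fsck run
    "retries": 1,                                # extra attempts per chunk
    "tsd_path": "/usr/share/opentsdb/bin/tsdb",  # tsdb CLI binary
    "cfg_path": "/etc/opentsdb/opentsdb.conf",   # TSD config
    "store_path": "/opentsdb-fsck.list",         # progress file for finished metrics
    "use_sudo": False,
})
failed = tool.process_metrics()  # returns the metrics that never fully repaired
print("Failed metrics:", failed)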
