-
Notifications
You must be signed in to change notification settings - Fork 1
/
pg_check.py
1989 lines (1765 loc) · 103 KB
/
pg_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#!/usr/bin/env python2
#!/usr/bin/env python
#!/usr/bin/python
### pg_check.py
###############################################################################
### COPYRIGHT NOTICE FOLLOWS. DO NOT REMOVE
###############################################################################
### Copyright (c) 1998 - 2023 SQLEXEC LLC
###
### Permission to use, copy, modify, and distribute this software and its
### documentation for any purpose, without fee, and without a written agreement
### is hereby granted, provided that the above copyright notice and this paragraph
### and the following two paragraphs appear in all copies.
###
### IN NO EVENT SHALL SQLEXEC LLC BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
### INDIRECT SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
### ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
### SQLEXEC LLC HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
###
### SQLEXEC LLC SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
### LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
### PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
### AND SQLEXEC LLC HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
### ENHANCEMENTS, OR MODIFICATIONS.
###
###############################################################################
#
# Original Author: Michael Vitale, [email protected]
#
# Description: This python utility does email alerting based on specific input actions.
#
# Inputs: all fields are optional except database.
# -h <hostname or IP address>
# -p <PORT>
# -U <db user>
# -n <schema>
# -d <database>
# -g general boolean to do all other stuff as well
# -w <number> check waits/locks > number seconds
# -l <number> check for long running queries > number minutes
# -i <number> check for idle in trans > number minutes
# -o <number> check for idle connections > number minutes
# -e environment ID, aka, hostname, dbname, SDLC name, etc.
# -t [testing mode with testing email]
# -v [verbose output flag, mostly used for debugging]
#
# Requirements:
# 1. python 3
# 2. psql client
# 3. sendmail service running on host (alternative: postfix). Check /var/log/maillog
#
# Assumptions:
# 1. db user defaults to postgres if not provided as parameter.
# 2. Password must be in local .pgpass file or client authentication changed to trust or peer
# 3. psql must be in the user's path
# 4. Make sure timing and pager are turned off (see .psqlrc)
# 5. slack webhook must be in specified file, ~/.slackhook
#
# Cron Job Info:
# View cron job output: view /var/log/cron
# Example cron job that does checks against the cluster and nothing else (pgbackrest, pgbouncer) and logs warnings to email and slack
# * * * * * /var/lib/pgsql/pg_check/pg_check.py -h localhost -p 5416 -U postgres -d clone_testing -o 2440 -w 10 -l 60 -i 30 -e CLONE_TESTING -g -m -s
#
# TODOs:
#
# History:
# who did it Date did what
# ========== ========= ==============================
# Michael Vitale 09/06/2021 Original coding using python 3.x CentOS 8.3 and PG 11.x
# Michael Vitale 09/14/2021 Modified parameter structure and some fixes
# Michael Vitale 09/27/2021 Added new functionality for idle connections
# Michael Vitale 09/28/2021 filter out DataFileRead-IO as a considered wait condition
# Michael Vitale 05/29/2022 detect cpu automatically if localhost and report it.
# Michael Vitale 12/12/2023 v1.3 Major Upgrade: added slack notification method, bug fixes
# Michael Vitale 12/16/2023 Fixed PG major and minor version checking based on latest versions.
# Michael Vitale 12/17/2023 Enhancement: Control how often alerts are done based on history alert file.
# Michael Vitale 12/21/2023 Enhancement: Add PGBouncer and PGBackrest checks
# Michael Vitale 12/26/2023 Enhancement: Add warnings from current PG log file (local only)
# Michael Vitale 01/05/2024 Enhancement: Use calculated formula for size to determine vacuum freeze candidates since the pg_table_size() func can cause wait/lock conditions
################################################################################################################
import string, sys, os, time
#import datetime
from datetime import datetime, timedelta
from datetime import date
import tempfile, platform, math
from decimal import *
import smtplib
import subprocess
from subprocess import Popen, PIPE
from optparse import OptionParser
import getpass
#############################################################################################
# Module-level constants.

# Status codes returned as the first element of the (rc, results) tuples
# produced by the methods of the maint class.
SUCCESS = 0
ERROR = -1
ERROR2 = -2
ERROR3 = -3
WARNING = -4
# NOTE(review): the positive codes below are not used by the methods visible
# in this part of the file; presumably consumed further down -- confirm.
DEFERRED = 1
NOTICE = 2
TOOLONG = 3
HIGHLOAD = 4

# Program identification, printed in the start-up banner.
DESCRIPTION = "This python utility program issues email/slack alerts for waits, locks, idle in trans, long queries."
VERSION = 1.4
PROGNAME = "pg_check"
ADATE = "Jan 5, 2024"
PROGDATE = "2024-01-05"

# Prefixes for the status lines printed to standard output.
MARK_OK = "[ OK ] "
MARK_WARN = "[WARN] "

# Alert type names: passed to alert()/log_alert() and stored in the
# pg_check.alerts history file.
TESTALERT = "TestAlert"
DIRSIZE = "DirSize"
IDLECONNS = "IdleConns"
WAITS = "Waits"
IDLEINTRANS = "IdleInTrans"
LONGQUERY = "LongQuery"
ACTIVECONNS = "ActiveConns"
LOAD1 = "Load1"
LOAD5 = "Load5"
LOAD15 = "Load15"
PGBOUNCER1 = "PGBouncer1"
PGBOUNCER2 = "PGBouncer2"
PGBOUNCER3 = "PGBouncer3"
PGBACKREST1 = "PGBackrest1"
REPLICATION = "Replication"
PGHOSTUP = "PGHostUp"
#############################################################################################
########################### class definition ################################################
#############################################################################################
class maint:
def __init__(self):
    """Initialise every attribute to a neutral default.

    The real run-time values are supplied later through set_dbinfo().
    The only external resource touched here is the optional
    ~/.slackhook file, whose first line is read as the Slack webhook URL.
    """
    # program release date versus today
    self.dateprogstr = PROGDATE
    self.dateprog = datetime.strptime(PROGDATE, "%Y-%m-%d")
    self.datenowstr = datetime.now().strftime("%Y-%m-%d")
    self.datenow = datetime.today()
    self.datediff = self.datenow - self.dateprog

    # set_dbinfo() stores this option as "genchecks"; the original
    # misspelling is kept as well in case other code still reads it.
    self.genchecks = ''
    self.genchechs = ''

    # thresholds: -1 means "check not requested"
    self.waitslocks = -1
    self.longquerymins = -1
    self.idleintransmins = -1
    self.idleconnmins = -1
    self.cpus = -1

    self.environment = ''
    self.local = False
    self.dbhost = ''
    self.dbport = 5432
    self.dbuser = ''
    self.database = ''
    self.schema = ''
    self.testmode = False
    self.verbose = False
    self.debug = False
    self.connected = False
    self.slacknotify = False
    self.mailnotify = False
    self.checkreplication = False
    self.checkpgbouncer = False
    self.checkpgbackrest = False

    # The slack hook lives in the user's home dir/.slackhook file.  It is
    # optional: when the file is absent or unreadable the hook stays empty
    # instead of aborting object construction.
    self.slackhook = ''
    hookfile = os.path.expanduser("~") + '/.slackhook'
    try:
        with open(hookfile) as f:
            self.slackhook = f.readline().strip('\n')
    except OSError:
        # deliberate best effort: an empty hook means "no Slack configured"
        pass

    self.to = '[email protected]'
    self.from_ = '[email protected]'
    self.fout = ''
    self.connstring = ''
    self.schemaclause = ' '
    self.pid = os.getpid()
    self.opsys = ''
    self.dir_delim = ''
    self.tempfile = ''
    self.workfile = ''
    self.workfile_deferred = ''
    self.reportfile = ''
    self.tempdir = tempfile.gettempdir()
    self.pgbindir = ''
    self.pgversionmajor = Decimal('0.0')
    self.pgversionminor = '0.0'
    self.programdir = ''
    self.alertsfile = ''
    self.alertslist = []
    # minimum number of seconds between two alerts of the same type
    # (default is 15 mins)
    self.alertmaxsecs = 900
    self.slaves = []
    self.slavecnt = 0
    self.in_recovery = False
    self.bloatedtables = False
    self.unusedindexes = False
    self.freezecandidates = False
    self.analyzecandidates = False
    self.timestartmins = time.time() / 60

    # db config stuff, filled in by get_configinfo()
    self.archive_mode = ''
    self.max_connections = -1
    self.datadir = ''
    self.logdir = ''
    self.waldir = ''
    self.shared_buffers = -1
    self.work_mem = -1
    self.maint_work_mem = -1
    self.eff_cache_size = -1
    self.shared_preload_libraries = ''
    self.pg_type = 'community'
    self.overcommit_memory = -1
    self.overcommit_ratio = -1
###########################################################
def send_alert(self, to, from_, subject, body):
    """Send an alert by e-mail and/or Slack, depending on the notify flags.

    Parameters:
        to      -- one or more recipient addresses separated by whitespace
        from_   -- accepted for interface compatibility; not used
        subject -- alert subject; the environment name is prefixed to it
        body    -- alert text (may be empty)

    Returns 0 when every enabled channel succeeded (or none is enabled),
    otherwise the non-zero exit status of a failing channel.

    The external programs are started with argument lists rather than a
    shell command line, so quotes, '$' or backticks inside the subject or
    body (which may contain SQL text) cannot break or alter the command.
    """
    import json  # local import: json is not among the file-level imports

    rc = 0
    title = self.environment + ' ' + subject

    if self.mailnotify:
        if self.verbose:
            print ("[****] sending email...")
        # assumes nonprintables are already removed from the body, else it
        # is sent as an attachment and not in the body of the email
        try:
            done = subprocess.run(['mailx', '-s', title] + to.split(),
                                  input=(body + '\n').encode('utf-8'))
            rc = done.returncode
        except OSError as e:
            print ("[WARN] unable to run mailx: %s" % e)
            rc = 127

    if self.slacknotify:
        if self.verbose:
            print ("[****] sending to slack...")
        if body == '':
            text = title
        else:
            text = title + ':' + body
        if not self.slackhook:
            print ("[WARN] slack notification requested but no slack hook is configured")
            slack_rc = 1
        else:
            # json.dumps produces a valid, fully escaped JSON document
            payload = json.dumps({'text': text})
            try:
                # curl prints an "ok" string on success, so discard its output
                done = subprocess.run(['curl', '--location', self.slackhook,
                                       '--header', 'Content-Type: application/json',
                                       '--data', payload],
                                      stdout=subprocess.DEVNULL,
                                      stderr=subprocess.DEVNULL)
                slack_rc = done.returncode
            except OSError as e:
                print ("[WARN] unable to run curl: %s" % e)
                slack_rc = 127
        # keep an earlier mail failure visible when slack succeeded
        if slack_rc != 0:
            rc = slack_rc
    return rc
###########################################################
def set_dbinfo(self, dbhost, dbport, dbuser, database, schema, genchecks, waitslocks, longquerymins, idleintransmins, idleconnmins, cpus, \
               environment, testmode, verbose, debug, slacknotify, mailnotify, checkreplication, checkpgbouncer, checkpgbackrest, argv):
    """Store the run-time options, validate them and check the PG host.

    Threshold arguments (waitslocks, longquerymins, idleintransmins,
    idleconnmins, cpus) use -999 to mean "not passed"; any other value
    must be 1 or greater.

    Steps, in order:
      1. copy the options onto the instance and validate the thresholds;
      2. derive the work file names and the psql connection string;
      3. make sure psql is on the PATH;
      4. load the last 30 lines of the alert history file;
      5. test the connection with "SELECT 1" (a failure raises the
         PGHOSTUP alert);
      6. read the server version, then the server configuration.

    Returns a (rc, message) tuple; rc is SUCCESS only when every step
    succeeded.
    """
    not_passed = -999

    def invalid(value):
        # a supplied threshold must be a number of at least 1
        return value is None or value < 1

    # stored unconditionally, as before, so the raw value stays visible
    self.waitslocks = waitslocks
    self.dbhost = dbhost
    self.dbport = dbport
    self.dbuser = dbuser
    self.database = database
    self.schema = schema
    self.genchecks = genchecks
    self.environment = environment
    self.testmode = testmode
    self.verbose = verbose
    self.debug = debug
    self.slacknotify = slacknotify
    self.mailnotify = mailnotify
    self.checkreplication = checkreplication
    self.checkpgbouncer = checkpgbouncer
    self.checkpgbackrest = checkpgbackrest

    if waitslocks != not_passed:
        if invalid(waitslocks):
            return ERROR, "Invalid waitslocks provided: %s" % waitslocks
        self.waitslocks = waitslocks

    if cpus == not_passed:
        # count the processors of the machine running this script
        detected = os.cpu_count()
        if detected is None:
            print ("Unable to get CPU count.")
        else:
            self.cpus = detected
    elif invalid(cpus):
        return ERROR, "Invalid CPUs provided: %s" % cpus
    else:
        self.cpus = cpus

    for attr, value in (('longquerymins', longquerymins),
                        ('idleintransmins', idleintransmins),
                        ('idleconnmins', idleconnmins)):
        if value == not_passed:
            continue
        if invalid(value):
            return ERROR, "Invalid %s provided: %s" % (attr, value)
        setattr(self, attr, value)

    if self.testmode:
        self.to = '[email protected] '
    else:
        self.to = '[email protected]'

    total = len(argv)
    cmdargs = str(argv)

    if os.name == 'posix':
        self.opsys = 'posix'
        self.dir_delim = '/'
    elif os.name == 'nt':
        self.opsys = 'nt'
        self.dir_delim = '\\'
    else:
        return ERROR, "Unsupported platform."

    # per-process work files inside the temp directory
    self.workfile = "%s%s%s_stats.sql" % (self.tempdir, self.dir_delim, self.pid)
    self.workfile_deferred = "%s%s%s_stats_deferred.sql" % (self.tempdir, self.dir_delim, self.pid)
    self.tempfile = "%s%s%s_temp.sql" % (self.tempdir, self.dir_delim, self.pid)
    self.reportfile = "%s%s%s_report.txt" % (self.tempdir, self.dir_delim, self.pid)

    # construct the connection string that will be used in all database requests
    # do not provide host name and/or port if not provided
    self.connstring = ''
    if self.dbhost != '':
        self.connstring = " -h %s " % self.dbhost
    if self.database != '':
        self.connstring += " -d %s " % self.database
    if self.dbport != '':
        self.connstring += " -p %s " % self.dbport
    if self.dbuser != '':
        self.connstring += " -U %s " % self.dbuser
    if self.schema != '':
        self.schemaclause = " and n.nspname = '%s' " % self.schema

    # check if local connection for automatic checking of cpus, mem, etc.
    if 'localhost' in self.dbhost or '127.0.0.1' in self.dbhost or dbhost == '':
        self.local = True

    if self.verbose:
        print ("The total numbers of args passed to the script: %d " % total)
        print ("Args list: %s " % cmdargs)
        print ("connection string: %s" % self.connstring)

    self.programdir = sys.path[0]

    # Make sure psql is in the path
    if self.opsys == 'posix':
        cmd = "which psql"
    else:
        # assume windows
        cmd = "where psql"
    rc, results = self.executecmd(cmd, True)
    if rc != SUCCESS:
        errors = "Unable to determine if psql is in path. rc=%d results=%s" % (rc,results)
        return rc, errors
    if 'psql' not in results:
        msg = "psql must be in the path. rc=%d, results=%s" % (rc, results)
        return ERROR, msg
    pos = results.find('psql')
    if pos > 0:
        self.pgbindir = results[0:pos]

    # get history of alerts to control current session alerts
    self.alertsfile = self.programdir + '/' + 'pg_check.alerts'
    if os.path.isfile(self.alertsfile):
        # keep the last 30 alerts for subsequent checking; blank lines
        # carry no alert and are dropped
        with open(self.alertsfile) as file:
            recent = file.readlines()[-30:]
        self.alertslist += [aline.strip() for aline in recent if aline.strip()]

    # See if we can even connect to the PG host before asking it anything
    # else.  If not, treat as PG host down warning.
    sql = 'SELECT 1'
    cmd = "psql %s -At -X -c \"%s\" > %s" % (self.connstring, sql, self.tempfile)
    rc, results = self.executecmd(cmd, False)
    if rc != SUCCESS:
        if 'could not connect to server' in results or 'Connection refused' in results:
            msg = 'PG Connection Refused.'
        else:
            msg = 'Unexpected PG Connection Error'
        if self.alert(PGHOSTUP):
            # the outcome of the notification must not replace the
            # connection error code returned to the caller
            self.send_alert(self.to, self.from_, msg, '')
        print (MARK_WARN+msg)
        return rc, results

    # the version is read before the configuration because the WAL
    # directory name chosen in get_configinfo() depends on it
    rc, results = self.get_pgversion()
    if rc != SUCCESS:
        return rc, results

    rc, results = self.get_configinfo()
    if rc != SUCCESS:
        errors = "rc=%d results=%s" % (rc,results)
        return rc, errors

    print ("%s version: %.1f %s Python Version: %d PG Version: %s local detected=%r PG Database: %s\n\n" \
           % (PROGNAME, VERSION, ADATE, sys.version_info[0], self.pgversionminor, self.local, self.database))
    print (MARK_OK+'PG Host is up.')
    return SUCCESS, ''
###########################################################
def log_alert(self, msg):
    """Append one "<timestamp>*<msg>" line to the alert history file.

    The file is pg_check.alerts inside self.programdir; the timestamp
    format is "%Y-%m-%d %H:%M:%S", the same format alert() parses.
    """
    adate = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # the context manager closes the file even when the write fails
    with open(self.programdir + '/' + 'pg_check.alerts', "a") as afile:
        afile.write(adate + '*' + msg + '\n')
    return
###########################################################
def alert(self, msg):
    """Decide whether an alert of type msg should be raised now.

    self.alertslist holds "<timestamp>*<alert type>" lines from the
    alert history file.  The decision is:
      * no usable history line for this alert type -> alert;
      * otherwise alert only when the type is one of the repeatable
        types and more than self.alertmaxsecs seconds have passed since
        the last history line of that type.

    When the answer is yes the alert is recorded through log_alert().
    Returns True when the caller should send the alert, else False.
    """
    # Alert types that may fire again after the quiet period.  Types not
    # listed here (for example PGBOUNCER2) fire only while the history
    # holds no line for them.
    repeatable = (TESTALERT, ACTIVECONNS, LOAD1, LOAD5, LOAD15, DIRSIZE, WAITS,
                  REPLICATION, IDLEINTRANS, LONGQUERY, PGBOUNCER1, PGBOUNCER3,
                  PGBACKREST1, PGHOSTUP)

    now = datetime.now()
    last_seen = None
    for entry in self.alertslist:
        parts = entry.split('*')
        if len(parts) < 2 or parts[1].strip() != msg:
            # malformed line, or a different alert type
            continue
        try:
            last_seen = datetime.strptime(parts[0].strip(), "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # unreadable timestamp: ignore the line rather than abort
            continue

    if last_seen is None:
        # nothing on record for this alert type so alert
        if self.debug:
            print("no alerts found. Do alert...")
        self.log_alert(msg)
        return True

    # total_seconds() includes whole days; the .seconds attribute alone
    # would wrap around every 24 hours
    secs = (now - last_seen).total_seconds()
    if self.debug:
        print("alert checking with analert=%s seconds=%d and max seconds=%d..." % (msg, secs, self.alertmaxsecs))

    if secs > self.alertmaxsecs and msg in repeatable:
        if self.debug:
            print("do alert...")
        self.log_alert(msg)
        return True

    if self.debug:
        print("alert bypassed...")
    return False
###########################################################
def cleanup(self):
    """Reset the connected flag and delete the temporary work file.

    A temp file that is missing or cannot be removed is ignored.
    """
    if self.connected:
        # do something here later if we enable a db driver
        self.connected = False
    try:
        os.remove(self.tempfile)
    except OSError:
        # best effort: the file may never have been created
        pass
    return
###########################################################
def getnow(self):
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' (no fraction)."""
    # dropping the microseconds up front leaves no fractional part to cut off
    return str(datetime.now().replace(microsecond=0))
###########################################################
def getfilelinecnt(self, afile):
    """Return the number of lines in the text file at path afile."""
    # the context manager closes the file as soon as counting is done
    with open(afile) as handle:
        return sum(1 for _ in handle)
###########################################################
def convert_humanfriendly_to_MB(self, humanfriendly):
    """Convert a size such as '10GB', '500 MB', '200 KB' or '1TB' to megabytes.

    The unit is matched case-insensitively ('16384kB' works) and the
    number may be fractional ('1.5GB').

    Returns a Decimal with two decimal places.  Decimal('-1.00') is
    returned when no known unit is present or the number in front of the
    unit cannot be parsed.
    """
    unknown = Decimal('-1.00')
    # megabytes per unit, tested in this order
    units = (
        ('TB', Decimal(1024 * 1024)),
        ('GB', Decimal(1024)),
        ('MB', Decimal(1)),
        ('KB', Decimal(1) / Decimal(1024)),
    )
    hf = humanfriendly.upper()
    for suffix, megabytes_per_unit in units:
        pos = hf.find(suffix)
        if pos == -1:
            continue
        try:
            amount = Decimal(hf[0:pos].strip())
            return (amount * megabytes_per_unit).quantize(Decimal('0.01'))
        except InvalidOperation:
            # not a finite number in front of the unit
            return unknown
    return unknown
###########################################################
def writeout(self,aline):
    """Emit one line of output.

    With no output file configured (self.fout is the empty string) the
    line is printed to standard output; otherwise it is written to
    self.fout followed by a CR/LF pair.
    """
    if self.fout == '':
        # default to standard output
        print (aline)
        return
    self.fout.write(aline + "\r\n")
    return
###########################################################
def get_configinfo(self):
    """Run "show all" through psql and cache the settings this tool uses.

    The psql output is redirected to self.tempfile and parsed as
    "name|setting|..." lines.  The following attributes are filled in
    when the matching setting is present: datadir, waldir, logdir,
    archive_mode, max_connections, shared_buffers, maint_work_mem,
    work_mem, eff_cache_size, shared_preload_libraries and pg_type.

    Returns (SUCCESS, results) on success, where results is whatever
    executecmd() returned, or (rc, error text) when psql failed; the
    caller is expected to report the error.
    """
    sql = "show all"
    cmd = "psql %s -At -X -c \"%s\" > %s" % (self.connstring, sql, self.tempfile)
    rc, results = self.executecmd(cmd, False)
    if rc != SUCCESS:
        # let calling function report the error
        errors = "Unable to get config info: %d %s\nsql=%s\n" % (rc, results, sql)
        return rc, errors

    # v2.2 fix: things like "Timing is On" can appear as a line so bypass
    noise_lines = ('Timing is on.', 'Timing is off.', 'Pager usage is off.', 'Pager is used for long output.')
    saw_datadir = False
    version_num = None

    with open(self.tempfile, "r") as f:
        for line in f:
            aline = line.strip()
            if len(aline) < 1:
                continue
            if aline in noise_lines or ':activity' in aline or 'Time: ' in aline:
                continue
            fields = aline.split('|')
            if len(fields) < 2:
                # not a "name|setting" line; nothing to extract
                continue
            name = fields[0].strip()
            setting = fields[1].strip()

            if name == 'data_directory':
                self.datadir = setting
                saw_datadir = True
                # for pg rds version, 9.6, "show all" command does not have shared_preload_libraries! so rely on data_directory instead
                if 'rdsdbdata' in self.datadir:
                    self.pg_type = 'rds'
                # heroku indicator using aws in the background
                elif self.datadir == '/database':
                    self.pg_type = 'rds'
            elif name == 'server_version_num':
                try:
                    version_num = int(setting)
                except ValueError:
                    version_num = None
            elif name == 'log_directory':
                self.logdir = setting
            elif name == 'archive_mode':
                self.archive_mode = setting
            elif name == 'max_connections':
                self.max_connections = int(setting)
            elif name == 'shared_buffers':
                # show gives the user friendly form (10GB, 10MB, 10KB, etc.)
                self.shared_buffers = self.convert_humanfriendly_to_MB(setting)
            elif name == 'maintenance_work_mem':
                self.maint_work_mem = self.convert_humanfriendly_to_MB(setting)
            elif name == 'work_mem':
                self.work_mem = self.convert_humanfriendly_to_MB(setting)
            elif name == 'effective_cache_size':
                self.eff_cache_size = self.convert_humanfriendly_to_MB(setting)
            elif name == 'shared_preload_libraries':
                # we only care that it is loaded, not necessarily created
                self.shared_preload_libraries = setting
                if 'rdsutils' in self.shared_preload_libraries:
                    self.pg_type = 'rds'
            elif name == 'rds.extensions':
                self.pg_type = 'rds'

    if saw_datadir:
        # Version 10 renamed pg_xlog to pg_wal.  self.pgversionmajor is
        # only meaningful once get_pgversion() has run, so fall back to
        # the server_version_num row of this same output when present.
        uses_pg_wal = self.pgversionmajor > Decimal('9.6')
        if not uses_pg_wal and version_num is not None:
            uses_pg_wal = version_num >= 100000
        if uses_pg_wal:
            self.waldir = "%s/pg_wal" % self.datadir
        else:
            self.waldir = "%s/pg_xlog" % self.datadir

    if self.verbose:
        print ("shared_buffers = %d maint_work_mem = %d work_mem = %d shared_preload_libraries = %s" % (self.shared_buffers, self.maint_work_mem, self.work_mem, self.shared_preload_libraries))
    return SUCCESS, results
###########################################################
def executecmd(self, cmd, expect):
    """Run a shell command and translate its outcome into (rc, text).

    Parameters:
        cmd    -- command line, executed through the shell (bash when
                  self.opsys is 'posix')
        expect -- True when the command must produce standard output

    Result mapping, evaluated in this order:
        exit status 1, 2 or 127        -> (ERROR2, stderr)
        anything on stderr             -> (SUCCESS, stderr)
        no stdout although expected    -> (ERROR2, '')
        any other non-zero exit status -> (exit status, stderr)
        otherwise                      -> (SUCCESS, stdout stripped)

    NOTE(review): because stderr is tested before the generic exit-status
    check, a command ending with a status other than 1/2/127 that also
    wrote to stderr is reported as SUCCESS.  Kept as is because callers
    may rely on it.

    A failure to start the process returns (ERROR, "Error(n)").
    """
    if self.debug:
        print ("[****] executecmd --> %s" % cmd)
    try:
        if self.opsys == 'posix':
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, executable="/bin/bash")
        else:
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        values2, err2 = p.communicate()
    except OSError as e:
        print ("OSError Error", e)
        return ERROR, "Error(3)"
    except RuntimeError as e:
        print ("RuntimeError", e)
        return ERROR, "Error(4)"
    except ValueError as e:
        print ("Value Error", e)
        return ERROR, "Error(5)"
    except Exception as e:
        print ("General Exception Error", e)
        return ERROR, "Error(6)"

    # python 3 returns values and err in byte format so convert accordingly;
    # undecodable bytes are replaced instead of raising
    if err2 is None or len(err2) == 0:
        err = ""
    else:
        err = bytes(err2).decode('utf-8', errors='replace')
    if values2 is None or len(values2) == 0:
        values = ""
    else:
        values = bytes(values2).decode('utf-8', errors='replace')
    values = values.strip()

    rc = p.returncode
    if self.debug:
        print ("[****] rc=%d values=***%s*** errors=***%s***" % (rc, values, err))

    if rc in (1, 2, 127):
        return ERROR2, err
    if err != "":
        # not treated as an error since INFO information is returned here for analyze commands
        return SUCCESS, err
    if values == "" and expect:
        return ERROR2, values
    if rc != SUCCESS:
        return rc, err
    return SUCCESS, values
###########################################################
def get_pgversion(self):
    """Query the server version and store it on the instance.

    The query returns "<full version>-<first 3 characters of it>", for
    example "10.15-10.".  The part before the dash is stored as the
    string self.pgversionminor, the part after it as the Decimal
    self.pgversionmajor.

    Returns (SUCCESS, raw query output), or (rc, error text) when psql
    failed; in that case the error text is also passed to writeout().
    """
    # v 2.1 fix: expected output --> 10.15-10.
    sql = "select trim(substring(version(), 12, position(' ' in substring(version(),12)))) || '-' || substring(foo.major from 12 for 3)as major from (select version() as major) foo"
    # do not provide host name and/or port if not provided
    cmd = "psql %s -At -X -c \"%s\" " % (self.connstring, sql)
    status, output = self.executecmd(cmd, True)
    if status != SUCCESS:
        failure = "%s\n" % (output)
        self.writeout(failure)
        return status, failure

    output = str(output)
    pieces = output.split('-')
    self.pgversionminor, major_text = pieces[0], pieces[1]
    # with version 10, major version format changes from x.x to x, where x is a 2 byte integer, ie, 10, 11, etc.
    if '.' in major_text:
        self.pgversionmajor = Decimal(major_text)
    else:
        # must be a beta or rc candidate version starting at version 10 (e.g. 10rc1)
        self.pgversionmajor = Decimal(major_text[:2])
    return SUCCESS, output
###########################################################
def get_readycnt(self):
    """Count the *.ready files in the server's archive_status directory.

    Returns (SUCCESS, count as string).  For pg_type 'rds' the directory
    is not inspected and (SUCCESS, '0') is returned.  When psql fails the
    error text is passed to writeout() and (rc, error text) is returned.
    """
    # we cannot handle cloud types like AWS RDS
    if self.pg_type == 'rds':
        return SUCCESS, '0'

    # version 10 replaces pg_xlog with pg_wal directory
    waldirname = "pg_wal" if self.pgversionmajor > Decimal('9.6') else "pg_xlog"
    xlogdir = "%s/%s/archive_status" % (self.datadir, waldirname)

    sql = "select count(*) from (select pg_ls_dir from pg_ls_dir('%s') where pg_ls_dir ~ E'^[0-9A-F]{24}.ready$') as foo" % xlogdir
    # do not provide host name and/or port if not provided
    cmd = "psql %s -At -X -c \"%s\" " % (self.connstring, sql)
    status, output = self.executecmd(cmd, True)
    if status == SUCCESS:
        return SUCCESS, str(output)

    failure = "%s\n" % (output)
    self.writeout(failure)
    return status, failure
###########################################################
def get_datadir(self):
    """Ask the server for its data_directory setting.

    Returns (SUCCESS, directory), or (rc, error text) when psql failed;
    in that case the error text is also passed to writeout().
    """
    query = "show data_directory"
    # do not provide host name and/or port if not provided
    command = "psql %s -At -X -c \"%s\" " % (self.connstring, query)
    status, output = self.executecmd(command, True)
    if status == SUCCESS:
        return SUCCESS, str(output)

    failure = "%s\n" % (output)
    self.writeout(failure)
    return status, failure
###########################################################
def get_pgbindir(self):
    """Determine the PostgreSQL binary directory from pg_config.

    On success self.pgbindir is set to the value of the BINDIR line and
    (SUCCESS, directory) is returned.  When pg_config fails, or prints
    no "BINDIR = ..." line, a self.pgbindir that was already populated
    (by the earlier "which psql" lookup) is kept and returned; otherwise
    the error is passed to writeout() and (rc, error text) is returned.
    """
    if self.opsys == 'posix':
        cmd = "pg_config | grep BINDIR"
    else:
        cmd = "pg_config | find \"BINDIR\""
    rc, results = self.executecmd(cmd, True)

    if rc == SUCCESS:
        label, sep, value = results.partition('=')
        if sep:
            self.pgbindir = value.strip()
            if self.verbose:
                print ("PG Bind Directory = %s" % self.pgbindir)
            return SUCCESS, self.pgbindir
        # output without "=" cannot be parsed: handle like a failure
        rc = ERROR

    # don't consider failure unless bindir not already populated by "which psql" command that executed earlier
    if self.pgbindir == "":
        errors = "unable to get PG Bind Directory. rc=%d %s\n" % (rc, results)
        self.writeout(errors)
        return rc, errors
    return SUCCESS, self.pgbindir
###########################################################
def do_report(self):
if self.testmode:
marker = MARK_WARN
subject = 'Test Mode'
msg = "Testing notifications."
rc = self.send_alert(self.to, self.from_, subject, msg)
if rc != 0:
print("mail error")
return ERROR, ''
self.log_alert(TESTALERT)
print (marker+msg)
if self.waitslocks > 0:
##########################################################
# Get lock waiting transactions where wait is > input seconds
##########################################################
if self.pgversionmajor < Decimal('9.2'):
# select procpid, datname, usename, client_addr, now(), query_start, substring(current_query,1,100), now() - query_start as duration from pg_stat_activity where waiting is true and now() - query_start > interval '30 seconds';
sql = "select count(*) from pg_stat_activity where waiting is true and now() - query_start > interval '%d seconds'" % self.waitslocks
sql2 = "TODO"
elif self.pgversionmajor < Decimal('9.6'):
# select pid, datname, usename, client_addr, now(), query_start, substring(query,1,100), now() - query_start as duration from pg_stat_activity where waiting is true and now() - query_start > interval '30 seconds';
sql1 = "select count(*) from pg_stat_activity where waiting is true and now() - query_start > interval '%d seconds'" % self.waitslocks
sql2 = "TODO"
else:
# new wait_event column replaces waiting in 9.6/10
# v2.2 fix: add backend_type qualifier to not consider walsender
# filter out DataFileRead-IO
#sql1 = "select count(*) from pg_stat_activity where wait_event is NOT NULL and state = 'active' and backend_type <> 'walsender' and now() - query_start > interval '%d seconds'" % self.waitslocks
sql1 = "select count(*) from pg_stat_activity where wait_event is NOT NULL AND wait_event <> 'DataFileRead' and state = 'active' and backend_type <> 'walsender' and now() - query_start > interval '%d seconds'" % self.waitslocks
sql2 = "select 'db=' || datname || ' user=' || usename || ' appname=' || application_name || ' waitinfo=' || wait_event || '-' || wait_event_type || " \
"' duration=' || cast(EXTRACT(EPOCH FROM (now() - query_start)) as integer) || '\n'" \
"'sql=' || regexp_replace(replace(regexp_replace(query, E'[\\n\\r]+', ' ', 'g' ),' ',''), '[^\x20-\x7f\x0d\x1b]', '', 'g') || '\n'" \
"from pg_stat_activity where wait_event is NOT NULL and state = 'active' and backend_type <> 'walsender' and now() - query_start > interval '%d seconds'" % self.waitslocks
sql3 = "SELECT '\n\nblocked_pid =' || rpad(cast(blocked_locks.pid as varchar),7,' ') || ' blocked_user=' || blocked_activity.usename || " \
"'\nblocking_pid=' || rpad(cast(blocking_locks.pid as varchar), 7, ' ') || 'blocking_user=' || blocking_activity.usename || '\n' ||" \
"'blocked_query =' || regexp_replace(replace(regexp_replace(blocked_activity.query, E'[\\n\\r]+', ' ', 'g' ),' ',''), '[^\x20-\x7f\x0d\x1b]', '', 'g') || '...\n' ||" \
"'blocking_query=' || regexp_replace(replace(regexp_replace(blocking_activity.query, E'[\\n\\r]+', ' ', 'g' ),' ',''), '[^\x20-\x7f\x0d\x1b]', '', 'g') || '...\n\n' FROM pg_catalog.pg_locks blocked_locks " \
"JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype AND " \
"blocking_locks.DATABASE IS NOT DISTINCT FROM blocked_locks.DATABASE AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation AND blocking_locks.page IS NOT DISTINCT " \
"FROM blocked_locks.page AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid AND " \
"blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid AND blocking_locks.objid IS NOT DISTINCT " \
"FROM blocked_locks.objid AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid AND blocking_locks.pid != blocked_locks.pid " \
"JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid WHERE NOT blocked_locks.GRANTED"
cmd = "psql %s -At -X -c \"%s\"" % (self.connstring, sql1)
rc, results = self.executecmd(cmd, False)
if rc != SUCCESS:
errors = "[ERROR] Unable to get count of blocked queries."
return rc, errors
blocked_queries_cnt = int(results)
if blocked_queries_cnt == 0:
marker = MARK_OK
msg = "No \"Waiting/Blocked queries\" longer than %d seconds were detected." % self.waitslocks
else:
marker = MARK_WARN
msg = "%d \"Waiting/Blocked queries\" longer than %d seconds were detected." % (blocked_queries_cnt, self.waitslocks)
cmd = "psql %s -At -X -c \"%s\"" % (self.connstring, sql2)
rc, results2 = self.executecmd(cmd, False)
if rc != SUCCESS:
print ("Unable to get waiting or blocked queries(A): %d %s\nsql=%s\n" % (rc, results2, sql2))
cmd = "psql %s -At -X -c \"%s\"" % (self.connstring, sql3)
rc, results3 = self.executecmd(cmd, False)
if rc != SUCCESS:
print ("Unable to get waiting or blocked queries(B): %d %s\nsql=%s\n" % (rc, results2, sql3))
subject = '%d Waiting/BLocked SQL(s) Detected' % (blocked_queries_cnt)
if results2 is None or results2.strip() == '':
results2 = ''
if results3 is None or results3.strip() == '':
results3 = ''
if self.debug:
print("[****] results2=%s" % results2)
print("[****] results3=%s" % results3)
print("[****] ")
print("[****] total results=%s" % results2 + '\r\n' + results3)
# \r makes body disappear! (so the alert text below is joined with '\n' only, not '\r\n')
if results2.strip() == '' and results3.strip() == '':
# then must have gone away so don't report anything
msg = "%d \"Waiting/Blocked queries\" longer than %d seconds were detected but details not available anymore." % (blocked_queries_cnt, self.waitslocks)
if self.verbose:
print("%d waits/locks detected, but details are no longer available." % blocked_queries_cnt)
else:
#rc = self.send_alert(self.to, self.from_, subject, results2+ '\r\n' + results3)
if self.alert(WAITS):
rc = self.send_alert(self.to, self.from_, subject, results2 + '\n' + results3)
if rc != 0:
print("mail error")
return 1
print (marker+msg)
if self.idleintransmins > 0:
#######################################################################
# get existing "idle in transaction" connections longer than 10 minutes
#######################################################################
# NOTE: 9.1 uses procpid, current_query, and no state column, but 9.2+ uses pid, query and state columns respectively. Also idle is <IDLE> in current_query for 9.1 and less
# <IDLE> in transaction for 9.1 but idle in transaction for state column in 9.2+
if self.pgversionmajor < Decimal('9.2'):
# select substring(current_query,1,50), round(EXTRACT(EPOCH FROM (now() - query_start))), now(), query_start from pg_stat_activity;
sql1 = "select count(*) from pg_stat_activity where current_query ilike \'<IDLE> in transaction%%\' and round(EXTRACT(EPOCH FROM (now() - query_start))) > %d" % self.idleintransmins
sql2 = "select datname, usename, application_name from pg_stat_activity where current_query ilike \'idle in transaction\' and round(EXTRACT(EPOCH FROM (now() - query_start))) > %d" % self.idleintransmins
else:
# select substring(query,1,50), round(EXTRACT(EPOCH FROM (now() - query_start))), now(), query_start, state from pg_stat_activity;
sql1 = "select count(*) from pg_stat_activity where state = \'idle in transaction\' and round(EXTRACT(EPOCH FROM (now() - query_start))) / 60 > %d" % self.idleintransmins
sql2 = "select 'pid=' || pid || ' db=' || datname || ' user=' || usename || ' app=' || coalesce(application_name, 'N/A') || ' clientip=' || client_addr || ' duration=' || round(round(EXTRACT(EPOCH FROM (now() - query_start))) / 60) || ' mins' from pg_stat_activity where state = \'idle in transaction\' and round(EXTRACT(EPOCH FROM (now() - query_start))) / 60 > %d" % self.idleintransmins
cmd = "psql %s -At -X -c \"%s\"" % (self.connstring, sql1)
rc, results = self.executecmd(cmd, False)
if rc != SUCCESS:
errors = "Unable to get count of idle in transaction connections: %d %s\nsql=%s\n" % (rc, results, sql1)
return rc, errors
idle_in_transaction_cnt = int(results)
if idle_in_transaction_cnt == 0:
marker = MARK_OK
msg = "No \"idle in transaction\" longer than %d minutes were detected." % self.idleintransmins
else:
marker = MARK_WARN
msg = "%d \"idle in transaction\" longer than %d minutes were detected." % (idle_in_transaction_cnt, self.idleintransmins)
cmd = "psql %s -At -X -c \"%s\"" % (self.connstring, sql2)
rc, results2 = self.executecmd(cmd, False)
if rc != SUCCESS:
print ("Unable to get idle in transaction queries: %d %s\nsql=%s\n" % (rc, results2, sql2))
subject = '%d Idle In Trans SQL(s) detected longer than %d minutes' % (idle_in_transaction_cnt, self.idleintransmins)
if self.alert(IDLEINTRANS):
rc = self.send_alert(self.to, self.from_, subject, results2)
if rc != 0:
print("mail error")
return 1
print (marker+msg)
if self.longquerymins > 0:
######################################
# Get long running queries > 5 minutes (default)
######################################
# NOTE: 9.1 uses procpid, current_query, and no state column, but 9.2+ uses pid, query and state columns respectively. Also idle is <IDLE> in current_query for 9.1 and less
# <IDLE> in transaction for 9.1 but idle in transaction for state column in 9.2+
if self.pgversionmajor < Decimal('9.2'):
# select procpid,datname,usename, client_addr, now(), query_start, substring(current_query,1,100), now() - query_start as duration from pg_stat_activity where current_query not ilike '<IDLE%' and current_query <> ''::text and now() - query_start > interval '5 minutes';
sql1 = "select count(*) from pg_stat_activity where current_query not ilike '<IDLE%' and current_query <> ''::text and now() - query_start > interval '%d minutes'" % self.longquerymins
sql2 = "select 'db=' || datname || ' user=' || usename || ' appname=' || application_name || ' sql=' || query from pg_stat_activity where current_query not ilike '<IDLE%%' and current_query <> ''::text and now() - query_start > interval '%d minutes'" % self.longquerymins
else:
# select pid,datname,usename, client_addr, now(), state, query_start, substring(query,1,100), now() - query_start as duration from pg_stat_activity where state not ilike 'idle%' and query <> ''::text and now() - query_start > interval '%d minutes'" % self.longquerymins
sql1 = "select count(*) from pg_stat_activity where backend_type not in ('walsender') and state not ilike 'idle%%' and query <> ''::text and now() - query_start > interval '%s minutes'" % self.longquerymins
sql2 = "select 'pid=' || pid || ' db=' || datname || ' user=' || usename || ' appname=' || coalesce(application_name, 'N/A') || ' minutes=' || " \