@@ -1,4 +1,236 @@
 groups:
 - name: postgresExporter
+  rules:
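+
+  ## The service, severity and severity_num labels set on every rule below appear intended for
+  ## Alertmanager routing and silencing, with severity_num providing a sortable numeric severity.
+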
+  ########## EXPORTER RULES ##########
+  - alert: PGExporterScrapeError
+    expr: pg_exporter_last_scrape_error > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
+
+
+  ########## POSTGRESQL RULES ##########
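+  ## pg_up is exposed by postgres_exporter itself: 1 if the last scrape could connect to the target database, 0 otherwise.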
+  - alert: PGIsUp
+    expr: pg_up < 1
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
+
+
+  ## Monitor for a failover event by checking if the recovery status value has changed within the specified time period
+  ## IMPORTANT NOTE: This alert will *automatically resolve* after the given offset time period has passed! If you desire to have an alert that must be manually resolved, see the commented out alert beneath this one
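+  ## How it works: the expression compares the current value of ccp_is_in_recovery_status with its value
+  ## 5 minutes earlier (offset 5m); any change, such as a promotion from replica to primary, makes the
+  ## comparison true, and it turns false again once the 5-minute lookback has moved past the change.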
+  - alert: PGRecoveryStatusSwitch
+    expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'
+
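+  ## Idle-in-transaction thresholds are in seconds: 300 s = 5 minutes (warning), 900 s = 15 minutes (critical).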
+  - alert: PGIdleTxn
+    expr: ccp_connection_stats_max_idle_in_txn_time > 300
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
+      summary: 'PGSQL Instance idle transactions'
+
+  - alert: PGIdleTxn
+    expr: ccp_connection_stats_max_idle_in_txn_time > 900
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
+      summary: 'PGSQL Instance idle transactions'
+
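+  ## Query runtime thresholds are in seconds: 43200 s = 12 hours (warning), 86400 s = 24 hours (critical).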
+  - alert: PGQueryTime
+    expr: ccp_connection_stats_max_query_time > 43200
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} has at least one query running for over 12 hours.'
+      summary: 'PGSQL Max Query Runtime'
+
+  - alert: PGQueryTime
+    expr: ccp_connection_stats_max_query_time > 86400
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} has at least one query running for over 1 day.'
+      summary: 'PGSQL Max Query Runtime'
+
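+  ## The expression converts connection usage to a percentage of max_connections; for example, 92 open
+  ## connections with max_connections set to 100 yields 92, firing both the warning and critical rules.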
+  - alert: PGConnPerc
+    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
+      summary: 'PGSQL Instance connections'
+
+  - alert: PGConnPerc
+    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
+      summary: 'PGSQL Instance connections'
+
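+  ## Size thresholds are in bytes: 1.073741824e+11 = 100 GiB (warning), 2.68435456e+11 = 250 GiB (critical).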
+  - alert: PGDBSize
+    expr: ccp_database_size_bytes > 1.073741824e+11
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
+      summary: 'PGSQL Instance size warning'
+
+  - alert: PGDBSize
+    expr: ccp_database_size_bytes > 2.68435456e+11
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
+      summary: 'PGSQL Instance size critical'
+
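+  ## Lag thresholds are in bytes: 5.24288e+07 = 50 MiB (warning), 1.048576e+08 = 100 MiB (critical).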
+  - alert: PGReplicationByteLag
+    expr: ccp_replication_lag_size_bytes > 5.24288e+07
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
+      summary: 'PGSQL Instance replica lag warning'
+
+  - alert: PGReplicationByteLag
+    expr: ccp_replication_lag_size_bytes > 1.048576e+08
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
+      summary: 'PGSQL Instance replica lag critical'
+
+  - alert: PGReplicationSlotsInactive
+    expr: ccp_replication_slots_active == 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
+      summary: 'PGSQL Instance inactive replication slot'
+
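+  ## Tracks how far the instance is toward PostgreSQL's ~2 billion transaction ID wraparound limit;
+  ## this should stay low whenever autovacuum freezing is keeping up.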
+  - alert: PGXIDWraparound
+    expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
+      summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
+
+  - alert: PGXIDWraparound
+    expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
+      summary: 'PGSQL Instance transaction id wraparound imminent'
+
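+  ## Measures the oldest table age relative to autovacuum_freeze_max_age; once a table passes 100%,
+  ## PostgreSQL begins forced anti-wraparound (emergency) autovacuums, so these rules fire only after that point.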
+  - alert: PGEmergencyVacuum
+    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 110% of the autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
+      summary: 'PGSQL Instance emergency vacuum imminent'
+
+  - alert: PGEmergencyVacuum
+    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 125% of the autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
+      summary: 'PGSQL Instance emergency vacuum imminent'
+
+  - alert: PGArchiveCommandStatus
+    expr: ccp_archive_command_status_seconds_since_last_fail > 300
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
+      summary: 'Seconds since the last recorded failure of the archive_command'
+
+  - alert: PGSequenceExhaustion
+    expr: ccp_sequence_exhaustion_count > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run the following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
+
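+  ## pg_settings.pending_restart is true for any parameter whose changed value only takes effect after
+  ## a server restart, so a non-zero count means a restart (or a config cleanup) is still needed.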
+  - alert: PGSettingsPendingRestart
+    expr: ccp_settings_pending_restart_count > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'