# Prometheus alerting rules for PostgreSQL instances monitored through
# postgres_exporter. Each rule carries a `severity` label (warning/critical)
# and a matching numeric `severity_num` label (200/300). Several alerts are
# defined twice under the same name: once per severity tier with a different
# threshold.
groups:
  - name: postgresExporter
    rules:
      ########## EXPORTER RULES ##########

      # The exporter itself reported an error while running its queries.
      - alert: PGExporterScrapeError
        expr: pg_exporter_last_scrape_error > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

      ########## POSTGRESQL RULES ##########

      # NOTE: despite its name, this alert fires when the database is NOT
      # reachable (pg_up < 1).
      - alert: PGIsUp
        expr: pg_up < 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

      ## Monitor for a failover event by checking whether the recovery status
      ## value has changed within the specified time period.
      ## IMPORTANT NOTE: This alert will *automatically resolve* after the
      ## given offset time period (5m) has passed, because both sides of the
      ## comparison then report the same value again. If you need an alert
      ## that must be manually resolved, a separate rule is required; none is
      ## defined alongside this one.
      - alert: PGRecoveryStatusSwitch
        expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'

      # Idle-in-transaction sessions: 300 s = 5 minutes (warning).
      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 300
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
          summary: 'PGSQL Instance idle transactions'

      # Idle-in-transaction sessions: 900 s = 15 minutes (critical).
      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 900
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
          summary: 'PGSQL Instance idle transactions'

      # Longest running query: 43200 s = 12 hours (warning).
      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 43200
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
          summary: 'PGSQL Max Query Runtime'

      # Longest running query: 86400 s = 1 day (critical).
      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 86400
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 1 day.'
          summary: 'PGSQL Max Query Runtime'

      # Connections in use as a percentage of max connections (warning at 75%).
      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      # Connections in use as a percentage of max connections (critical at 90%).
      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      # Database size: 1.073741824e+11 bytes = 100 * 2^30 (warning).
      - alert: PGDBSize
        expr: ccp_database_size_bytes > 1.073741824e+11
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
          summary: 'PGSQL Instance size warning'

      # Database size: 2.68435456e+11 bytes = 250 * 2^30 (critical).
      - alert: PGDBSize
        expr: ccp_database_size_bytes > 2.68435456e+11
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
          summary: 'PGSQL Instance size critical'

      # Replication lag: 5.24288e+07 bytes = 50 * 2^20 (warning).
      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 5.24288e+07
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      # Replication lag: 1.048576e+08 bytes = 100 * 2^20 (critical).
      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 1.048576e+08
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
          # Summary names the critical tier (previously said "warning",
          # copied from the warning rule above).
          summary: 'PGSQL Instance replica lag critical'

      # Fires for any replication slot series whose active value is 0.
      - alert: PGReplicationSlotsInactive
        expr: ccp_replication_slots_active == 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
          summary: 'PGSQL Instance inactive replication slot'

      # Transaction ID wraparound progress (warning above 50%).
      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

      # Transaction ID wraparound progress (critical above 75%).
      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
          summary: 'PGSQL Instance transaction id wraparound imminent'

      # Progress past autovacuum_freeze_max_age (warning above 110%).
      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: 200
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      # Progress past autovacuum_freeze_max_age (critical above 125%).
      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      # NOTE(review): fires when the metric exceeds 300. This presumably
      # relies on the exporter query reporting 0 once archiving succeeds
      # again; confirm against the metric definition, otherwise old failures
      # would keep this alert firing.
      - alert: PGArchiveCommandStatus
        expr: ccp_archive_command_status_seconds_since_last_fail > 300
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
          summary: 'Seconds since the last recorded failure of the archive_command'

      # Any non-zero count of sequences nearing exhaustion is critical.
      - alert: PGSequenceExhaustion
        expr: ccp_sequence_exhaustion_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

      # Any setting waiting on a restart is critical.
      - alert: PGSettingsPendingRestart
        expr: ccp_settings_pending_restart_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: 300
        annotations:
          description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'