---
# Prometheus alerting rules for PostgreSQL and its metrics exporter.
#
# Conventions used by every rule in this group:
#   * `for: 60s`     - the expression must hold for 60 seconds before firing.
#   * `severity`     - `warning` or `critical`.
#   * `severity_num` - numeric companion of `severity` (200 = warning,
#                      300 = critical). Quoted because label values are strings.
#   * Several alert names appear twice (warning tier and critical tier) with
#     different thresholds; the `severity` label tells the two apart.
groups:
  - name: postgresExporter
    rules:
      ########## EXPORTER RULES ##########
      # Fires when the exporter reports a non-zero last-scrape error value.
      - alert: PGExporterScrapeError
        expr: pg_exporter_last_scrape_error > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

      ########## POSTGRESQL RULES ##########
      # NOTE: despite the name, this alert fires when `pg_up` drops below 1,
      # i.e. when the exporter can NOT reach the database.
      - alert: PGIsUp
        expr: pg_up < 1
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

      ## Monitor for a failover event by checking if the recovery status value
      ## has changed within the specified time period (5m offset).
      ## IMPORTANT NOTE: This alert will *automatically resolve* after the
      ## given offset time period has passed! The original note pointed to a
      ## commented-out, manually-resolved variant "beneath this one", but no
      ## such alert is present in this file.
      - alert: PGRecoveryStatusSwitch
        expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'

      # Idle-in-transaction sessions: warning above 300 (5 minutes per the
      # description), critical above 900 (15 minutes).
      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 300
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
          summary: 'PGSQL Instance idle transactions'

      - alert: PGIdleTxn
        expr: ccp_connection_stats_max_idle_in_txn_time > 900
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
          summary: 'PGSQL Instance idle transactions'

      # Longest running query: warning above 43200 (12 hours), critical above
      # 86400 (1 day).
      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 43200
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
          summary: 'PGSQL Max Query Runtime'

      - alert: PGQueryTime
        expr: ccp_connection_stats_max_query_time > 86400
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: '{{ $labels.job }} has at least one query running for over 1 day.'
          summary: 'PGSQL Max Query Runtime'

      # Connection usage as a percentage of max_connections: warning above 75,
      # critical above 90.
      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      - alert: PGConnPerc
        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
          summary: 'PGSQL Instance connections'

      # Database size: 1.073741824e+11 bytes = 100 * 2^30 (warning),
      # 2.68435456e+11 bytes = 250 * 2^30 (critical).
      - alert: PGDBSize
        expr: ccp_database_size_bytes > 1.073741824e+11
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
          summary: 'PGSQL Instance size warning'

      - alert: PGDBSize
        expr: ccp_database_size_bytes > 2.68435456e+11
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
          summary: 'PGSQL Instance size critical'

      # Replication lag in bytes: 5.24288e+07 = 50 * 2^20 (warning),
      # 1.048576e+08 = 100 * 2^20 (critical).
      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 5.24288e+07
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
          summary: 'PGSQL Instance replica lag warning'

      - alert: PGReplicationByteLag
        expr: ccp_replication_lag_size_bytes > 1.048576e+08
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
          # Was "replica lag warning" on this critical-severity rule.
          summary: 'PGSQL Instance replica lag critical'

      # Fires for any replication slot series whose active value equals 0.
      - alert: PGReplicationSlotsInactive
        expr: ccp_replication_slots_active == 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
          summary: 'PGSQL Instance inactive replication slot'

      # Transaction ID wraparound progress: warning above 50 percent, critical
      # above 75 percent.
      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

      - alert: PGXIDWraparound
        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
          summary: 'PGSQL Instance transaction id wraparound imminent'

      # Progress relative to autovacuum_freeze_max_age (per the descriptions):
      # warning above 110 percent, critical above 125 percent.
      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
        for: 60s
        labels:
          service: postgresql
          severity: warning
          severity_num: '200'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      - alert: PGEmergencyVacuum
        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
          summary: 'PGSQL Instance emergency vacuum imminent'

      # Fires when the archive_command failure metric exceeds 300.
      # NOTE(review): the exact meaning of this metric is defined by the
      # exporter query, which is not visible here - confirm before retuning.
      - alert: PGArchiveCommandStatus
        expr: ccp_archive_command_status_seconds_since_last_fail > 300
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
          summary: 'Seconds since the last recorded failure of the archive_command'

      # Fires when at least one sequence is counted by the exhaustion metric.
      - alert: PGSequenceExhaustion
        expr: ccp_sequence_exhaustion_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
          # Added: every other rule in this group carries a summary.
          summary: 'PGSQL Instance sequences nearing exhaustion'

      # Fires when at least one setting is reported as pending a restart.
      - alert: PGSettingsPendingRestart
        expr: ccp_settings_pending_restart_count > 0
        for: 60s
        labels:
          service: postgresql
          severity: critical
          severity_num: '300'
        annotations:
          description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'
          # Added: every other rule in this group carries a summary.
          summary: 'PGSQL Instance settings pending restart'