postgres.rules

groups:
- name: postgresExporter
  rules:

  ########## EXPORTER RULES ##########
  - alert: PGExporterScrapeError
    expr: pg_exporter_last_scrape_error > 0
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

  ########## POSTGRESQL RULES ##########
  - alert: PGIsUp
    expr: pg_up < 1
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

  ## Monitor for a failover event by checking if the recovery status value has changed within the specified time period
  ## IMPORTANT NOTE: This alert will *automatically resolve* after the given offset time period has passed! If you desire to have an alert that must be manually resolved, see the commented out alert beneath this one
  - alert: PGRecoveryStatusSwitch
    expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'
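  ## A sketch of one possible manually-resolved variant (the original commented-out alert is
  ## not included in this listing): pin the expected recovery status for a given job to a
  ## hardcoded value so the alert keeps firing after a failover until this rule is updated by
  ## hand. The job name and expected value below are placeholders; check how the exporter's
  ## ccp_is_in_recovery_status metric encodes primary vs. replica before using it.
  # - alert: PGRecoveryStatusSwitchManual
  #   expr: ccp_is_in_recovery_status{job="example_primary"} != 1
  #   for: 60s
  #   labels:
  #     service: postgresql
  #     severity: critical
  #     severity_num: 300
  #   annotations:
  #     summary: '{{ $labels.job }} is not reporting the expected recovery status for this system. A failover may have occurred; update the expected value in this rule once the event has been reviewed.'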
  - alert: PGIdleTxn
    expr: ccp_connection_stats_max_idle_in_txn_time > 300
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
      summary: 'PGSQL Instance idle transactions'
  - alert: PGIdleTxn
    expr: ccp_connection_stats_max_idle_in_txn_time > 900
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
      summary: 'PGSQL Instance idle transactions'
  - alert: PGQueryTime
    expr: ccp_connection_stats_max_query_time > 43200
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: '{{ $labels.job }} has at least one query running for over 12 hours.'
      summary: 'PGSQL Max Query Runtime'
  - alert: PGQueryTime
    expr: ccp_connection_stats_max_query_time > 86400
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: '{{ $labels.job }} has at least one query running for over 1 day.'
      summary: 'PGSQL Max Query Runtime'
  - alert: PGConnPerc
    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
      summary: 'PGSQL Instance connections'
  - alert: PGConnPerc
    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
      summary: 'PGSQL Instance connections'
  - alert: PGDBSize
    expr: ccp_database_size_bytes > 1.073741824e+11
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
      summary: 'PGSQL Instance size warning'
  - alert: PGDBSize
    expr: ccp_database_size_bytes > 2.68435456e+11
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
      summary: 'PGSQL Instance size critical'
  - alert: PGReplicationByteLag
    expr: ccp_replication_lag_size_bytes > 5.24288e+07
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
      summary: 'PGSQL Instance replica lag warning'
  - alert: PGReplicationByteLag
    expr: ccp_replication_lag_size_bytes > 1.048576e+08
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
      summary: 'PGSQL Instance replica lag critical'
  - alert: PGReplicationSlotsInactive
    expr: ccp_replication_slots_active == 0
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
      summary: 'PGSQL Instance inactive replication slot'
  - alert: PGXIDWraparound
    expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
      summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
  - alert: PGXIDWraparound
    expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
      summary: 'PGSQL Instance transaction id wraparound imminent'
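  ## If the wraparound alerts fire, it can help to see which databases hold the oldest
  ## transaction IDs. The query below is a standard check against the pg_database catalog
  ## (not something this rules file or the exporter provides) and is included here only as
  ## a convenience reference:
  ##   SELECT datname, age(datfrozenxid) AS xid_age FROM pg_database ORDER BY 2 DESC;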
  - alert: PGEmergencyVacuum
    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
    for: 60s
    labels:
      service: postgresql
      severity: warning
      severity_num: 200
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has reached more than 110% of the autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
      summary: 'PGSQL Instance emergency vacuum imminent'
  - alert: PGEmergencyVacuum
    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has reached more than 125% of the autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
      summary: 'PGSQL Instance emergency vacuum imminent'
  - alert: PGArchiveCommandStatus
    expr: ccp_archive_command_status_seconds_since_last_fail > 300
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
      summary: 'Seconds since the last recorded failure of the archive_command'
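  ## When investigating archive_command failures, the built-in pg_stat_archiver view shows
  ## the most recent failure directly on the server. This is a standard PostgreSQL check,
  ## included here only as a convenience reference:
  ##   SELECT failed_count, last_failed_wal, last_failed_time FROM pg_stat_archiver;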
  - alert: PGSequenceExhaustion
    expr: ccp_sequence_exhaustion_count > 0
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run the following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
  - alert: PGSettingsPendingRestart
    expr: ccp_settings_pending_restart_count > 0
    for: 60s
    labels:
      service: postgresql
      severity: critical
      severity_num: 300
    annotations:
      description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'
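  ## The pending settings can be listed directly on the server with a standard catalog query
  ## (pg_settings.pending_restart exists in PostgreSQL 9.5 and later); included here only as
  ## a convenience reference:
  ##   SELECT name, setting FROM pg_settings WHERE pending_restart;

## To load these rules, reference this file from the rule_files section of prometheus.yml,
## for example (the path below is only an example; adjust it to wherever this file is installed):
##   rule_files:
##     - /etc/prometheus/postgres.rules
## The file can be checked for syntax errors before reloading Prometheus with:
##   promtool check rules /etc/prometheus/postgres.rules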