
opt: todo: fill in the information required for monitoring.

wjinan committed 10 months ago
Commit c3e96895f9

+ 13 - 1
README.md

@@ -11,4 +11,16 @@ Exporter:
  - node_exporter: 9100
  - blackbox_exporter: 9115
  - postgres_exporter: 9187
- - jmx_exporter: custom
+ - jmx_exporter: custom
+
+Clickhouse:
+  /etc/clickhouse-server/config.xml
+  Enable the following settings:
+     <prometheus>
+          <endpoint>/metrics</endpoint>
+          <port>9363</port>
+          <metrics>true</metrics>
+          <events>true</events>
+          <asynchronous_metrics>true</asynchronous_metrics>
+          <status_info>true</status_info>
+     </prometheus>
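
After enabling the block above and restarting clickhouse-server, the endpoint can be spot-checked from the host. A minimal sketch, assuming a local, systemd-managed server and the default port 9363:

```shell
# Restart so the config.xml change takes effect, then fetch a few lines of metrics.
sudo systemctl restart clickhouse-server
curl -s http://localhost:9363/metrics | head -n 20
```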

+ 13 - 0
code/conf/jmx_exporter_readme.md

@@ -0,0 +1,13 @@
+## jmx_exporter usage notes
+
+To use it, start the `jar` application with `jmx_exporter` attached as a Java agent; the `config.yml` in this folder is also required.
+
+Start it as follows:
+```shell
+EXPORTER=$ROOT/jmx_exporter.jar
+EXPORTER_PORT=9996
+JAR_NAME=PATH/TO/JAR_APP
+
+java -javaagent:$EXPORTER=$EXPORTER_PORT:config.yml -jar $JAR_NAME
+```
+
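If the agent attaches correctly, the application also serves metrics on the exporter port. A quick check, assuming the port 9996 from the snippet above (the javaagent bundles JVM metrics such as `jvm_*` by default):

```shell
curl -s http://localhost:9996/metrics | grep -m 5 '^jvm_'
```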

+ 1 - 1
code/prometheus/alertmanager.yml

@@ -20,7 +20,7 @@ receivers:
     # email_configs:
     #   - to: <mail to address>
     #     send_resolved: true
-    webhook_configs: #todo: fill in the address
+    webhook_configs: # todo: fill in the alert webhook address
    - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=<DingTalk bot webhook URL>'
    - url: 'http://localhost:9995/prometheusalert?type=email&tpl=prometheus-email&email=<email address>'
 inhibit_rules:
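
Before reloading Alertmanager with the new webhook receivers, the file can be validated with amtool, which ships alongside Alertmanager (a sketch, assuming it is on the PATH):

```shell
amtool check-config code/prometheus/alertmanager.yml
```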

+ 36 - 8
code/prometheus/prometheus.yml

@@ -27,30 +27,34 @@ scrape_configs:
     static_configs:
       - targets: ["localhost:9990"]
 
-  #NodeExporter
+  #NodeExporter(9100) //todo: fill in the ports and addresses
   - job_name: "NodeExporter"
     metrics_path: /metrics
     static_config:
-      - targets: [ "localhost: 9100" ]
+      - targets: [ 'localhost:9100' ]
+      - targets: [ '' ]
+      - targets: [ '' ]
+      - targets: [ '' ]
 
-  #clickhouse
+  #clickhouse(9363) //todo: fill in the ports and addresses (the Prometheus metrics port set in config.xml; default 9363)
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_config:
       - targets:
           - 'target1:port'
       - targets:
           - 'target2:port'
 
-  #clickhouse-keeper
+  #clickhouse-keeper //todo: fill in the ports and addresses
   - job_name: "clickhouse-keeper"
     metrics_path: /metrics
     static_config:
       - targets:
-          - ''
+          - 'target1:port'
       - targets:
-          - ''
+          - 'target2:port'
 
+  #clickhouse(8123) //todo: fill in the ports and addresses
   - job_name: "blackbox_clickhouse_instance"
     metrics_path: /probe
     params:
@@ -58,6 +62,8 @@ scrape_configs:
     static_configs:
       - targets:
           - 'target1:port'
+          - ':'
+          - ':'
     relabel_configs:
       - source_labels: [ __address__ ]
         target_label: __param_target
@@ -66,6 +72,7 @@ scrape_configs:
       - target_label: __address__
         replacement: 'localhost:9115'
 
+  #clickhouse-keeper //todo: fill in the ports and addresses
   - job_name: "blackbox_ckkeeper_instance"
     metrics_path: /probe
     params:
@@ -73,10 +80,31 @@ scrape_configs:
     static_configs:
       - targets:
           - ':'
+          - ':'
+          - ':'
     relabel_configs:
       - source_labels: [ __address__ ]
         target_label: __param_target
       - source_labels: [ __param_target ]
         target_label: instance
       - target_label: __address__
-        replacement: 'localhost:9115'
+        replacement: 'localhost:9115'
+
+  #postgres(5432)  // todo: fill in the ports and addresses
+  - job_name: "blackbox_postgres_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - ':'
+          - ':'
+          - ':'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'
+
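Once the placeholder targets are filled in, the scrape configuration can be validated with promtool. It should also flag the `static_config:` keys above, since Prometheus expects the field to be spelled `static_configs:`, as in the blackbox jobs. A sketch, assuming promtool is on the PATH:

```shell
promtool check config code/prometheus/prometheus.yml
```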

+ 18 - 8
code/prometheus/rules/blackbox.rules

@@ -13,11 +13,21 @@ groups:
         description: "clickhouse instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ckkeeperServerDown
-          expr: probe_success{job="blackbox_ckkeeper_instance"} != 1
-          for: 30s
-          labels:
-            severity: warning
-            type: ops
-          annotations:
-            summary: "clickhouse keeper instance {{ $labels.instance }} is down"
-            description: "clickhouse keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      expr: probe_success{job="blackbox_ckkeeper_instance"} != 1
+      for: 30s
+      labels:
+         severity: warning
+         type: ops
+      annotations:
+         summary: "clickhouse keeper instance {{ $labels.instance }} is down"
+         description: "clickhouse keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: postgresServerDown
+      expr: probe_success{job="blackbox_postgres_instance"} != 1
+      for: 30s
+      labels:
+         severity: warning
+         type: ops
+      annotations:
+         summary: "postgres keeper instance {{ $labels.instance }} is down"
+         description: "postgres keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
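
The rule files can also be checked on their own before Prometheus reloads them (again assuming promtool is available):

```shell
promtool check rules code/prometheus/rules/*.rules
```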

+ 9 - 0
code/prometheus/rules/clickhouse.rules

@@ -1,4 +1,13 @@
 groups:
 
 - name: clickhouse
+  rules:
+  - alert: ClickhouseReadOnlyReplicas
+    #expr: clickhouse_readonly_replica > 0
+    expr:  ClickHouseMetrics_ReadonlyReplica > 0
+    for: 1m
+  - alert: ClickhouseReplicationQueue
+    #expr: clickhouse_replicas_max_queue_size > 5 and deriv(clickhouse_replicas_max_queue_size [10m]) > 0
+    expr:  ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 5 and deriv(ClickHouseAsyncMetrics_ReplicasMaxQueueSize [10m]) > 0
+    for: 1m
 
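The expressions above use ClickHouse's built-in Prometheus metric names (`ClickHouseMetrics_*`, `ClickHouseAsyncMetrics_*`). Whether they are actually being exported can be confirmed against the endpoint enabled in the README, assuming the default port 9363:

```shell
curl -s http://localhost:9363/metrics | grep -E 'ReadonlyReplica|ReplicasMaxQueueSize'
```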

+ 232 - 0
code/prometheus/rules/postgres.rules

@@ -1,4 +1,236 @@
 groups:
 
 - name: postgresExporter
+  rules:
+    ########## EXPORTER RULES ##########
+      - alert: PGExporterScrapeError
+        expr: pg_exporter_last_scrape_error > 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
 
+
+    ########## POSTGRESQL RULES ##########
+      - alert: PGIsUp
+        expr: pg_up < 1
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
+
+
+
+    ## Monitor for a failover event by checking if the recovery status value has changed within the specified time period
+    ## IMPORTANT NOTE: This alert will *automatically resolve* after the given offset time period has passed! If you desire to have an alert that must be manually resolved, see the commented out alert beneath this one
+      - alert: PGRecoveryStatusSwitch
+        expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'
+
+      - alert: PGIdleTxn
+        expr: ccp_connection_stats_max_idle_in_txn_time > 300
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
+          summary: 'PGSQL Instance idle transactions'
+
+      - alert: PGIdleTxn
+        expr: ccp_connection_stats_max_idle_in_txn_time > 900
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
+          summary: 'PGSQL Instance idle transactions'
+
+      - alert: PGQueryTime
+        expr: ccp_connection_stats_max_query_time > 43200
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
+          summary: 'PGSQL Max Query Runtime'
+
+      - alert: PGQueryTime
+        expr: ccp_connection_stats_max_query_time > 86400
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} has at least one query running for over 1 day.'
+          summary: 'PGSQL Max Query Runtime'
+
+      - alert: PGConnPerc
+        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
+          summary: 'PGSQL Instance connections'
+
+      - alert: PGConnPerc
+        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
+          summary: 'PGSQL Instance connections'
+
+      - alert: PGDBSize
+        expr: ccp_database_size_bytes > 1.073741824e+11
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
+          summary: 'PGSQL Instance size warning'
+
+      - alert: PGDBSize
+        expr: ccp_database_size_bytes > 2.68435456e+11
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
+          summary: 'PGSQL Instance size critical'
+
+      - alert: PGReplicationByteLag
+        expr: ccp_replication_lag_size_bytes > 5.24288e+07
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
+          summary: 'PGSQL Instance replica lag warning'
+
+      - alert: PGReplicationByteLag
+        expr: ccp_replication_lag_size_bytes > 1.048576e+08
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
+          summary: 'PGSQL Instance replica lag warning'
+
+      - alert: PGReplicationSlotsInactive
+        expr: ccp_replication_slots_active == 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
+          summary: 'PGSQL Instance inactive replication slot'
+
+      - alert: PGXIDWraparound
+        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
+          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
+
+      - alert: PGXIDWraparound
+        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
+          summary: 'PGSQL Instance transaction id wraparound imminent'
+
+      - alert: PGEmergencyVacuum
+        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
+          summary: 'PGSQL Instance emergency vacuum imminent'
+
+      - alert: PGEmergencyVacuum
+        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
+          summary: 'PGSQL Instance emergency vacuum imminent'
+
+      - alert: PGArchiveCommandStatus
+        expr: ccp_archive_command_status_seconds_since_last_fail > 300
+        for: 60s
+        labels:
+            service: postgresql
+            severity: critical
+            severity_num: 300
+        annotations:
+            description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
+            summary: 'Seconds since the last recorded failure of the archive_command'
+
+      - alert: PGSequenceExhaustion
+        expr: ccp_sequence_exhaustion_count > 0
+        for: 60s
+        labels:
+            service: postgresql
+            severity: critical
+            severity_num: 300
+        annotations:
+            description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
+
+      - alert: PGSettingsPendingRestart
+        expr: ccp_settings_pending_restart_count > 0
+        for: 60s
+        labels:
+            service: postgresql
+            severity: critical
+            severity_num: 300
+        annotations:
+            description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'
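
Most of these expressions rely on `ccp_*` metrics, which a stock postgres_exporter does not emit; they appear to come from pgMonitor-style custom query files, so the exporter needs to be started with those queries for the rules to fire. A quick presence check, assuming the default port 9187:

```shell
# Counts the ccp_* series currently exposed; 0 means only the pg_* rules can work.
curl -s http://localhost:9187/metrics | grep -c '^ccp_'
```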

+ 3 - 2
code/runExporters.sh

@@ -59,9 +59,10 @@ else
     if [[ ! -d $postgres_exporter_log_path ]]; then
         mkdir -p "$postgres_exporter_log_path"
     fi
-
+    # todo: fill in the username and password
+    export DATA_SOURCE_NAME="postgresql://username:password@localhost:5432/?sslmode=disable"
     log_postgres_exporter_file="postgres_exporter_$(date "+%Y-%m-%d-%H:%M:%S").log"
-    ./lib/postgres_exporter  --web.listen-address=:9187 &>> "${postgres_exporter_log_path}${log_postgres_exporter_file}" &
+    ./lib/postgres_exporter  --web.listen-address=:9187  &>> "${postgres_exporter_log_path}${log_postgres_exporter_file}" &
     sleep 1
     log_info "postgres_exporter started successfully!"
 fi
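
After filling in real credentials in the DSN above and starting the exporter, connectivity to PostgreSQL can be confirmed through the `pg_up` metric (assuming the listen port 9187 used by the script):

```shell
# pg_up is 1 when postgres_exporter can reach the configured database.
curl -s http://localhost:9187/metrics | grep '^pg_up'
```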

+ 1 - 1
code/statusExporters.sh

@@ -28,7 +28,7 @@ fi
 
 # postgres_exporter
 echo -e "\n"
-postgres_exporter_pid=$(ps -auxc | grep psotgres_exporter | grep -vE "grep|-Xmx700m" | awk '{print $2}')
+postgres_exporter_pid=$(ps -auxc | grep postgres_export | grep -vE "grep|-Xmx700m" | awk '{print $2}')
 if [[ $postgres_exporter_pid == "" ]]; then
     echo "postgres_exporter未在运行"
 else
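
The shortened pattern works because the `c` option makes `ps` print the kernel's command name, which Linux truncates to 15 characters, so `postgres_exporter` appears as `postgres_export`. An alternative that matches the full command line, shown only as a sketch:

```shell
pgrep -af postgres_exporter
```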

+ 1 - 1
config/product/prometheus.yml

@@ -36,7 +36,7 @@ scrape_configs:
   #clickhouse
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_config:
       - targets:
           - 'target1:port'
       - targets:

+ 1 - 1
config/test/prometheus.yml

@@ -36,7 +36,7 @@ scrape_configs:
   #clickhouse
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_config:
       - targets:
           - 'target1:port'
       - targets: