@@ -1,4 +1,236 @@
 groups:
 - name: postgresExporter
+  rules:
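+
+  ## The service, severity and severity_num labels set on every rule below appear intended for
+  ## Alertmanager routing and silencing, with severity_num providing a sortable numeric severity.
+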
+  ########## EXPORTER RULES ##########
+  - alert: PGExporterScrapeError
+    expr: pg_exporter_last_scrape_error > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
+
+
+  ########## POSTGRESQL RULES ##########
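+  ## pg_up is exposed by postgres_exporter itself: 1 if the last scrape could connect to the target database, 0 otherwise.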
+  - alert: PGIsUp
+    expr: pg_up < 1
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
+
+
+  ## Monitor for a failover event by checking if the recovery status value has changed within the specified time period
+  ## IMPORTANT NOTE: This alert will *automatically resolve* after the given offset time period has passed! If you desire to have an alert that must be manually resolved, see the commented out alert beneath this one
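+  ## How it works: the expression compares the current value of ccp_is_in_recovery_status with its value
+  ## 5 minutes earlier (offset 5m); any change, such as a promotion from replica to primary, makes the
+  ## comparison true, and it turns false again once the 5-minute lookback has moved past the change.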
+  - alert: PGRecoveryStatusSwitch
+    expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'
+
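+  ## Idle-in-transaction thresholds are in seconds: 300 s = 5 minutes (warning), 900 s = 15 minutes (critical).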
+  - alert: PGIdleTxn
+    expr: ccp_connection_stats_max_idle_in_txn_time > 300
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
+      summary: 'PGSQL Instance idle transactions'
+
+  - alert: PGIdleTxn
+    expr: ccp_connection_stats_max_idle_in_txn_time > 900
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
+      summary: 'PGSQL Instance idle transactions'
+
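+  ## Query runtime thresholds are in seconds: 43200 s = 12 hours (warning), 86400 s = 24 hours (critical).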
+  - alert: PGQueryTime
+    expr: ccp_connection_stats_max_query_time > 43200
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} has at least one query running for over 12 hours.'
+      summary: 'PGSQL Max Query Runtime'
+
+  - alert: PGQueryTime
+    expr: ccp_connection_stats_max_query_time > 86400
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} has at least one query running for over 1 day.'
+      summary: 'PGSQL Max Query Runtime'
+
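+  ## The expression converts connection usage to a percentage of max_connections; for example, 92 open
+  ## connections with max_connections set to 100 yields 92, firing both the warning and critical rules.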
+  - alert: PGConnPerc
+    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
+      summary: 'PGSQL Instance connections'
+
+  - alert: PGConnPerc
+    expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
+      summary: 'PGSQL Instance connections'
+
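+  ## Size thresholds are in bytes: 1.073741824e+11 = 100 GiB (warning), 2.68435456e+11 = 250 GiB (critical).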
+  - alert: PGDBSize
+    expr: ccp_database_size_bytes > 1.073741824e+11
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
+      summary: 'PGSQL Instance size warning'
+
+  - alert: PGDBSize
+    expr: ccp_database_size_bytes > 2.68435456e+11
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
+      summary: 'PGSQL Instance size critical'
+
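+  ## Lag thresholds are in bytes: 5.24288e+07 = 50 MiB (warning), 1.048576e+08 = 100 MiB (critical).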
+  - alert: PGReplicationByteLag
+    expr: ccp_replication_lag_size_bytes > 5.24288e+07
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
+      summary: 'PGSQL Instance replica lag warning'
+
+  - alert: PGReplicationByteLag
+    expr: ccp_replication_lag_size_bytes > 1.048576e+08
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
+      summary: 'PGSQL Instance replica lag critical'
+
+  - alert: PGReplicationSlotsInactive
+    expr: ccp_replication_slots_active == 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
+      summary: 'PGSQL Instance inactive replication slot'
+
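+  ## Tracks how far the instance is toward PostgreSQL's ~2 billion transaction ID wraparound limit;
+  ## this should stay low whenever autovacuum freezing is keeping up.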
+  - alert: PGXIDWraparound
+    expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
+      summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
+
+  - alert: PGXIDWraparound
+    expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
+      summary: 'PGSQL Instance transaction id wraparound imminent'
+
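+  ## Measures the oldest table age relative to autovacuum_freeze_max_age; once a table passes 100%,
+  ## PostgreSQL begins forced anti-wraparound (emergency) autovacuums, so these rules fire only after that point.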
+  - alert: PGEmergencyVacuum
+    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
+    for: 60s
+    labels:
+      service: postgresql
+      severity: warning
+      severity_num: 200
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 110% of the autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
+      summary: 'PGSQL Instance emergency vacuum imminent'
+
+  - alert: PGEmergencyVacuum
+    expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} is over 125% of the autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
+      summary: 'PGSQL Instance emergency vacuum imminent'
+
+  - alert: PGArchiveCommandStatus
+    expr: ccp_archive_command_status_seconds_since_last_fail > 300
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
+      summary: 'Seconds since the last recorded failure of the archive_command'
+
+  - alert: PGSequenceExhaustion
+    expr: ccp_sequence_exhaustion_count > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run the following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
+
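+  ## pg_settings.pending_restart is true for any parameter whose changed value only takes effect after
+  ## a server restart, so a non-zero count means a restart (or a config cleanup) is still needed.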
+  - alert: PGSettingsPendingRestart
+    expr: ccp_settings_pending_restart_count > 0
+    for: 60s
+    labels:
+      service: postgresql
+      severity: critical
+      severity_num: 300
+    annotations:
+      description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'