Browse source

opt: todo: fill in the information required for monitoring.

wjinan 10 months ago
Parent
commit
c3e96895f9

+ 13 - 1
README.md

@@ -11,4 +11,16 @@ Exporter:
  - node_exporter: 9100
  - blackbox_exporter: 9115
  - postgres_exporter: 9187
- - jmx_exporter: custom
+ - jmx_exporter: custom
+
+Clickhouse:
+  /etc/clickhouse-server/config.xml
+  enable the following settings:
+     <prometheus>
+          <endpoint>/metrics</endpoint>
+          <port>9363</port>
+          <metrics>true</metrics>
+          <events>true</events>
+          <asynchronous_metrics>true</asynchronous_metrics>
+          <status_info>true</status_info>
+     </prometheus>
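
> With these settings in place, ClickHouse serves Prometheus metrics at `http://<host>:9363/metrics`. As a sketch, a matching scrape job could look like the following (the host name is a placeholder; the repo's actual job, with todo markers, is added to `code/prometheus/prometheus.yml` below):
>
> ```yaml
> scrape_configs:
>   - job_name: "clickhouse"
>     metrics_path: /metrics
>     static_configs:
>       - targets: [ 'ch-node-1:9363' ]  # placeholder host; 9363 is the default prometheus port
> ```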

+ 13 - 0
code/conf/jmx_exporter_readme.md

@@ -0,0 +1,13 @@
+## jmx_exporter usage notes
+
+To use it, attach `jmx_exporter` as a java agent when starting the `jar` application; the `config.yml` in this folder is also required.
+
+Launch it as follows:
+```shell
+EXPORTER=$ROOT/jmx_exporter.jar
+EXPORTER_PORT=9996
+JAR_NAME=PATH/TO/JAR_APP
+
+java -javaagent:$EXPORTER=$EXPORTER_PORT:config.yml -jar $JAR_NAME
+```
+
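
> The `config.yml` the readme refers to is not part of this diff. A minimal sketch that exports every MBean attribute unchanged (options as documented by jmx_exporter; narrow the rules for production use):
>
> ```yaml
> startDelaySeconds: 0
> lowercaseOutputName: true
> lowercaseOutputLabelNames: true
> rules:
>   - pattern: ".*"  # catch-all: export all MBean attributes as-is
> ```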

+ 1 - 1
code/prometheus/alertmanager.yml

@@ -20,7 +20,7 @@ receivers:
     # email_configs:
     #   - to: <mail to address>
     #     send_resolved: true
-    webhook_configs: #todo: fill in the address
+    webhook_configs: # todo: fill in the alert collector address
    - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=<DingTalk bot webhook URL>'
    - url: 'http://localhost:9995/prometheusalert?type=email&tpl=prometheus-email&email=<email address>'
 inhibit_rules:
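
> For reference, a filled-in entry might look like the sketch below; the DingTalk access token is a placeholder to be supplied (PrometheusAlert relays the alert to whatever `ddurl` points at):
>
> ```yaml
> webhook_configs:
>   - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=https://oapi.dingtalk.com/robot/send?access_token=<your-token>'
> ```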

+ 36 - 8
code/prometheus/prometheus.yml

@@ -27,30 +27,34 @@ scrape_configs:
     static_configs:
       - targets: ["localhost:9990"]
 
-  #NodeExporter
+  #NodeExporter(9100) //todo: fill in ports and addresses
   - job_name: "NodeExporter"
     metrics_path: /metrics
    static_configs:
-      - targets: [ "localhost: 9100" ]
+      - targets: [ 'localhost:9100' ]
+      - targets: [ '' ]
+      - targets: [ '' ]
+      - targets: [ '' ]
 
-  #clickhouse
+  #clickhouse(9363) //todo: fill in ports and addresses (the port ClickHouse's prometheus endpoint listens on in its config file; default 9363)
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_configs:
       - targets:
           - 'target1:port'
       - targets:
           - 'target2:port'
 
-  #clickhouse-keeper
+  #clickhouse-keeper //todo: fill in ports and addresses
   - job_name: "clickhouse-keeper"
     metrics_path: /metrics
    static_configs:
       - targets:
-          - ''
+          - 'target1:port'
       - targets:
-          - ''
+          - 'target2:port'
 
+  #clickhouse(8123) //todo: fill in ports and addresses
   - job_name: "blackbox_clickhouse_instance"
     metrics_path: /probe
     params:
@@ -58,6 +62,8 @@ scrape_configs:
     static_configs:
       - targets:
           - 'target1:port'
+          - ':'
+          - ':'
     relabel_configs:
       - source_labels: [ __address__ ]
         target_label: __param_target
@@ -66,6 +72,7 @@ scrape_configs:
       - target_label: __address__
         replacement: 'localhost:9115'
 
+  #clickhouse-keeper //todo: fill in ports and addresses
   - job_name: "blackbox_ckkeeper_instance"
     metrics_path: /probe
     params:
@@ -73,10 +80,31 @@ scrape_configs:
     static_configs:
       - targets:
           - ':'
+          - ':'
+          - ':'
     relabel_configs:
       - source_labels: [ __address__ ]
         target_label: __param_target
       - source_labels: [ __param_target ]
         target_label: instance
       - target_label: __address__
-        replacement: 'localhost:9115'
+        replacement: 'localhost:9115'
+
+  #postgres(5432)  //todo: fill in ports and addresses
+  - job_name: "blackbox_postgres_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - ':'
+          - ':'
+          - ':'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'
+
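
> All of the blackbox jobs above share one relabel chain: the listed target is copied into the probe's `target` URL parameter and into the `instance` label, then the scrape itself is redirected to the blackbox_exporter on `localhost:9115`. A filled-in sketch of the postgres probe job (the address is hypothetical):
>
> ```yaml
>   - job_name: "blackbox_postgres_instance"
>     metrics_path: /probe
>     params:
>       module: [ tcp_connect ]
>     static_configs:
>       - targets:
>           - '10.0.0.5:5432'  # hypothetical postgres host:port
>     relabel_configs:
>       - source_labels: [ __address__ ]
>         target_label: __param_target   # probed address becomes ?target=...
>       - source_labels: [ __param_target ]
>         target_label: instance         # keep the probed address as the instance label
>       - target_label: __address__
>         replacement: 'localhost:9115'  # actually scrape the blackbox_exporter
> ```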

+ 18 - 8
code/prometheus/rules/blackbox.rules

@@ -13,11 +13,21 @@ groups:
         description: "clickhouse instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ckkeeperServerDown
-          expr: probe_success{job="blackbox_ckkeeper_instance"} != 1
-          for: 30s
-          labels:
-            severity: warning
-            type: ops
-          annotations:
-            summary: "clickhouse keeper instance {{ $labels.instance }} is down"
-            description: "clickhouse keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      expr: probe_success{job="blackbox_ckkeeper_instance"} != 1
+      for: 30s
+      labels:
+         severity: warning
+         type: ops
+      annotations:
+         summary: "clickhouse keeper instance {{ $labels.instance }} is down"
+         description: "clickhouse keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: postgresServerDown
+      expr: probe_success{job="blackbox_postgres_instance"} != 1
+      for: 30s
+      labels:
+         severity: warning
+         type: ops
+      annotations:
+         summary: "postgres keeper instance {{ $labels.instance }} is down"
+         description: "postgres keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+ 9 - 0
code/prometheus/rules/clickhouse.rules

@@ -1,4 +1,13 @@
 groups:
 
 - name: clickhouse
+  rules:
+  - alert: ClickhouseReadOnlyReplicas
+    #expr: clickhouse_readonly_replica > 0
+    expr: ClickHouseMetrics_ReadonlyReplica > 0
+    for: 1m
+  - alert: ClickhouseReplicationQueue
+    #expr: clickhouse_replicas_max_queue_size > 5 and deriv(clickhouse_replicas_max_queue_size [10m]) > 0
+    expr: ClickHouseAsyncMetrics_ReplicasMaxQueueSize > 5 and deriv(ClickHouseAsyncMetrics_ReplicasMaxQueueSize[10m]) > 0
+    for: 1m
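
> Unlike the other rule files, these two alerts carry no labels or annotations, so they reach Alertmanager without severity or routing information. A sketch of the first alert completed in the style of blackbox.rules (wording is illustrative):
>
> ```yaml
>   - alert: ClickhouseReadOnlyReplicas
>     expr: ClickHouseMetrics_ReadonlyReplica > 0
>     for: 1m
>     labels:
>       severity: warning
>       type: ops
>     annotations:
>       summary: "clickhouse instance {{ $labels.instance }} has read-only replicas"
>       description: "read-only replicas present\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
> ```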
 

+ 232 - 0
code/prometheus/rules/postgres.rules

@@ -1,4 +1,236 @@
 groups:
 
 - name: postgresExporter
+  rules:
+    ########## EXPORTER RULES ##########
+      - alert: PGExporterScrapeError
+        expr: pg_exporter_last_scrape_error > 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
 
+
+    ########## POSTGRESQL RULES ##########
+      - alert: PGIsUp
+        expr: pg_up < 1
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
+
+
+
+    ## Monitor for a failover event by checking whether the recovery status value has changed within the specified time period
+    ## IMPORTANT NOTE: This alert will *automatically resolve* after the given offset time period has passed!
+      - alert: PGRecoveryStatusSwitch
+        expr: ccp_is_in_recovery_status != ccp_is_in_recovery_status offset 5m
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: '{{ $labels.job }} has had a PostgreSQL failover event. Please check systems involved in this cluster for more details'
+
+      - alert: PGIdleTxn
+        expr: ccp_connection_stats_max_idle_in_txn_time > 300
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
+          summary: 'PGSQL Instance idle transactions'
+
+      - alert: PGIdleTxn
+        expr: ccp_connection_stats_max_idle_in_txn_time > 900
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
+          summary: 'PGSQL Instance idle transactions'
+
+      - alert: PGQueryTime
+        expr: ccp_connection_stats_max_query_time > 43200
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} has at least one query running for over 12 hours.'
+          summary: 'PGSQL Max Query Runtime'
+
+      - alert: PGQueryTime
+        expr: ccp_connection_stats_max_query_time > 86400
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} has at least one query running for over 1 day.'
+          summary: 'PGSQL Max Query Runtime'
+
+      - alert: PGConnPerc
+        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
+          summary: 'PGSQL Instance connections'
+
+      - alert: PGConnPerc
+        expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
+          summary: 'PGSQL Instance connections'
+
+      - alert: PGDBSize
+        expr: ccp_database_size_bytes > 1.073741824e+11
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
+          summary: 'PGSQL Instance size warning'
+
+      - alert: PGDBSize
+        expr: ccp_database_size_bytes > 2.68435456e+11
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
+          summary: 'PGSQL Instance size critical'
+
+      - alert: PGReplicationByteLag
+        expr: ccp_replication_lag_size_bytes > 5.24288e+07
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
+          summary: 'PGSQL Instance replica lag warning'
+
+      - alert: PGReplicationByteLag
+        expr: ccp_replication_lag_size_bytes > 1.048576e+08
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
+          summary: 'PGSQL Instance replica lag warning'
+
+      - alert: PGReplicationSlotsInactive
+        expr: ccp_replication_slots_active == 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
+          summary: 'PGSQL Instance inactive replication slot'
+
+      - alert: PGXIDWraparound
+        expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
+          summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
+
+      - alert: PGXIDWraparound
+        expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
+          summary: 'PGSQL Instance transaction id wraparound imminent'
+
+      - alert: PGEmergencyVacuum
+        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
+        for: 60s
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
+          summary: 'PGSQL Instance emergency vacuum imminent'
+
+      - alert: PGEmergencyVacuum
+        expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
+          summary: 'PGSQL Instance emergency vacuum imminent'
+
+      - alert: PGArchiveCommandStatus
+        expr: ccp_archive_command_status_seconds_since_last_fail > 300
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
+          summary: 'Seconds since the last recorded failure of the archive_command'
+
+      - alert: PGSequenceExhaustion
+        expr: ccp_sequence_exhaustion_count > 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run the following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
+
+      - alert: PGSettingsPendingRestart
+        expr: ccp_settings_pending_restart_count > 0
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

+ 3 - 2
code/runExporters.sh

@@ -59,9 +59,10 @@ else
     if [[ ! -d $postgres_exporter_log_path ]]; then
         mkdir -p "$postgres_exporter_log_path"
     fi
-
+    #todo: enter the username and password
+    export DATA_SOURCE_NAME=postgresql://username:password@localhost:5432/?sslmode=disable
     log_postgres_exporter_file="postgres_exporter_$(date "+%Y-%m-%d-%H:%M:%S").log"
-    ./lib/postgres_exporter  --web.listen-address=:9187 &>> "${postgres_exporter_log_path}${log_postgres_exporter_file}" &
+    ./lib/postgres_exporter  --web.listen-address=:9187  &>> "${postgres_exporter_log_path}${log_postgres_exporter_file}" &
     sleep 1
     log_info "postgres_exporter 启动成功!"
 fi

+ 1 - 1
code/statusExporters.sh

@@ -28,7 +28,7 @@ fi
 
 # postgres_exporter
 echo -e "\n"
-postgres_exporter_pid=$(ps -auxc | grep psotgres_exporter | grep -vE "grep|-Xmx700m" | awk '{print $2}')
+postgres_exporter_pid=$(ps -auxc | grep postgres_export | grep -vE "grep|-Xmx700m" | awk '{print $2}')
 if [[ $postgres_exporter_pid == "" ]]; then
     echo "postgres_exporter未在运行"
 else

+ 1 - 1
config/product/prometheus.yml

@@ -36,7 +36,7 @@ scrape_configs:
   #clickhouse
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_configs:
       - targets:
           - 'target1:port'
       - targets:

+ 1 - 1
config/test/prometheus.yml

@@ -36,7 +36,7 @@ scrape_configs:
   #clickhouse
   - job_name: "clickhouse"
     metrics_path: /metrics
-    static_config: #todo: set clickhouse targets
+    static_configs:
       - targets:
           - 'target1:port'
       - targets: