Bladeren bron

opt:初步修改配置文件;停止启动dingtalk;

wjinan 10 maanden geleden
bovenliggende
commit
203e7d5a97

+ 4 - 7
code/prometheus/alertmanager.yml

@@ -1,7 +1,6 @@
 global:
   resolve_timeout: 5m
-  
-  # for mail 
+  # for mail (TODO: configure the SMTP settings below when alerts must be sent by email)
   # smtp_smarthost: smtp.qq.com:465
   # smtp_from: <smtp mail from>
   # smtp_auth_username: <username>
@@ -14,7 +13,6 @@ route:
   group_wait: 10s
   group_interval: 10s
   repeat_interval: 1h
-
   receiver: 'default-receiver'
 
 receivers:
@@ -22,10 +20,9 @@ receivers:
     # email_configs:
     #   - to: <mail to address>
     #     send_resolved: true
-
-    webhook_configs:
-    - url: 'http://localhost:9994/dingtalk/webhook1/send'
-
+    webhook_configs: # TODO: replace the ddurl= and email= placeholders below with real addresses
+    - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=钉钉机器人地址'
+    - url: 'http://localhost:9995/prometheusalert?type=email&tpl=prometheus-email&email=Email地址'
 inhibit_rules:
   - source_match:
       severity: 'critical'

+ 52 - 9
code/prometheus/prometheus.yml

@@ -1,7 +1,7 @@
 # my global config
 global:
-  scrape_interval: 1s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
-  evaluation_interval: 1s # Evaluate rules every 15 seconds. The default is every 1 minute.
+  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
   # scrape_timeout is set to the global default (10s).
 
 # Alertmanager configuration
@@ -22,18 +22,61 @@ rule_files:
 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
   - job_name: "prometheus"
-
     # metrics_path defaults to '/metrics'
     # scheme defaults to 'http'.
-
     static_configs:
       - targets: ["localhost:9990"]
-  # - job_name: "traefik"
 
-  #   static_configs:
-  #     - targets: ["localhost:8080"]
+  # NodeExporter job: scrapes /metrics on localhost:9100
+  - job_name: "NodeExporter"
+    metrics_path: /metrics
+    static_configs: # must be the plural key; `static_config` is rejected as an unknown field
+      - targets: [ "localhost:9100" ]
 
+  # ClickHouse job: scrapes /metrics on each listed target
   - job_name: "clickhouse"
+    metrics_path: /metrics
+    static_configs: # TODO: set the ClickHouse targets
+      - targets:
+          - 'target1:port'
+      - targets:
+          - 'target2:port'
+
+  # clickhouse-keeper job: scrapes /metrics on each listed target
+  - job_name: "clickhouse-keeper"
     metrics_path: /metrics
-    static_config:
-      - targets: ["localhost:9116"]
+    static_configs:
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
+
+  - job_name: "blackbox_clickhouse_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - 'target1:port'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'
+
+  - job_name: "blackbox_ckkeeper_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - ':'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'

+ 13 - 33
code/prometheus/rules/blackbox.rules

@@ -2,42 +2,22 @@ groups:
 - name: blackboxExporter
   rules:
 
-    - alert: MoquetteMqttServerDown
-      expr: probe_success{job="blackbox_moquette_mqtt_server_instance"} != 1
+    - alert: clickhouseServerDown
+      expr: probe_success{job="blackbox_clickhouse_instance"} != 1
       for: 30s
       labels:
         severity: warning
         type: ops
       annotations:
-        summary: "moquette mqtt server instance {{ $labels.instance }} is down"
-        description: "moquette mqtt server instance  down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: "clickhouse instance {{ $labels.instance }} is down"
+        description: "clickhouse instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: FaceapiDown
-      expr: probe_success{job="blackbox_faceapi_instance"} != 1
-      for: 30s
-      labels:
-        severity: warning
-        type: ops
-      annotations:
-        summary: "faceapi instance {{ $labels.instance }} is down"
-        description: "faceapi instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: IotCloudServerDown
-      expr: probe_success{job="blackbox_iot_cloud_server_instance"} != 1
-      for: 1m
-      labels:
-        severity: warning
-        type: ops
-      annotations:
-        summary: "iot_cloud_server instance {{ $labels.instance }} is down"
-        description: "iot_cloud_server instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ManageWebsiteDown
-      expr: probe_success{job="blackbox_manage_website_instance"} != 1
-      for: 1m
-      labels:
-        severity: warning
-        type: ops
-      annotations:
-        summary: "manage website instance {{ $labels.instance }} is down"
-        description: "manage website down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - alert: ckkeeperServerDown
+      expr: probe_success{job="blackbox_ckkeeper_instance"} != 1
+      for: 30s
+      labels:
+        severity: warning
+        type: ops
+      annotations:
+        summary: "clickhouse keeper instance {{ $labels.instance }} is down"
+        description: "clickhouse keeper instance down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

+ 4 - 0
code/prometheus/rules/postgres.rules

@@ -0,0 +1,4 @@
+groups:
+
+- name: postgresExporter
+

+ 21 - 21
code/prometheus/run.sh

@@ -73,24 +73,24 @@ else
 fi
 
 
-dingtalklog_path="./log/dingtalk/"
-# dingtalk section
-if ps -auxc | grep "dingtalk" | grep -v grep > /dev/null; then
-    log_info "dingtalk 已在运行"
-elif [[ ! -f "./lib/dingtalk" ]]; then
-    log_info "no dingtalk found, skip"
-else
-
-    if [[ ! -d $dingtalklog_path ]]; then
-        mkdir "$dingtalklog_path"
-    fi
-
-    log_dingtalk_file="dingtalk_$(date "+%Y-%m-%d-%H:%M:%S").log"
-    ./lib/dingtalk --config.file=dingtalk.yml --web.listen-address=:9994 --web.enable-ui &>> "${dingtalklog_path}${log_dingtalk_file}" &
-    sleep 1
-    if grep -E 'cannot|not defined|License does not exist' "${dingtalklog_path}${log_dingtalk_file}" >> /dev/null; then
-            log_err "dingtalk 启动遇到问题,请检查${log_dingtalk_file}"
-         else
-             log_info "dingtalk 启动成功!"
-    fi
-fi
+#dingtalklog_path="./log/dingtalk/"
+## dingtalk section
+#if ps -auxc | grep "dingtalk" | grep -v grep > /dev/null; then
+#    log_info "dingtalk 已在运行"
+#elif [[ ! -f "./lib/dingtalk" ]]; then
+#    log_info "no dingtalk found, skip"
+#else
+#
+#    if [[ ! -d $dingtalklog_path ]]; then
+#        mkdir "$dingtalklog_path"
+#    fi
+#
+#    log_dingtalk_file="dingtalk_$(date "+%Y-%m-%d-%H:%M:%S").log"
+#    ./lib/dingtalk --config.file=dingtalk.yml --web.listen-address=:9994 --web.enable-ui &>> "${dingtalklog_path}${log_dingtalk_file}" &
+#    sleep 1
+#    if grep -E 'cannot|not defined|License does not exist' "${dingtalklog_path}${log_dingtalk_file}" >> /dev/null; then
+#            log_err "dingtalk 启动遇到问题,请检查${log_dingtalk_file}"
+#         else
+#             log_info "dingtalk 启动成功!"
+#    fi
+#fi

+ 16 - 16
code/prometheus/status.sh

@@ -39,19 +39,19 @@ else
     done
 fi
 
-echo -e "\n"
-
-pid=$(ps -auxc | grep dingtalk | grep -vE "grep|-Xmx700m" | awk '{print $2}')
-if [[ $pid == "" ]]; then
-    echo "dingtalk未在运行"
-else
-    running_time=$(ps -o etime= -p "$pid")
-    ports=$(netstat -tnlp | grep dingta | head -n +1 | awk '{print $4}')
-
-    echo "dingtalk当前PID: $pid"
-    echo "当前运行时间: $running_time" | tr -s " "
-    echo "当前运行端口:"
-    for i in "$ports"; do
-        echo "$i"
-    done
-fi
+#echo -e "\n"
+#
+#pid=$(ps -auxc | grep dingtalk | grep -vE "grep|-Xmx700m" | awk '{print $2}')
+#if [[ $pid == "" ]]; then
+#    echo "dingtalk未在运行"
+#else
+#    running_time=$(ps -o etime= -p "$pid")
+#    ports=$(netstat -tnlp | grep dingta | head -n +1 | awk '{print $4}')
+#
+#    echo "dingtalk当前PID: $pid"
+#    echo "当前运行时间: $running_time" | tr -s " "
+#    echo "当前运行端口:"
+#    for i in "$ports"; do
+#        echo "$i"
+#    done
+#fi

+ 1 - 1
code/prometheus/stop.sh

@@ -52,6 +52,6 @@ function stop_process() {
 if [[ -z $1 ]]; then
     stop_process "prometheus"
     stop_process "alertmanager"
-    stop_process "dingtalk"
+#    stop_process "dingtalk"
     log_info "服务停止脚本运行完成"
 fi

+ 27 - 26
config/product/alertmanager.yml

@@ -1,30 +1,31 @@
-!!com.nuaa.iotcloud.utils.entity.AlertManagerFile
 global:
   resolve_timeout: 5m
-  smtp_auth_password: rkjiseqtepvmegbf
-  smtp_auth_username: 1810847883@qq.com
-  smtp_from: 1810847883@qq.com
-  smtp_require_tls: false
-  smtp_smarthost: smtp.exmail.qq.com:465
-inhibit_rules:
-  - equal:
-      - alertname
-      - instance
-    source_match:
-      severity: critical
-    target_match:
-      severity: warning
-receivers:
-  - name: default-receiver
-    webhook_configs:
-      - url:
+  # for mail (TODO: configure the SMTP settings below when alerts must be sent by email)
+  # smtp_smarthost: smtp.qq.com:465
+  # smtp_from: <smtp mail from>
+  # smtp_auth_username: <username>
+  # smtp_auth_identity: <username>
+  # smtp_auth_password: <password>
+  # smtp_require_tls: false
 
 route:
-  group_by:
-    - alertname
-  group_interval: 30s
-  group_wait: 30s
-  receiver: default-receiver
-  repeat_interval: 30m
-  routes:
-    - receiver:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'default-receiver'
+
+receivers:
+  - name: default-receiver
+    # email_configs:
+    #   - to: <mail to address>
+    #     send_resolved: true
+    webhook_configs: # TODO: replace the ddurl= and email= placeholders below with real addresses
+      - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=钉钉机器人地址'
+      - url: 'http://localhost:9995/prometheusalert?type=email&tpl=prometheus-email&email=Email地址'
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']

+ 0 - 5
config/product/mysqld.cnf

@@ -1,5 +0,0 @@
-[client]
-host=47.98.229.145
-port=3306
-user=iotcloud
-password=P@ssw0rd123

+ 53 - 7
config/product/prometheus.yml

@@ -9,12 +9,11 @@ alerting:
   alertmanagers:
     - static_configs:
         - targets:
-          - localhost:9993
+            - localhost:9993
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   - "rules/*.rules"
-  - "rules/customer/*.rules"
   # - "first_rules.yml"
   # - "second_rules.yml"
 
@@ -23,14 +22,61 @@ rule_files:
 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
   - job_name: "prometheus"
-
     # metrics_path defaults to '/metrics'
     # scheme defaults to 'http'.
-
     static_configs:
       - targets: ["localhost:9990"]
-  # - job_name: "traefik"
 
-  #   static_configs:
-  #     - targets: ["localhost:8080"]
+  # NodeExporter job: scrapes /metrics on localhost:9100
+  - job_name: "NodeExporter"
+    metrics_path: /metrics
+    static_configs: # must be the plural key; `static_config` is rejected as an unknown field
+      - targets: [ "localhost:9100" ]
+
+  # ClickHouse job: scrapes /metrics on each listed target
+  - job_name: "clickhouse"
+    metrics_path: /metrics
+    static_configs: # TODO: set the ClickHouse targets
+      - targets:
+          - 'target1:port'
+      - targets:
+          - 'target2:port'
+
+  # clickhouse-keeper job: scrapes /metrics on each listed target
+  - job_name: "clickhouse-keeper"
+    metrics_path: /metrics
+    static_configs:
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
 
+  - job_name: "blackbox_clickhouse_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - 'target1:port'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'
+
+  - job_name: "blackbox_ckkeeper_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - ':'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'

+ 27 - 26
config/test/alertmanager.yml

@@ -1,30 +1,31 @@
-!!com.nuaa.iotcloud.utils.entity.AlertManagerFile
 global:
   resolve_timeout: 5m
-  smtp_auth_password: rkjiseqtepvmegbf
-  smtp_auth_username: 1810847883@qq.com
-  smtp_from: 1810847883@qq.com
-  smtp_require_tls: false
-  smtp_smarthost: smtp.exmail.qq.com:465
-inhibit_rules:
-  - equal:
-      - alertname
-      - instance
-    source_match:
-      severity: critical
-    target_match:
-      severity: warning
-receivers:
-  - name: default-receiver
-    webhook_configs:
-      - url:
+  # for mail (TODO: configure the SMTP settings below when alerts must be sent by email)
+  # smtp_smarthost: smtp.qq.com:465
+  # smtp_from: <smtp mail from>
+  # smtp_auth_username: <username>
+  # smtp_auth_identity: <username>
+  # smtp_auth_password: <password>
+  # smtp_require_tls: false
 
 route:
-  group_by:
-    - alertname
-  group_interval: 30s
-  group_wait: 30s
-  receiver: default-receiver
-  repeat_interval: 30m
-  routes:
-    - receiver:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'default-receiver'
+
+receivers:
+  - name: default-receiver
+    # email_configs:
+    #   - to: <mail to address>
+    #     send_resolved: true
+    webhook_configs: # TODO: replace the ddurl= and email= placeholders below with real addresses
+      - url: 'http://localhost:9995/prometheusalert?type=dd&tpl=prometheus-dd&ddurl=钉钉机器人地址'
+      - url: 'http://localhost:9995/prometheusalert?type=email&tpl=prometheus-email&email=Email地址'
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']

+ 0 - 5
config/test/mysqld.cnf

@@ -1,5 +0,0 @@
-[client]
-host=114.55.2.212
-port=3306
-user=root
-password=zjymtest

+ 53 - 7
config/test/prometheus.yml

@@ -9,12 +9,11 @@ alerting:
   alertmanagers:
     - static_configs:
         - targets:
-          - localhost:9993
+            - localhost:9993
 
 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 rule_files:
   - "rules/*.rules"
-  - "rules/customer/*.rules"
   # - "first_rules.yml"
   # - "second_rules.yml"
 
@@ -23,14 +22,61 @@ rule_files:
 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
   - job_name: "prometheus"
-
     # metrics_path defaults to '/metrics'
     # scheme defaults to 'http'.
-
     static_configs:
       - targets: ["localhost:9990"]
-  # - job_name: "traefik"
 
-  #   static_configs:
-  #     - targets: ["localhost:8080"]
+  # NodeExporter job: scrapes /metrics on localhost:9100
+  - job_name: "NodeExporter"
+    metrics_path: /metrics
+    static_configs: # must be the plural key; `static_config` is rejected as an unknown field
+      - targets: [ "localhost:9100" ]
+
+  # ClickHouse job: scrapes /metrics on each listed target
+  - job_name: "clickhouse"
+    metrics_path: /metrics
+    static_configs: # TODO: set the ClickHouse targets
+      - targets:
+          - 'target1:port'
+      - targets:
+          - 'target2:port'
+
+  # clickhouse-keeper job: scrapes /metrics on each listed target
+  - job_name: "clickhouse-keeper"
+    metrics_path: /metrics
+    static_configs:
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
+      - targets:
+          - '' # TODO: placeholder, fill in host:port
 
+  - job_name: "blackbox_clickhouse_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - 'target1:port'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'
+
+  - job_name: "blackbox_ckkeeper_instance"
+    metrics_path: /probe
+    params:
+      module: [ tcp_connect ]
+    static_configs:
+      - targets:
+          - ':'
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: 'localhost:9115'