草庐IT

Prometheus+Grafana+alertmanager+ 邮件 +钉钉告警

赵承胜博客 2023-03-28 原文

Prometheus+Grafana+alertmanager + 邮件 +钉钉告警

本文模拟生产环境

一 ansible 部署

ansbile 部署

在线安装
yum install -y epel-release

yum install ansible -y


离线安装


#离线环境,提前在有网络的服务器上下载好需要的软件包
yum install -y epel-release
mkdir -p /home/ansible
yum install ansible -y --downloadonly --downloaddir /home/ansible/

安装

cd /home/ansible

# 安装nfs
rpm -ivh *.rpm --force --nodeps


[root@zcsnode1 ansible]# ansible --version
ansible 2.9.27


#ansible 配置文件更改
[root@zcsmaster1 ~]# cat /etc/ansible/ansible.cfg
[defaults]
host_key_checking=False
[inventory]
[privilege_escalation]
become=True
become_method=su
become_user=root
[paramiko_connection]
[ssh_connection]
[persistent_connection]
[accelerate]
[selinux]
[colors]
[diff]

# ansible 主机清单配置
[root@zcsmaster1 ~]# cd node_exporter/
[root@zcsmaster1 node_exporter]# vim hosts

[root@zcsmaster1 node_exporter]# cat hosts (以下是允许root登录,且免密 )
[node_exporter]
182.168.40.180
182.168.40.181
182.168.40.182
[root@zcsmaster1 node_exporter]#

部署node-exporter

登录ansible 主机

mkdir node_exporter && cd node_exporter

# 下载 anget ,下载不下来,可以手动下载后上传
wget https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz

# 编辑配置文件
[root@zcsmaster1 node_exporter]# cat node_exporter.service
[Unit]
Descriptinotallow=node_exporter
After=network.target

[Service]
Type=simple
User=root
ExecStart=/app/prometheus/node_exporter-1.5.0.linux-amd64/node_exporter --web.listen-address=:19100
Restart=on-failure

[Install]
WantedBy=multi-user.target

#

[root@zcsmaster1 node_exporter]# cat node_exporter.yaml
---
- name: install node_exporter
hosts: all
remote_user: root
become: yes
become_method: su
tasks:
- name: mkdir dir
shell: mkdir /app/prometheus
- name: copy package
copy: src=./node_exporter-1.5.0.linux-amd64.tar.gz dest=/app/prometheus/
- name: copy service
copy: src=./node_exporter.service dest=/etc/systemd/system/
- name: unzip backage
shell: cd /app/prometheus/ && tar -zxvf node_exporter-1.5.0.linux-amd64.tar.gz
- name: install service
shell: systemctl daemon-reload && systemctl start node_exporter && systemctl enable node_exporter && systemctl status node_exporter
[root@zcsmaster1 node_exporter]#



#####
配置hosts

# k8s 主机已经有免密了(生产是进制root 登录的 需要修改hosts)
[root@zcsmaster1 node_exporter]# cat hosts
192.168.40.180
192.168.40.181
192.168.40.182

主机批量执行脚本

ansible-playbook -i hosts ./node_exporter.yaml

部署Prometheus +alertmanager

创建命名空间ccse
kubectl create ns ccse




创建sa账号,对sa做rbac授权

#创建一个sa账号monitor
[root@xianchaomaster1 ~]# kubectl create serviceaccount monitor -n ccse
#把sa账号monitor通过clusterrolebing绑定到clusterrole上
[root@xianchaomaster1 ~]# kubectl create clusterrolebinding monitor-clusterrolebinding -n ccse --clusterrole=cluster-admin --serviceaccount=ccse:monitor

10.2 创建prometheus数据存储目录

#在k8s集群的node1节点上创建数据存储目录 (生产可以使用远程存储)
mkdir -p /data/app/grafana_data && chmod 777 -R /data/app/grafana_data
mkdir -p /data/app/alertmanager-storage && chmod 777 -R /data/app/alertmanager-storage
mkdir -p /data/app/prometheus_data && chmod 777 -R /data/app/prometheus_data

#节点拉取镜像
docker pull grafana/grafana:8.5.15
docker pull prom/prometheus:v2.40.6
docker pull prom/alertmanager:v0.24.0

安装kube-state-metrics组件

kube-state-metrics是什么?

kube-state-metrics通过监听API Server生成有关资源对象的状态指标,比如Deployment、Node、Pod,需要注意的是kube-state-metrics只是简单的提供一个metrics数据,并不会存储这些指标数据,所以我们可以使用Prometheus来抓取这些数据然后存储,主要关注的是业务相关的一些元数据,比如Deployment、Pod、副本状态等;调度了多少个replicas?现在可用的有几个?多少个Pod是running/stopped/terminated状态?Pod重启了多少次?我有多少job在运行中。

安装kube-state-metrics组件

mkdir -p kube-state-metrics
cd kube-state-metrics

cat kube-state-metrics-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: quay.io/coreos/kube-state-metrics:v1.9.0
ports:
- containerPort: 8080
[root@zcsmaster1 2]# cat kube-state-metrics-rbac.yaml
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
verbs: ["list", "watch"]
- apiGroups: ["extensions"]
resources: ["daemonsets", "deployments", "replicasets"]
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources: ["cronjobs", "jobs"]
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources: ["horizontalpodautoscalers"]
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system
[root@zcsmaster1 2]# cat kube-state-metrics-svc.yaml
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: 'true'
name: kube-state-metrics
namespace: kube-system
labels:
app: kube-state-metrics
spec:
ports:
- name: kube-state-metrics
port: 8080
protocol: TCP
selector:
app: kube-state-metrics
[root@zcsmaster1 2]#

etcd-certs

生成一个etcd-certs,这个在部署prometheus需要

kubectl -n ccse create secret generic etcd-certs --from-file=/etc/kubernetes/pki/etcd/server.key --from-file=/etc/kubernetes/pki/etcd/server.crt --from-file=/etc/kubernetes/pki/etcd/ca.crt

部署prometheus-alertmanager-svc.yaml

kubectl apply -f prometheus-alertmanager-svc.yaml

---
apiVersion: v1
kind: Service
metadata:
labels:
name: prometheus
kubernetes.io/cluster-service: 'true'
name: alertmanager
namespace: ccse
spec:
ports:
- name: alertmanager
nodePort: 30066
port: 9093
protocol: TCP
targetPort: 9093
selector:
app: prometheus
sessionAffinity: None
type: NodePort
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: ccse
labels:
app: prometheus
spec:
type: NodePort
ports:
- port: 9090
targetPort: 9090
protocol: TCP
selector:
app: prometheus
component: server

alertmanager-cm.yaml

kind: ConfigMap
apiVersion: v1
metadata:
name: alertmanager
namespace: ccse
data:
alertmanager.yml: |-
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'zhaochengsheng_666@163.com'
smtp_auth_username: 'zhaochengsheng_666'
smtp_auth_password: 'DYTRIRAIWRYANDMC'
smtp_require_tls: false
route:
group_by: [alertname]
group_wait: 1s
group_interval: 5s
repeat_interval: 24h
receiver: 5gcmp
receivers:
- name: '5gcmp'
email_configs:
- to: '838032955@qq.com'
send_resolved: true

prometheus-alertmanager-cfg.yaml

kind: ConfigMap
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus-config
namespace: ccse
data:
prometheus.yml: |
rule_files:
- /etc/prometheus/rules.yml
alerting:
alertmanagers:
- static_configs:
- targets: ["alertmanager.ccse.svc.cluster.local:9093"]
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 1m
scrape_configs:
#- job_name: 'kubernetes-node'
# kubernetes_sd_configs:
# - role: node
# relabel_configs:
# - source_labels: [__address__]
# regex: '(.*):10250'
# replacement: '${1}:9100'
# target_label: __address__
# action: replace
# - action: labelmap
# regex: __meta_kubernetes_node_label_(.+)
- job_name: 'kubernetes-node-cadvisor'
metric_relabel_configs:
- source_labels: [instance]
separator: ;
regex: (.+)
target_label: node
replacement: $1
action: replace
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
- job_name: 'kubernetes-schedule'
scrape_interval: 5s
static_configs:
- targets: ['192.168.40.180:10251']
- job_name: 'kubernetes-controller-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.40.180:10252']
- job_name: 'ccse-node-exporter'
static_configs:
- targets: ["192.168.40.180:19100","192.168.40.181:19100","192.168.40.182:19100"]
#- job_name: 'kubernetes-kube-proxy'
# scrape_interval: 5s
# static_configs:
# - targets: ['192.168.40.180:10249','192.168.40.181:10249']
- job_name: 'kubernetes-etcd'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.crt
cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.crt
key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.key
scrape_interval: 5s
static_configs:
- targets: ['192.168.40.180:2379']
rules.yml: |
groups:
- name: example
rules:
- alert: kube-proxy的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: kube-proxy的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: scheduler的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: scheduler的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: controller-manager的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-controller-manager"}[1m]) * 100 > 0
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: apiserver的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: apiserver的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: etcd的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: etcd的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: kube-state-metrics的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: kube-state-metrics的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 0
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: coredns的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 80
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: coredns的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: kube-proxy打开句柄数>600
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 600
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kube-proxy打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 1000
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>600
expr: process_open_fds{job=~"kubernetes-schedule"} > 600
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-schedule"} > 1000
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>600
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 600
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 1000
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>600
expr: process_open_fds{job=~"kubernetes-apiserver"} > 600
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-apiserver"} > 1000
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>600
expr: process_open_fds{job=~"kubernetes-etcd"} > 600
for: 5m
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-etcd"} > 1000
for: 5m
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过600"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过1000"
value: "{{ $value }}"
- alert: kube-proxy
expr: process_virtual_memory_bytes{job=~"kubernetes-kube-proxy"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: scheduler
expr: process_virtual_memory_bytes{job=~"kubernetes-schedule"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-controller-manager
expr: process_virtual_memory_bytes{job=~"kubernetes-controller-manager"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-apiserver
expr: process_virtual_memory_bytes{job=~"kubernetes-apiserver"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-etcd
expr: process_virtual_memory_bytes{job=~"kubernetes-etcd"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kube-dns
expr: process_virtual_memory_bytes{k8s_app=~"kube-dns"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: HttpRequestsAvg
expr: sum(rate(rest_client_requests_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-control-manager|kubernetes-apiservers"}[1m])) > 1000
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): TPS超过1000"
value: "{{ $value }}"
threshold: "1000"
- alert: Pod_restarts
expr: kube_pod_container_status_restarts_total{namespace=~"ccse"} > 0
for: 2s
labels:
severity: warnning
annotations:
description: "在{{$labels.namespace}}名称空间下发现{{$labels.pod}}这个pod下的容器{{$labels.container}}被重启,这个监控指标是由{{$labels.instance}}采集的"
value: "{{ $value }}"
threshold: "0"
- alert: Pod_waiting
expr: kube_pod_container_status_waiting_reason{namespace=~"kube-system|default"} == 1
for: 10s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动异常等待中"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_terminated
expr: kube_pod_container_status_terminated_reason{namespace=~"ccse"} == 1
for: 2s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被删除"
value: "{{ $value }}"
threshold: "1"
- alert: Etcd_leader
expr: etcd_server_has_leader{job="kubernetes-etcd"} == 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前没有leader"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_leader_changes
expr: rate(etcd_server_leader_changes_seen_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前leader已发生改变"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_failed
expr: rate(etcd_server_proposals_failed_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 服务失败"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_db_total_size
expr: etcd_debugging_mvcc_db_total_size_in_bytes{job="kubernetes-etcd"} > 10000000000
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}):db空间超过10G"
value: "{{ $value }}"
threshold: "10G"
- alert: Endpoint_ready
expr: kube_endpoint_address_not_ready{namespace=~"kube-system|default"} == 1
for: 2s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.endpoint}}不可用"
value: "{{ $value }}"
threshold: "1"
- name: 物理节点状态-监控告警
rules:
- alert: 物理节点cpu使用率
expr: 100-avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)*100 > 90
for: 5m
labels:
severity: ccritical
annotations:
summary: "{{ $labels.instance }}cpu使用率过高"
description: "{{ $labels.instance }}的cpu使用率超过90%,当前使用率[{{ $value }}],需要排查处理"
- alert: 物理节点内存使用率
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}内存使用率过高"
description: "{{ $labels.instance }}的内存使用率超过90%,当前使用率[{{ $value }}],需要排查处理"
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}: 服务器宕机"
description: "{{ $labels.instance }}: 服务器延时超过5分钟"
- alert: 物理节点磁盘的IO性能
expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 10
for: 5m
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
description: "{{$labels.mountpoint }} 流入磁盘IO大于90%(目前使用:{{$value}})"
- alert: 入网流量带宽
expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 1024000
for: 5m
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
description: "{{$labels.mountpoint }}流入网络带宽持续5分钟高于1000M. RX带宽使用率{{$value}}"
- alert: 出网流量带宽
expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 1024000
for: 5m
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
description: "{{$labels.mountpoint }}流出网络带宽持续5分钟高于1000M. RX带宽使用率{{$value}}"
# - alert: TCP会话
# expr: node_netstat_Tcp_CurrEstab > 1000
# for: 2s
# labels:
# severity: critical
# annotations:
# summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
# description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
for: 5m
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"

prometheus-alertmanager-deploy.yaml

---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-server
namespace: ccse
labels:
app: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
component: server
template:
metadata:
labels:
app: prometheus
component: server
annotations:
prometheus.io/scrape: 'false'
spec:
nodeName: zcsnode1
serviceAccountName: monitor
containers:
- name: prometheus
image: prom/prometheus:v2.40.6
imagePullPolicy: IfNotPresent
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retentinotallow=360h"
- "--web.enable-lifecycle"
ports:
- containerPort: 9090
protocol: TCP
resources:
requests:
memory: "500Mi"
cpu: "0.5"
limits:
memory: "8Gi"
cpu: "8"
volumeMounts:
- mountPath: /etc/prometheus
name: prometheus-config
- mountPath: /prometheus/
name: prometheus-storage-volume
- name: k8s-certs
mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
- name: alertmanager
image: prom/alertmanager:v0.24.0
imagePullPolicy: IfNotPresent
args:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--log.level=debug"
ports:
- containerPort: 9093
protocol: TCP
name: alertmanager
resources:
requests:
memory: "500Mi"
cpu: "0.5"
limits:
memory: "4Gi"
cpu: "4"
volumeMounts:
- name: alertmanager-config
mountPath: /etc/alertmanager
- name: alertmanager-storage
mountPath: /alertmanager
- name: localtime
mountPath: /etc/localtime
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-storage-volume
hostPath:
path: /data/app/prometheus_data
type: Directory
- name: k8s-certs
secret:
secretName: etcd-certs
- name: alertmanager-config
configMap:
name: alertmanager
- name: alertmanager-storage
hostPath:
path: /data/app/alertmanager-storage
type: DirectoryOrCreate
- name: localtime
hostPath:
path: /usr/share/zoneinfo/Asia/Shanghai

部署 grafana

内存限制 根据实际情况设置值

[root@zcsmaster1 ~]# cat grafana.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: ccse
labels:
app: grafana
name: grafana
spec:
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
securityContext:
fsGroup: 472
supplementalGroups:
- 0
nodeName: zcsnode1
containers:
- name: grafana
image: grafana/grafana:8.5.15
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: http-grafana
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /robots.txt
port: 3000
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 2
livenessProbe:
failureThreshold: 3
initialDelaySeconds: 30
periodSeconds: 10
successThreshold: 1
tcpSocket:
port: 3000
timeoutSeconds: 1
resources:
requests:
cpu: 250m
memory: 750Mi
limits:
memory: "4Gi"
cpu: "4"
volumeMounts:
- mountPath: /var/lib/grafana
name: grafana-storage-volume
volumes:
- name: grafana-storage-volume
hostPath:
path: /data/app/grafana_data
type: Directory
---
apiVersion: v1
kind: Service
metadata:
namespace: ccse
name: grafana
spec:
ports:
- nodePort: 32673
port: 3000
protocol: TCP
targetPort: http-grafana
selector:
app: grafana
sessionAffinity: None
type: LoadBalancer

访问验证

[root@zcsmaster1 ok]# kubectl get svc -n ccse | grep -E "prometheus|grafana|alertmanager"
alertmanager NodePort 10.103.186.163 <none> 9093:30066/TCP 119s
grafana LoadBalancer 10.106.90.16 <pending> 3000:32673/TCP 11h
prometheus NodePort 10.99.102.44 <none> 9090:30263/TCP 119s

http://192.168.40.180:30263/
http://192.168.40.180:32673/login
http://192.168.40.180:30066/#/alerts

问题1

从上面可以发现kubernetes-controller-manager和kubernetes-schedule都显示连接不上对应的端口

可按如下方法处理:
vim /etc/kubernetes/manifests/kube-scheduler.yaml
修改如下内容:
把--bind-address=127.0.0.1变成--bind-address=192.168.40.180
把httpGet:字段下的hosts由127.0.0.1变成192.168.40.180
把—port=0删除
#注意:192.168.40.180是k8s的控制节点的ip
vim /etc/kubernetes/manifests/kube-controller-manager.yaml
把--bind-address=127.0.0.1变成--bind-address=192.168.40.180
把httpGet:字段下的hosts由127.0.0.1变成192.168.40.180
把—port=0删除

修改之后在k8s各个节点执行
systemctl restart kubelet

kubectl get cs
显示如下:
NAME STATUS MESSAGE ERROR
controller-manager Healthy ok
scheduler Healthy ok
etcd-0 Healthy {"health":"true"}

ss -antulp | grep :10251
ss -antulp | grep :10252
可以看到相应的端口已经被物理机监听了
点击status->targets,可看到如下

grafana 配置

数据源配置

​http://prometheus.ccse.svc:9090​

导入 prometheus-dashboard


alertmanager 配置

邮箱告警

下面一篇文张 ,会详细讲解钉钉告警


有关Prometheus+Grafana+alertmanager+ 邮件 +钉钉告警的更多相关文章

  1. ruby-on-rails - 验证电子邮件地址是 Paypal 用户 - 2

    我想验证一个电子邮件地址是否是PayPal用户。是否有API调用来执行此操作?是否有执行此操作的ruby​​库?谢谢 最佳答案 GetVerifiedStatus来自PayPal'sAdaptiveAccounts平台会为您做这件事。PayPal没有任何codesamples或SDKs用于Ruby中的自适应帐户,但我确实找到了编写codeforGetVerifiedStatusinRuby的人.您需要更改该代码以检查他们拥有的帐户类型的唯一更改是更改if@xml['accountStatus']!=nilaccount_status

  2. ruby-on-rails - Ruby on Rails - 需要在每周的特定时间将消息发送到电子邮件 - 2

    我想知道我应该如何着手这个项目。我需要每周向人们发送一次电子邮件。但是,这必须在每周的特定时间自动生成并发送。编码有多难?我需要知道是否有任何书籍可以提供帮助,或者你们中的任何人是否可以指导我。它必须使用ruby​​onrails进行编程。因此有一个网络服务和数据库集成。干杯 最佳答案 为什么这么复杂?您只需安排工作。您可以使用Delayed::Job例如。Delayed::Job让您可以使用run_at符号在特定时间安排作业,如下所示:Delayed::Job.enqueue(SendEmailJob.new(...),:run_

  3. ruby - 使用ruby mail gem获取电子邮件正文而不获取附件 - 2

    我正在尝试使用IMAP和ruby​​mailgem获取gmail的正文。当我按照anotherstackoverflowanswer.中的描述获取RFC822字段时,它工作得很好.Fiedl很好地描述了这种方法answer类似的问题。这种方法很棒,只是它需要获取RFC822,而RFC822也会获取所有附件。是否有任何其他领域或其他方法我可以采取不获取附件但仍然使用ruby​​mailgem来获得解码良好的正文? 最佳答案 您必须解析并理解返回的BODYSTRUCTURE响应的实际结构,请参阅RFC3501,p.56.还要记住应用相关

  4. ruby - 使用 S/MIME 在 Ruby 中对电子邮件进行数字签名 - 2

    Ruby中是否有一种方法可以使用S/MIME对电子邮件消息进行数字签名?我们的团队使用PKI,我们的用户习惯于期待重要消息的数字签名。我知道我可以调用openssl命令行工具:opensslsmime-sign-signer$CERT_FILE-passinpass:$CERT_PASS-in$UNSIGNED_MAIL-out$SIGNED_MAIL-certfile$CERT_CA_FILE-from'your'-to'recipients'-subject'TheSubject'但我希望利用Ruby解决方案。 最佳答案 我最终

  5. ruby-on-rails - 带有内联附件的 Rails 邮件程序错误 - 2

    我在ActionMailer中有一个非常罕见的行为,我在5个月前实现了一个邮件程序操作并且它工作正常,但昨天,由于一些奇怪的原因,它崩溃了。问题我有一个邮件布局,为了在我所有的电子邮件中使用它,我在其中渲染了一张之前由前置过滤器附加的图像Layout=app/views/layouts/email.html.erbVisionamos>......用户邮件程序=app/mailers/users.rbclassUsuariosMailer"monitoreo@visionamos.com"layout"mail"#Setemaillayoutbefore_filter:add_inli

  6. ruby-on-rails - 发生异常时发送电子邮件不起作用,使用 exception_notification - 2

    我正在从rails2.3迁移到rails3.1,我试图在生成异常时发送电子邮件。我正在使用exception_notificationgem。我的其余电子邮件都在工作。但是异常邮件不会被解雇。以下是我的staging.rb文件中的设置。config.action_mailer.perform_deliveries=trueconfig.action_mailer.raise_delivery_errors=true下面是application.rb中的代码C::Application.config.middleware.useExceptionNotification::Rack,:e

  7. ruby - 使用 gmail gem 跟踪一些电子邮件 - 2

    我正在使用gmailgem发送电子邮件,我需要跟踪这些电子邮件。我该怎么做?我正在尝试搜索带有message_id的电子邮件,但它会从我的收件箱中提取所有电子邮件,而我只想要特定电子邮件的回复。这是我的实际代码:*使用message_id保存电子邮件*mail=gmail.deliver(email)Email.create(:message_id=>mail.message_id,:from=>user.email,:to=>annotation.to,:body=>annotation.content,:title=>annotation.title,:annotation=>an

  8. Streampark集成Cloudera Flink、ldap、告警,以及部署常见问题 - 2

    集成背景我们当前集群使用的是ClouderaCDP,Flink版本为ClouderaVersion1.14,整体Flink安装目录以及配置文件结构与社区版本有较大出入。直接根据Streampark官方文档进行部署,将无法配置FlinkHome,以及后续整体Flink任务提交到集群中,因此需要进行针对化适配集成,在满足使用需求上,尽量提供完整的Streampark使用体验。集成步骤版本匹配问题解决首先解决无法识别Cloudera中的FlinkHome问题,根据报错主要明确到的事情是无法读取到Flink版本、lib下面的jar包名称无法匹配。修改对象:修改源码:(解决无法匹配clouderajar

  9. ruby-on-rails - Rails actionmailer,在特定时间发送邮件 - 2

    我有一个带有actionmailer的Rails应用程序,可以向客户发送提醒。所以我想在特定的日期时间发送电子邮件。我该怎么做,或者来自Controller的任何操作? 最佳答案 您可以通过使用某种后台任务来实现此目标,例如resque或delayed_job例如。此外,像whenever这样的gem(Ruby中的Cron作业)将帮助您实现目标!看看thistutorial它显示通过ActiveJob使用后台处理器发送电子邮件和delayed_job更新要在特定日期时间发送电子邮件,您可以使用deliver_later使用wait_

  10. ruby-on-rails - 在处理电子邮件回复时,我怎样才能忽略任何电子邮件客户端细节和历史记录? - 2

    我有一个通过IMAP处理传入电子邮件的Rails应用程序。当前使用一种方法来搜索TMail对象的各个部分以查找给定的content_type:defself.search_parts_for_content_type(parts,content_type='text/html')parts.eachdo|part|ifpart.content_type==content_typereturnpart.bodyelseifpart.multipart?ifbody=self.search_parts_for_content_type(part.parts,content_type)ret

随机推荐