Prometheus Operator部署/管理Prometheus Server
一、数据持久化
1.1、prometheus数据持久化
1.2、grafana数据持久化
二、优化配置
三、如何修改alert rule?
3.1、通过rule规则修改
3.2、修改配置文件方式
四、Alertmanager报警配置
默认Prometheus和Grafana不做数据持久化,那么服务重启以后配置的Dashboard、账号密码、监控数据等信息将会丢失,所以做数据持久化也是很有必要的。
原始的数据是以 emptyDir 形式存放在pod里面,生命周期与pod相同;emptyDir 在容器重启时仍会保留,但一旦 Pod 被删除或重建(例如节点故障、驱逐、滚动更新),监控相关的数据就全部消失了。
vim manifests/prometheus-prometheus.yaml
xxxxxxxxxx
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.29.1
    prometheus: k8s
  name: k8s
  namespace: monitoring
spec:
  alerting:
    alertmanagers:
    - apiVersion: v2
      name: alertmanager-main
      namespace: monitoring
      port: web
  enableFeatures: []
  externalLabels: {}
  image: quay.io/prometheus/prometheus:v2.29.1
  nodeSelector:
    kubernetes.io/os: linux
  podMetadata:
    labels:
      app.kubernetes.io/component: prometheus
      app.kubernetes.io/name: prometheus
      app.kubernetes.io/part-of: kube-prometheus
      app.kubernetes.io/version: 2.29.1
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  replicas: 2
  resources:
    requests:
      memory: 400Mi
  ruleNamespaceSelector: {}
  ruleSelector: {}
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-k8s
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  version: 2.29.1
  # 新增持久化存储,yaml 末尾添加
  retention: 3d   # 加这个参数,表示prometheus数据保留的天数,默认会是1天
  storage:
    volumeClaimTemplate:
      spec:
        storageClassName: nfs-storage
        resources:
          requests:
            storage: 50Gi
先手动创建grafana的持久化PVC:
xxxxxxxxxx
[root@matser manifests]# vim grafana-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-pvc
  namespace: monitoring
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
  storageClassName: nfs-storage
vim manifests/grafana-deployment.yaml
xxxxxxxxxx
volumes:
# - emptyDir: {}            # 注释此两行,新增下三行
#   name: grafana-storage
- name: grafana-storage
  persistentVolumeClaim:
    claimName: grafana-pvc
- name: grafana-datasources
  secret:
    secretName: grafana-datasources
为了固定grafana的登录密码,添加环境变量:
xxxxxxxxxx
readinessProbe:
  httpGet:
    path: /api/health
    port: http
env:                                   #添加环境变量
- name: GF_SECURITY_ADMIN_USER         #添加环境变量
  value: admin                         #添加环境变量
- name: GF_SECURITY_ADMIN_PASSWORD     #添加环境变量
  value: admin                         #添加环境变量
resources:
  limits:
    cpu: 200m
    memory: 200Mi
xxxxxxxxxx
$ kubectl exec -it $(kubectl get pod -n monitoring -l app.kubernetes.io/name=grafana \
-o jsonpath='{.items[*].metadata.name}') -n monitoring -- sh
/usr/share/grafana $ grafana-cli plugins install grafana-piechart-panel
/usr/share/grafana $ grafana-cli plugins install camptocamp-prometheus-alertmanager-datasource
/usr/share/grafana $ grafana-cli plugins install grafana-kubernetes-app
grafana dashboard 时区默认为UTC,比北京时间慢了8小时,很不便于日常监控查看,这里可以修改
xxxxxxxxxx
$ cd ./kube-prometheus/manifests
$ sed -i 's/UTC/UTC+8/g' grafana-dashboardDefinitions.yaml
$ kubectl apply -f grafana-dashboardDefinitions.yaml
xxxxxxxxxx
## edit
$ kubectl edit cm prometheus-k8s-rulefiles-0 -n monitoring
xxxxxxxxxx
$ cd ./kube-prometheus/manifests
$ vim kubernetes-prometheusRule.yaml
### 应用
$ kubectl apply -f kubernetes-prometheusRule.yaml
这里给出精简版本,详细可以参考 kube-prometheus/manifests/alertmanager-secret.yaml
xxxxxxxxxx
cat << EOF > alertmanager-prometheusAlert.yaml
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.24.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['env', 'instance', 'type', 'group', 'job', 'alertname', 'cluster']
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 10m
      receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'http://prometheus-alert-center.monitor.svc:8080/prometheusalert?type=wx&tpl=prometheus-wx&wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=71c0a6f0-43a0-4ecf-b8c9-52aff88f3b68&at=ZhangDaDan,ZHDYA'
        send_resolved: true
type: Opaque
EOF
xxxxxxxxxx
$ kubectl apply -f alertmanager-prometheusAlert.yaml