k8s 1.22 环境 kube-prometheus-stack 22.x 升级至 41.x
更新 helm repo,下载 chart(本文使用 41.9.0 版本;如最新版本已不是 41.9.0,可在 helm pull 时加上 --version 41.9.0)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm pull prometheus-community/kube-prometheus-stack
同步所需镜像到仓库
参考
同步下面镜像到私有镜像仓库(本文为 registry.hisun.netwarps.com)
# kube-prometheus-stack
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.3.0
quay.io/prometheus-operator/prometheus-operator:v0.60.1
quay.io/prometheus/alertmanager:v0.24.0
quay.io/prometheus/prometheus:v2.39.1
quay.io/prometheus-operator/prometheus-config-reloader:v0.60.1
# kube-state-metrics
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml
registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.6.0
# prometheus-node-exporter
# https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml
quay.io/prometheus/node-exporter:v1.4.0
# https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
grafana/grafana:9.2.5
curlimages/curl:7.85.0
quay.io/kiwigrid/k8s-sidecar:1.19.2
busybox:1.31.1
编写 kube-prometheus-stack 41.x values
kube-prometheus-stack-values.yaml
alertmanager:
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
#cert-manager.io/cluster-issuer: selfsigned-issuer
tls:
- hosts:
- "alertmanager.apps145205.pldtest.k8s"
#secretName: alertmanager-tls
hosts: ["alertmanager.apps145205.pldtest.k8s"]
pathType: ImplementationSpecific
alertmanagerSpec:
image:
repository: registry.hisun.netwarps.com/prometheus/alertmanager
tag: v0.24.0
replicas: 1
nodeSelector:
monitor: 'true'
storage:
volumeClaimTemplate:
spec:
storageClassName: local-path
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 1Gi
config:
global:
resolve_timeout: 5m
smtp_from: 'cicd@example.com'
smtp_smarthost: 'smtp.exmail.example.com:465'
smtp_auth_username: 'cicd@example.com'
smtp_auth_password: 'xxxxxxxxxxxxxxx'
smtp_require_tls: false
wechat_api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxx'
wechat_api_corp_id: 'xxxxxxxxxxxxxxxxx'
inhibit_rules:
- source_matchers:
- 'severity = critical'
target_matchers:
- 'severity =~ warning|info'
equal:
- 'namespace'
- 'alertname'
- source_matchers:
- 'severity = warning'
target_matchers:
- 'severity = info'
equal:
- 'namespace'
- 'alertname'
- source_matchers:
- 'alertname = InfoInhibitor'
target_matchers:
- 'severity = info'
equal:
- 'namespace'
route:
group_by: ['namespace']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'null'
routes:
- receiver: 'null'
matchers:
- alertname =~ "InfoInhibitor"
- receiver: 'deadmansswitch'
matchers:
- alertname =~ "Watchdog"
- receiver: 'critical'
matchers:
- severity =~ "critical"
receivers:
- name: deadmansswitch
email_configs:
- to: 'liujinye@example.com'
- name: critical
email_configs:
- to: 'liujinye@example.com,xxxxxxxxx@example.com'
wechat_configs:
- send_resolved: true
to_user: '@all'
agent_id: 'xxxxxxxx'
corp_id: 'wwxxxxxxxxxxx'
api_secret: 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxx'
- name: 'null'
templates:
- '/etc/alertmanager/config/*.tmpl'
grafana:
plugins:
- grafana-piechart-panel
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
nginx.ingress.kubernetes.io/ssl-redirect: 'true'
#cert-manager.io/cluster-issuer: selfsigned-issuer
tls:
- hosts:
- "grafana.apps145205.pldtest.k8s"
#secretName: grafana-tls
hosts: ["grafana.apps145205.pldtest.k8s"]
pathType: ImplementationSpecific
replicas: 1
nodeSelector:
monitor: 'true'
persistence:
enabled: true
storageClassName: "local-path"
size: 1Gi
deploymentStrategy:
type: Recreate
image:
repository: registry.hisun.netwarps.com/grafana/grafana
tag: 9.2.5
initChownData:
image:
repository: registry.hisun.netwarps.com/library/busybox
tag: "1.31.1"
downloadDashboardsImage:
repository: registry.hisun.netwarps.com/curlimages/curl
tag: 7.85.0
sidecar:
image:
repository: registry.hisun.netwarps.com/kiwigrid/k8s-sidecar
tag: 1.19.2
prometheusOperator:
admissionWebhooks:
patch:
enabled: true
image:
repository: registry.hisun.netwarps.com/ingress-nginx/kube-webhook-certgen
tag: v1.3.0
sha: ""
image:
repository: registry.hisun.netwarps.com/prometheus-operator/prometheus-operator
tag: v0.60.1
prometheusConfigReloaderImage:
repository: registry.hisun.netwarps.com/prometheus-operator/prometheus-config-reloader
tag: v0.60.1
prometheus:
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx
#cert-manager.io/cluster-issuer: selfsigned-issuer
tls:
- hosts:
- "prometheus.apps145205.pldtest.k8s"
#secretName: prometheus-tls
hosts: ["prometheus.apps145205.pldtest.k8s"]
pathType: ImplementationSpecific
prometheusSpec:
replicas: 1
nodeSelector:
monitor: 'true'
image:
repository: registry.hisun.netwarps.com/prometheus/prometheus
tag: v2.39.1
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: local-path
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 100Gi
prometheus-node-exporter:
image:
repository: registry.hisun.netwarps.com/prometheus/node-exporter
tag: v1.4.0
kube-state-metrics:
image:
repository: registry.hisun.netwarps.com/kube-state-metrics/kube-state-metrics
tag: v2.6.0
kubeEtcd:
service:
port: 2381
targetPort: 2381
kubeControllerManager:
serviceMonitor:
https: true
insecureSkipVerify: true
kubeScheduler:
service:
# 如果为 null 或未设置,则该值会根据目标 Kubernetes 版本动态确定(因为 Kubernetes 1.23 更改了默认端口)。
port: 10259
targetPort: 10259
serviceMonitor:
https: true
insecureSkipVerify: true
升级之前先更新CRD
注:因为跨越的版本太多,在官方文档命令的基础上增加了 --force-conflicts 参数,用于强制覆盖 server-side apply 时的字段归属冲突
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
kubectl apply --server-side --force-conflicts -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.60.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
删除node-exporter
从 prometheus-node-exporter 4.0.0 版本开始,node exporter chart 使用 Kubernetes 推荐的标签。由于 DaemonSet 的 selector 标签不可修改,因此必须在升级前删除旧的 daemonset。
kubectl delete daemonset -l app=prometheus-node-exporter -n monitoring
执行升级
helm upgrade --install prometheus-community kube-prometheus-stack-41.9.0.tgz -f kube-prometheus-stack-values.yaml -n monitoring
故障处理
解决kube-controller-manager / kube-scheduler metrics 获取异常问题
修改 kubeadm config
kubectl edit cm -n kube-system kubeadm-config
内容参考如下
data:
ClusterConfiguration: |
...
controllerManager:
extraArgs:
bind-address: 0.0.0.0
...
scheduler:
extraArgs:
bind-address: 0.0.0.0
编辑 kubeadm-init.yaml, 内容如下
...
controllerManager:
extraArgs:
bind-address: 0.0.0.0
...
scheduler:
extraArgs:
bind-address: 0.0.0.0
备份 /etc/kubernetes/
刷新 /etc/kubernetes/manifests/kube-controller-manager.yaml 及 /etc/kubernetes/manifests/kube-scheduler.yaml
# kubeadm init phase control-plane <component-name> --config <config-file>
kubeadm init phase control-plane controller-manager --config kubeadm-init.yaml
kubeadm init phase control-plane scheduler --config kubeadm-init.yaml
在 kube-prometheus-stack-values.yaml
中添加下面配置
kubeControllerManager:
serviceMonitor:
https: true
insecureSkipVerify: true
kubeScheduler:
service:
# 如果为 null 或未设置,则该值会根据目标 Kubernetes 版本动态确定(因为 Kubernetes 1.23 更改了默认端口)。
port: 10259
targetPort: 10259
serviceMonitor:
https: true
insecureSkipVerify: true
重新执行 helm upgrade,使新的 values 配置生效
helm upgrade --install prometheus-community kube-prometheus-stack-41.9.0.tgz -f kube-prometheus-stack-values.yaml -n monitoring
参考
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
https://github.com/ss75710541/operator-env/blob/main/prometheus-operator/%E4%BD%BF%E7%94%A8helm%E5%9C%A8k8s1.22.2%E4%B8%AD%E9%83%A8%E7%BD%B2kube-prometheus-stack-19.1.0.md
https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
https://github.com/prometheus-operator/prometheus-operator/issues/3199
https://kubernetes.io/zh-cn/docs/tasks/administer-cluster/kubeadm/kubeadm-reconfigure/
Last updated