发布时间: 2025-5-9 文章作者: myluzh 分类名称: Kubernetes 朗读文章
helm repo add ot-helm https://ot-container-kit.github.io/helm-charts/
helm upgrade redis-operator ot-helm/redis-operator \
  --install --create-namespace --namespace redis-system
kubectl get pod -n redis-system

helm离线部署:
wget https://github.com/OT-CONTAINER-KIT/helm-charts/releases/download/redis-operator-0.21.2/redis-operator-0.21.2.tgz
helm upgrade redis-operator ./redis-operator-0.21.2.tgz \
  --install --create-namespace --namespace redis-system

关于 operator 与对应 redis 版本的支持情况,可以看支持表:https://github.com/OT-CONTAINER-KIT/redis-operator?tab=readme-ov-file#image-compatibility
# Operator Version: v0.19.x # Redis Image: > v7.0.12, >= v6.2.14 # Sentinel Image: > v7.0.12, >= v6.2.14 # Exporter Image: v1.44.0
---
apiVersion: v1
kind: Secret
metadata:
  name: redis-secret
  namespace: my-app
data:
  # base64 of "Myluzh@1234" (no trailing newline)
  password: TXlsdXpoQDEyMzQ=
type: Opaque

也可以用命令直接创建(密码同样为 Myluzh@1234):
kubectl create secret generic redis-secret --from-literal=password=Myluzh@1234 -n my-app
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-cluster
  namespace: my-app
spec:
  clusterSize: 3
  clusterVersion: v7
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000
  persistenceEnabled: true
  kubernetesConfig:
    image: quay.io/opstree/redis:v7.0.15
    imagePullPolicy: IfNotPresent
    redisSecret:
      name: redis-secret
      key: password
  redisExporter:
    enabled: false
    image: quay.io/opstree/redis-exporter:v1.44.0
  storage:
    volumeClaimTemplate:
      spec:
        # storageClassName: standard
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi
    nodeConfVolumeClaimTemplate:
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi

查看状态:
root@k8s-master01:~# kubectl get Rediscluster -n my-app NAME CLUSTERSIZE READYLEADERREPLICAS READYFOLLOWERREPLICAS redis-cluster 3 3 3 root@k8s-master01:~# kubectl get pod -n my-app NAME READY STATUS RESTARTS AGE redis-cluster-follower-0 1/1 Running 0 2m51s redis-cluster-follower-1 1/1 Running 0 2m49s redis-cluster-follower-2 1/1 Running 0 2m46s redis-cluster-leader-0 1/1 Running 0 3m2s redis-cluster-leader-1 1/1 Running 0 2m58s redis-cluster-leader-2 1/1 Running 0 2m55s root@k8s-master01:~# kubectl get svc -n my-app NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE redis-cluster-follower ClusterIP 10.43.76.76 <none> 6379/TCP 2m57s redis-cluster-follower-additional ClusterIP 10.43.27.197 <none> 6379/TCP 2m57s redis-cluster-follower-headless ClusterIP None <none> 6379/TCP 2m57s redis-cluster-leader ClusterIP 10.43.141.199 <none> 6379/TCP 3m6s redis-cluster-leader-additional ClusterIP 10.43.220.170 <none> 6379/TCP 3m5s redis-cluster-leader-headless ClusterIP None <none> 6379/TCP 3m7s redis-cluster-master ClusterIP 10.43.59.246 <none> 6379/TCP 3m5s
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisSentinel
metadata:
  # will append '-sentinel' to the names of StatefulSet and Pods, e.g. 'redis-sentinel-sentinel'
  name: redis-sentinel
  namespace: my-app
spec:
  clusterSize: 3
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000
  # this is needed for the sentinel to connect to the nodes. `redisSecret` only controls access to sentinel
  redisSentinelConfig:
    # 这里的redisReplicationName要跟下面的RedisReplication name对应。
    redisReplicationName: redis-replication
    redisReplicationPassword:
      secretKeyRef:
        name: redis-secret
        key: password
  kubernetesConfig:
    image: quay.io/opstree/redis-sentinel:v6.2.17
    imagePullPolicy: IfNotPresent
    # only controls access to sentinel, use `redisReplicationPassword` for node connection
    redisSecret:
      name: redis-secret
      key: password
    resources:
      requests:
        cpu: 101m
        memory: 128Mi
      limits:
        cpu: 101m
        memory: 128Mi
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisReplication
metadata:
  name: redis-replication
  namespace: my-app
spec:
  clusterSize: 3
  kubernetesConfig:
    image: quay.io/opstree/redis:v6.2.17
    imagePullPolicy: IfNotPresent
    redisSecret:
      name: redis-secret
      key: password
  storage:
    volumeClaimTemplate:
      spec:
        # storageClassName: standard
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi
  redisExporter:
    enabled: false
    image: quay.io/opstree/redis-exporter:v1.44.0
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000

查看状态:
root@k8s-master01:~# kubectl get RedisReplication -n my-app NAME MASTER AGE redis-replication redis-replication-0 10m root@k8s-master01:~# kubectl get RedisSentinel -n my-app NAME AGE redis-sentinel 10m root@k8s-master01:~# kubectl get pod -n my-app NAME READY STATUS RESTARTS AGE redis-replication-0 1/1 Running 0 103s redis-replication-1 1/1 Running 0 84s redis-replication-2 1/1 Running 0 70s redis-sentinel-sentinel-0 1/1 Running 0 29s redis-sentinel-sentinel-1 1/1 Running 0 27s redis-sentinel-sentinel-2 1/1 Running 0 25s root@k8s-master01:~# kubectl get svc -n my-app NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE redis-replication ClusterIP 10.43.100.80 <none> 6379/TCP 111s redis-replication-additional ClusterIP 10.43.117.24 <none> 6379/TCP 111s redis-replication-headless ClusterIP None <none> 6379/TCP 111s redis-replication-master ClusterIP 10.43.60.17 <none> 6379/TCP 111s redis-replication-replica ClusterIP 10.43.129.103 <none> 6379/TCP 111s redis-sentinel-sentinel ClusterIP 10.43.87.203 <none> 26379/TCP 27s redis-sentinel-sentinel-additional ClusterIP 10.43.230.83 <none> 26379/TCP 27s redis-sentinel-sentinel-headless ClusterIP None <none> 26379/TCP 28s
# 首先查看redis主节点在哪
root@k8s-master01:~# kubectl -n my-app exec -it redis-client -- sh
/sentinel-data $ redis-cli -p 26379 -a Myluzh@1234
OK
127.0.0.1:26379> SENTINEL masters
1) 1) "name"
   2) "myMaster"
   3) "ip"
   4) "10.42.1.241"
   5) "port"
   6) "6379"

# 把redis主节点所在的机器直接关机,模拟故障
root@k8s-master01:~# kubectl get pod -n my-app -o wide | grep 10.42.1.241
redis-replication-0 1/1 Running 0 75m 10.42.1.241 k8s-worker01 <none> <none>
root@k8s-worker01:~# poweroff

# 可以看到已经停止了
root@k8s-master01:~# kubectl get pod -n my-app -o wide | grep worker01
redis-client 1/1 Terminating 15 (19m ago) 15h 10.42.1.131 k8s-worker01 <none> <none>
redis-replication-0 1/1 Terminating 0 79m 10.42.1.241 k8s-worker01 <none> <none>
redis-sentinel-sentinel-0 1/1 Terminating 0 30m 10.42.1.121 k8s-worker01 <none> <none>

# 再次查看redis主节点,可以发现已经改变
root@k8s-master01:~# kubectl -n my-app exec -it redis-client -- sh
/sentinel-data $ redis-cli -p 26379 -a Myluzh@1234
OK
127.0.0.1:26379> SENTINEL masters
1) 1) "name"
   2) "myMaster"
   3) "ip"
   4) "10.42.2.102"
root@k8s-master01:~# kubectl get svc -n pmip-app --show-labels
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE LABELS
redis-replication ClusterIP 10.43.33.33 <none> 6379/TCP,9121/TCP 29h app=redis-replication,redis_setup_type=replication,role=replication

查看9121端口的name字段:
root@k8s-master01:~# kubectl -n pmip-app get svc redis-replication -o jsonpath='{.spec.ports[?(@.port==9121)].name}'
redis-exporter

编写yaml:
root@k8s-master01:~# cat redis-servermonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: redis-replication-metrics
  namespace: monitoring # Prometheus 所在命名空间
  labels:
    app: redis-replication
spec:
  jobLabel: redis-replication
  selector:
    matchLabels:
      app: redis-replication # 匹配 Service 的标签
  namespaceSelector:
    matchNames:
      - pmip-app # 指定 redis 所在命名空间
  endpoints:
    - port: "redis-exporter" # 这个是 service 的端口名称,即 service yaml的spec.ports.name, 不是填metrics端口号 9121
      interval: 15s
      path: /metrics

没问题的话prometheus就可以收集到指标了,有问题就看prometheus日志。
# cat rules.yaml --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: monitor-svc: redis-exporter role: alert-rules name: redis-metrics-rules namespace: monitoring spec: groups: - name: Redis-监控告警 rules: - alert: 警报!Redis应用不可用 expr: redis_up == 0 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis应用不可用" description: "Redis应用不可达\n 当前值 = {{ $value }}" - alert: 警报!丢失Master节点 expr: (count(redis_instance_info{role="master"}) ) < 1 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} 丢失Redis master" description: "Redis集群当前没有主节点\n 当前值 = {{ $value }}" - alert: 警报!脑裂,主节点太多 expr: count(redis_instance_info{role="master"}) > 1 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis脑裂,主节点太多" description: "{{ $labels.instance }} 主节点太多\n 当前值 = {{ $value }}" - alert: 警报!Slave连接不可达 expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis丢失slave节点" description: "Redis slave不可达.请确认主从同步状态\n 当前值 = {{ $value }}" - alert: 警报!Redis副本不一致 expr: delta(redis_connected_slaves[1m]) < 0 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis 副本不一致" description: "Redis集群丢失一个slave节点\n 当前值 = {{ $value }}" - alert: 警报!Redis集群抖动 expr: changes(redis_connected_slaves[1m]) > 1 for: 2m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis集群抖动" description: "Redis集群抖动,请检查.\n 当前值 = {{ $value }}" - alert: 警报!持久化失败 expr: (time() - redis_rdb_last_save_timestamp_seconds) / 3600 > 24 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} Redis持久化失败" description: "Redis持久化失败(>24小时)\n 当前值 = {{ printf \"%.1f\" $value }}小时" - alert: 警报!内存不足 expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 for: 2m labels: severity: 一般告警 annotations: summary: "{{ $labels.instance 
}}系统内存不足" description: "Redis占用系统内存(> 90%)\n 当前值 = {{ printf \"%.2f\" $value }}%" - alert: 警报!Maxmemory不足 expr: redis_config_maxmemory !=0 and redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80 for: 2m labels: severity: 一般告警 annotations: summary: "{{ $labels.instance }} Maxmemory设置太小" description: "超出设置最大内存(> 80%)\n 当前值 = {{ printf \"%.2f\" $value }}%" - alert: 警报!连接数太多 expr: redis_connected_clients > 200 for: 2m labels: severity: 一般告警 annotations: summary: "{{ $labels.instance }} 实时连接数太多" description: "连接数太多(>200)\n 当前值 = {{ $value }}" - alert: 警报!连接数太少 expr: redis_connected_clients < 1 for: 2m labels: severity: 一般告警 annotations: summary: "{{ $labels.instance }} 实时连接数太少" description: "连接数(<1)\n 当前值 = {{ $value }}" - alert: 警报!拒绝连接数 expr: increase(redis_rejected_connections_total[1m]) > 0 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} 拒绝连接" description: "Redis有拒绝连接,请检查连接数配置\n 当前值 = {{ printf \"%.0f\" $value }}" - alert: 警报!执行命令数大于1000 expr: rate(redis_commands_processed_total[1m]) > 1000 for: 0m labels: severity: 严重告警 annotations: summary: "{{ $labels.instance }} 执行命令次数太多" description: "Redis执行命令次数太多\n 当前值 = {{ printf \"%.0f\" $value }}"
标签: k8s kubernetes helm redis rediscluster redisreplication redissentinel ot ot-redis-operator
发表评论