Description:
Recently upgraded our mongodb operator and server from CR 1.14.0 to 1.16.1 and mongo server to version 6.0.15-12.
The mongodb server liveness probe is occasionally failing every hour or so.
Can someone help me investigate what is happening? This has appeared after the successful upgrade in production.
Steps to Reproduce:
operator.values.yaml
replicaCount: 1
image:
repository: percona/percona-server-mongodb-operator
tag: 1.16.1
pullPolicy: IfNotPresent
# set if you want to specify a namespace to watch
# defaults to `.Release.namespace` if left blank
# watchNamespace:
# set if operator should be deployed in cluster wide mode. defaults to false
watchAllNamespaces: false
# rbac: settings for deployer RBAC creation
rbac:
# rbac.create: if false RBAC resources should be in place
create: true
# serviceAccount: settings for Service Accounts used by the deployer
serviceAccount:
# serviceAccount.create: Whether to create the Service Accounts or not
create: true
podAnnotations: {}
# prometheus.io/scrape: "true"
# prometheus.io/port: "8080"
podSecurityContext: {}
# runAsNonRoot: true
# runAsUser: 2
# runAsGroup: 2
# fsGroup: 2
# fsGroupChangePolicy: "OnRootMismatch"
securityContext: {}
# allowPrivilegeEscalation: false
# capabilities:
# drop:
# - ALL
# seccompProfile:
# type: RuntimeDefault
# set if you want to use a different operator name
# defaults to `percona-server-mongodb-operator`
# operatorName:
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
env:
resyncPeriod: 5s
logVerbose: false
resources: {}
# We usually recommend not to specify default resources and to leave this as a conscious
# choice for the user. This also increases chances charts run on environments with little
# resources, such as Minikube. If you do want to specify resources, uncomment the following
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
nodeSelector:
acm/node-type: ops
tolerations: []
affinity: {}
server.values
# Default values for psmdb-cluster.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# Platform type: kubernetes, openshift
# platform: kubernetes
# Cluster DNS Suffix
# clusterServiceDNSSuffix: svc.cluster.local
# clusterServiceDNSMode: "Internal"
finalizers:
## Set this if you want that operator deletes the primary pod last
- delete-psmdb-pods-in-order
## Set this if you want to delete database persistent volumes on cluster deletion
# - delete-psmdb-pvc
nameOverride: ""
fullnameOverride: ""
crVersion: 1.16.1
pause: false
unmanaged: false
unsafeFlags:
replsetSize: true
# ignoreAnnotations:
# - service.beta.kubernetes.io/aws-load-balancer-backend-protocol
# ignoreLabels:
# - rack
multiCluster:
enabled: false
# DNSSuffix: svc.clusterset.local
updateStrategy: SmartUpdate
upgradeOptions:
versionServiceEndpoint: https://check.percona.com
apply: disabled
schedule: "0 2 * * *"
setFCV: false
image:
repository: percona/percona-server-mongodb
tag: 6.0.15-12
imagePullPolicy: Always
# imagePullSecrets: []
# initImage:
# repository: percona/percona-server-mongodb-operator
# tag: 1.14.0
# initContainerSecurityContext: {}
# tls:
# # 90 days in hours
# certValidityDuration: 2160h
secrets: {}
# If you set users secret here the operator will use existing one or generate random values
# If not set the operator generates the default secret with name <cluster_name>-secrets
# users: my-cluster-name-secrets
# encryptionKey: my-cluster-name-mongodb-encryption-key
pmm:
enabled: false
image:
repository: percona/pmm-client
tag: 2.41.2
serverHost: monitoring-service
replsets:
rs0:
size: 1
# externalNodes:
# - host: 34.124.76.90
# - host: 34.124.76.91
# port: 27017
# votes: 0
# priority: 0
# - host: 34.124.76.92
# configuration: |
# operationProfiling:
# mode: slowOp
# systemLog:
# verbosity: 1
antiAffinityTopologyKey: "kubernetes.io/hostname"
# tolerations: []
# priorityClass: ""
# annotations: {}
# labels: {}
nodeSelector:
acme/node-type: "mongodb"
# livenessProbe:
# failureThreshold: 4
# initialDelaySeconds: 60
# periodSeconds: 30
# timeoutSeconds: 10
# startupDelaySeconds: 7200
# readinessProbe:
# failureThreshold: 8
# initialDelaySeconds: 10
# periodSeconds: 3
# successThreshold: 1
# timeoutSeconds: 2
# runtimeClassName: image-rc
# storage:
# engine: wiredTiger
# wiredTiger:
# engineConfig:
# cacheSizeRatio: 0.5
# directoryForIndexes: false
# journalCompressor: snappy
# collectionConfig:
# blockCompressor: snappy
# indexConfig:
# prefixCompression: true
# inMemory:
# engineConfig:
# inMemorySizeRatio: 0.5
sidecars:
- image: percona/mongodb_exporter:0.36
env:
- name: EXPORTER_USER
valueFrom:
secretKeyRef:
name: psmdb-db-internal-secrets
key: MONGODB_CLUSTER_MONITOR_USER
- name: EXPORTER_PASS
valueFrom:
secretKeyRef:
name: psmdb-db-internal-secrets
key: MONGODB_CLUSTER_MONITOR_PASSWORD
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: MONGODB_URI
value: "mongodb://$(EXPORTER_USER):$(EXPORTER_PASS)@$(POD_IP):27017"
args: ["--discovering-mode", "--compatible-mode", "--collect-all", "--mongodb.uri=$(MONGODB_URI)"]
name: metrics
# volumeMounts:
# - mountPath: /volume1
# name: sidecar-volume-claim
# - mountPath: /secret
# name: sidecar-secret
# - mountPath: /configmap
# name: sidecar-config
# sidecarVolumes:
# - name: sidecar-secret
# secret:
# secretName: mysecret
# - name: sidecar-config
# configMap:
# name: myconfigmap
# sidecarPVCs:
# - apiVersion: v1
# kind: PersistentVolumeClaim
# metadata:
# name: sidecar-volume-claim
# spec:
# resources:
# requests:
# storage: 1Gi
# volumeMode: Filesystem
# accessModes:
# - ReadWriteOnce
podDisruptionBudget:
maxUnavailable: 1
expose:
enabled: true
exposeType: LoadBalancer
# loadBalancerSourceRanges:
# - 10.0.0.0/8
serviceAnnotations:
# Consider enabling cross zone load balancing and s3 logs
# service.beta.kubernetes.io/aws-load-balancer-target-group-attributes: preserve_client_ip.enabled=false
# service.beta.kubernetes.io/aws-load-balancer-scheme: internal
# service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
# serviceLabels:
# some-label: some-key
nonvoting:
enabled: false
# podSecurityContext: {}
# containerSecurityContext: {}
size: 3
# configuration: |
# operationProfiling:
# mode: slowOp
# systemLog:
# verbosity: 1
antiAffinityTopologyKey: "kubernetes.io/hostname"
# tolerations: []
# priorityClass: ""
# annotations: {}
# labels: {}
# nodeSelector: {}
podDisruptionBudget:
maxUnavailable: 1
resources:
limits:
cpu: "300m"
memory: "0.5G"
requests:
cpu: "300m"
memory: "0.5G"
volumeSpec:
# emptyDir: {}
# hostPath:
# path: /data
pvc:
# annotations:
# volume.beta.kubernetes.io/storage-class: example-hostpath
# labels:
# rack: rack-22
# storageClassName: standard
# accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 3Gi
arbiter:
enabled: false
size: 1
antiAffinityTopologyKey: "kubernetes.io/hostname"
# tolerations: []
# priorityClass: ""
# annotations: {}
# labels: {}
# nodeSelector: {}
# schedulerName: ""
# resources:
# limits:
# cpu: "300m"
# memory: "0.5G"
# requests:
# cpu: "300m"
# memory: "0.5G"
volumeSpec:
# emptyDir: {}
# hostPath:
# path: /data
pvc:
# annotations:
# volume.beta.kubernetes.io/storage-class: example-hostpath
# labels:
# rack: rack-22
storageClassName: mongodb
# accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 250Gi
sharding:
enabled: false
backup:
enabled: false
image:
repository: percona/percona-backup-mongodb
tag: 2.0.5
serviceAccountName: percona-server-mongodb-operator
# annotations:
# iam.amazonaws.com/role: arn:aws:iam::700849607999:role/acme-test-default-eks-mongodb
# resources:
# limits:
# cpu: "300m"
# memory: "0.5G"
# requests:
# cpu: "300m"
# memory: "0.5G"
storages:
s3-us-east:
type: s3
s3:
bucket: acme-prod-mongodb-backup
credentialsSecret: prod-aws-mongodb
region: us-east-2
prefix: ""
uploadPartSize: 10485760
maxUploadParts: 10000
storageClass: STANDARD
insecureSkipTLSVerify: false
# minio:
# type: s3
# s3:
# bucket: MINIO-BACKUP-BUCKET-NAME-HERE
# region: us-east-1
# credentialsSecret: my-cluster-name-backup-minio
# endpointUrl: http://minio.psmdb.svc.cluster.local:9000/minio/
# prefix: ""
# azure-blob:
# type: azure
# azure:
# container: CONTAINER-NAME
# prefix: PREFIX-NAME
# credentialsSecret: SECRET-NAME
pitr:
enabled: false
# oplogSpanMin: 10
# compressionType: gzip
# compressionLevel: 6
tasks:
- name: "daily-s3-backup"
enabled: true
schedule: "0 1 * * *"
keep: 3
type: logical
storageName: s3-us-east
# - name: daily-s3-us-west
# enabled: true
# schedule: "0 0 * * *"
# keep: 3
# storageName: s3-us-west
# compressionType: gzip
# - name: weekly-s3-us-west
# enabled: false
# schedule: "0 0 * * 0"
# keep: 5
# storageName: s3-us-west
# compressionType: gzip
# - name: weekly-s3-us-west-physical
# enabled: false
# schedule: "0 5 * * 0"
# keep: 5
# type: physical
# storageName: s3-us-west
# compressionType: gzip
# compressionLevel: 6
# If you set users here the secret will be constructed by helm with these values
# users:
# MONGODB_BACKUP_USER: backup
# MONGODB_BACKUP_PASSWORD: backup123456
# MONGODB_DATABASE_ADMIN_USER: databaseAdmin
# MONGODB_DATABASE_ADMIN_PASSWORD: databaseAdmin123456
# MONGODB_CLUSTER_ADMIN_USER: clusterAdmin
# MONGODB_CLUSTER_ADMIN_PASSWORD: clusterAdmin123456
# MONGODB_CLUSTER_MONITOR_USER: clusterMonitor
# MONGODB_CLUSTER_MONITOR_PASSWORD: clusterMonitor123456
# MONGODB_USER_ADMIN_USER: userAdmin
# MONGODB_USER_ADMIN_PASSWORD: userAdmin123456
# PMM_SERVER_API_KEY: apikey
# # PMM_SERVER_USER: admin
# # PMM_SERVER_PASSWORD: admin
Version:
operator + server cr: v1.16.1
mongodb server: 6.0.15-12
aws eks: v1.26
Logs:
This error repeats
2024-07-18T10:24:34.076Z ERROR failed to reconcile cluster {"controller": "psmdb-controller", "object": {"name":"psmdb-db-internal","namespace":"mongodb"}, "namespace": "mongodb", "name": "psmdb-db-internal", "reconcileID": "7a643b99-d025-4ac2-bf12-6d3662dc2acc", "replset": "rs0", "error": "dial: ping mongo: connection() error occurred during connection handshake: dial tcp: lookup psmdb-db-internal-rs0-0.psmdb-db-internal-rs0.mongodb.svc.cluster.local on 172.20.0.10:53: no such host", "errorVerbose": "connection() error occurred during connection handshake: dial tcp: lookup psmdb-db-internal-rs0-0.psmdb-db-internal-rs0.mongodb.svc.cluster.local on 172.20.0.10:53: no such host\nping mongo\ngithub.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo.Dial\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo/mongo.go:112\ngithub.com/percona/percona-server-mongodb-operator/pkg/psmdb.MongoClient\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/psmdb/client.go:62\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*mongoClientProvider).Mongo\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/connections.go:38\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).mongoClientWithRole\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/connections.go:60\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).reconcileCluster\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/mgo.go:89\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).reconcileReplsets\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_c
ontroller.go:551\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).Reconcile\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_controller.go:402\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Reconcile\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:114\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:311\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:261\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Start.func2.2\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:222\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_amd64.s:1695\ndial\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).reconcileCluster\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/mgo.go:95\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).reconcileReplsets\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_controller.go:551\ngithub.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).Reconcile\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_controller.go:402\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Reconcile\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:114\n
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:311\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:261\nsigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Start.func2.2\n\t/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:222\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_amd64.s:1695"}
github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).reconcileReplsets
/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_controller.go:553
github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb.(*ReconcilePerconaServerMongoDB).Reconcile
/go/src/github.com/percona/percona-server-mongodb-operator/pkg/controller/perconaservermongodb/psmdb_controller.go:402
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Reconcile
/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:114
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).reconcileHandler
/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:311
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).processNextWorkItem
/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:261
sigs.k8s.io/controller-runtime/pkg/internal/controller.(*Controller).Start.func2.2
/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.18.1/pkg/internal/controller/controller.go:222
2024-07-18T10:24:37.054Z INFO Cluster state changed {"controller": "psmdb-controller", "object": {"name":"psmdb-db-internal","namespace":"mongodb"}, "namespace": "mongodb", "name": "psmdb-db-internal", "reconcileID": "376fb458-f746-4e65-9291-59ed48d16797", "previous": "initializing", "current": "ready"}
Expected Result:
I expect the liveness probe to not fail.
Actual Result:
Currently liveness probe is failing causing the pod to restart.
Additional Information:
kubectl get all -n mongodb
pod/psmdb-db-internal-rs0-0 2/2 Running 4 (24m ago) 3h22m
pod/psmdb-operator-76967f4b99-p9glv 1/1 Running 0 3h19m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/psmdb-db-internal-rs0 ClusterIP None <none> 27017/TCP 287d
service/psmdb-db-internal-rs0-0 LoadBalancer 172.20.186.15 k8s-mongodb-psmdbdbi-1e0cbbc5c7-fb1c79a8f9dd8089.elb.us-east-2.amazonaws.com 27017:30910/TCP 287d
service/psmdb-internal-metrics ClusterIP 172.20.12.124 <none> 9216/TCP 287d
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/psmdb-operator 1/1 1 1 439d
NAME DESIRED CURRENT READY AGE
replicaset.apps/psmdb-operator-76967f4b99 1 1 1 3h30m
replicaset.apps/psmdb-operator-869b9b99d 0 0 0 439d
NAME READY AGE
statefulset.apps/psmdb-db-internal-rs0 1/1 287d