Description:
I've deployed a new PSMDB operator plus a sharded MongoDB cluster to a fresh test cluster running AWS EKS 1.32.
However, the PerconaServerMongoDB resource is stuck in an error state:
NAME ENDPOINT STATUS AGE
psmdb-default-sharded acme-prod-psmdb-default-sharded-572b8d37b9e51ccf.elb.us-east-2.amazonaws.com error 16h
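For reference, the table above and the full status dump below come from:
kubectl -n mongodb get psmdb
kubectl -n mongodb describe psmdb psmdb-default-sharded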
Name: psmdb-default-sharded
Namespace: mongodb
Labels: app.kubernetes.io/instance=psmdb-default-sharded
app.kubernetes.io/managed-by=Helm
app.kubernetes.io/name=psmdb-default-sharded
app.kubernetes.io/version=1.19.1
argocd.argoproj.io/instance=psmdb-default-sharded
helm.sh/chart=psmdb-db-1.19.1
Annotations: <none>
API Version: psmdb.percona.com/v1
Kind: PerconaServerMongoDB
Metadata:
Creation Timestamp: 2025-04-28T16:17:53Z
Finalizers:
percona.com/delete-psmdb-pods-in-order
Generation: 2
Resource Version: 1462659
UID: f84ea80c-9a9d-4e89-9129-f71b98f8850d
Spec:
Backup:
Enabled: true
Image: percona/percona-backup-mongodb:2.8.0-multi
Pitr:
Enabled: false
Cr Version: 1.19.1
Enable Volume Expansion: true
Image: percona/percona-server-mongodb:8.0.4-2
Image Pull Policy: Always
Multi Cluster:
Enabled: false
Pause: false
Pmm:
Enabled: false
Image: percona/pmm-client:2.44.0
Server Host: monitoring-service
Replsets:
Affinity:
Anti Affinity Topology Key: kubernetes.io/hostname
Arbiter:
Affinity:
Anti Affinity Topology Key: kubernetes.io/hostname
Enabled: false
Size: 1
Expose:
Enabled: false
Type: ClusterIP
Name: rs0
Node Selector:
Karpenter - Node - Pool: mongodb-sharded
Nonvoting:
Affinity:
Anti Affinity Topology Key: kubernetes.io/hostname
Enabled: false
Pod Disruption Budget:
Max Unavailable: 1
Resources:
Limits:
Cpu: 300m
Memory: 0.5G
Requests:
Cpu: 300m
Memory: 0.5G
Size: 3
Volume Spec:
Persistent Volume Claim:
Resources:
Requests:
Storage: 3Gi
Pod Disruption Budget:
Max Unavailable: 1
Resources:
Limits:
Cpu: 2
Memory: 4Gi
Requests:
Cpu: 300m
Memory: 500M
Size: 3
Tolerations:
Effect: NoSchedule
Key: karpenter/mongodb-sharded
Operator: Exists
Volume Spec:
Persistent Volume Claim:
Resources:
Requests:
Storage: 100Gi
Storage Class Name: mongodb
Affinity:
Expose:
Enabled: false
Name: rs1
Node Selector:
Karpenter - Node - Pool: mongodb-sharded
Resources:
Limits:
Cpu: 2
Memory: 4Gi
Requests:
Cpu: 300m
Memory: 500M
Size: 3
Tolerations:
Effect: NoSchedule
Key: karpenter/mongodb-sharded
Operator: Exists
Volume Spec:
Persistent Volume Claim:
Resources:
Requests:
Storage: 100Gi
Storage Class Name: mongodb
Affinity:
Expose:
Enabled: false
Name: rs2
Node Selector:
Karpenter - Node - Pool: mongodb-sharded
Resources:
Limits:
Cpu: 2
Memory: 4Gi
Requests:
Cpu: 300m
Memory: 500M
Size: 3
Tolerations:
Effect: NoSchedule
Key: karpenter/mongodb-sharded
Operator: Exists
Volume Spec:
Persistent Volume Claim:
Resources:
Requests:
Storage: 100Gi
Storage Class Name: mongodb
Secrets:
Users: psmdb-default-sharded-secrets
Sharding:
Balancer:
Enabled: true
Configsvr Repl Set:
Affinity:
Anti Affinity Topology Key: kubernetes.io/hostname
Expose:
Enabled: false
Type: ClusterIP
Node Selector:
Karpenter - Node - Pool: mongodb-sharded
Pod Disruption Budget:
Max Unavailable: 1
Resources:
Limits:
Cpu: 300m
Memory: 0.5G
Requests:
Cpu: 300m
Memory: 0.5G
Size: 3
Tolerations:
Effect: NoSchedule
Key: karpenter/mongodb-sharded
Operator: Exists
Volume Spec:
Persistent Volume Claim:
Resources:
Requests:
Storage: 3Gi
Enabled: true
Mongos:
Affinity:
Anti Affinity Topology Key: kubernetes.io/hostname
Expose:
Annotations:
service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
service.beta.kubernetes.io/aws-load-balancer-name: acme-prod-psmdb-default-sharded
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: instance
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
Type: LoadBalancer
Node Selector:
Karpenter - Node - Pool: mongodb-sharded
Pod Disruption Budget:
Max Unavailable: 1
Resources:
Limits:
Cpu: 300m
Memory: 0.5G
Requests:
Cpu: 300m
Memory: 0.5G
Size: 3
Tolerations:
Effect: NoSchedule
Key: karpenter/mongodb-sharded
Operator: Exists
Unmanaged: false
Unsafe Flags:
Backup If Unhealthy: false
Mongos Size: false
Replset Size: false
Termination Grace Period: false
Tls: false
Update Strategy: SmartUpdate
Upgrade Options:
Apply: disabled
Schedule: 0 2 * * *
Set FCV: false
Version Service Endpoint: https://check.percona.com
Status:
Conditions:
Last Transition Time: 2025-04-28T16:17:53Z
Status: True
Type: sharding
Last Transition Time: 2025-04-28T16:17:57Z
Status: True
Type: initializing
Last Transition Time: 2025-04-28T16:22:12Z
Message: create system users: failed to get mongo client: ping mongo: connection() error occurred during connection handshake: context canceled
Reason: ErrorReconcile
Status: True
Type: error
Last Transition Time: 2025-04-28T16:34:43Z
Message: rs2: ready
Reason: RSReady
Status: True
Type: ready
Host: acme-prod-psmdb-default-sharded-572b8d37b9e51ccf.elb.us-east-2.amazonaws.com
Message: Error: dial: ping mongo: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism "SCRAM-SHA-1": (AuthenticationFailed) Authentication failed.
Mongos:
Ready: 3
Size: 3
Status: ready
Observed Generation: 2
Ready: 14
Replsets:
Cfg:
Initialized: true
Members:
psmdb-default-sharded-cfg-0:
Name: psmdb-default-sharded-cfg-0.psmdb-default-sharded-cfg.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
psmdb-default-sharded-cfg-1:
Name: psmdb-default-sharded-cfg-1.psmdb-default-sharded-cfg.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
psmdb-default-sharded-cfg-2:
Name: psmdb-default-sharded-cfg-2.psmdb-default-sharded-cfg.mongodb.svc.cluster.local:27017
State: 1
State Str: PRIMARY
Ready: 3
Size: 3
Status: ready
rs0:
added_as_shard: true
Initialized: true
Members:
psmdb-default-sharded-rs0-0:
Name: psmdb-default-sharded-rs0-0.psmdb-default-sharded-rs0.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
psmdb-default-sharded-rs0-1:
Name: psmdb-default-sharded-rs0-1.psmdb-default-sharded-rs0.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
psmdb-default-sharded-rs0-2:
Name: psmdb-default-sharded-rs0-2.psmdb-default-sharded-rs0.mongodb.svc.cluster.local:27017
State: 1
State Str: PRIMARY
Ready: 3
Size: 3
Status: ready
rs1:
added_as_shard: true
Initialized: true
Members:
psmdb-default-sharded-rs1-0:
Name: psmdb-default-sharded-rs1-0.psmdb-default-sharded-rs1.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
psmdb-default-sharded-rs1-1:
Name: psmdb-default-sharded-rs1-1.psmdb-default-sharded-rs1.mongodb.svc.cluster.local:27017
State: 1
State Str: PRIMARY
psmdb-default-sharded-rs1-2:
Name: psmdb-default-sharded-rs1-2.psmdb-default-sharded-rs1.mongodb.svc.cluster.local:27017
State: 2
State Str: SECONDARY
Ready: 3
Size: 3
Status: ready
rs2:
added_as_shard: true
Initialized: true
Ready: 2
Size: 3
Status: initializing
Size: 15
State: error
Events: <none>
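For what it's worth, the auth error in the status Message above can be checked by hand by pulling the cluster-admin credentials from the users secret and pinging mongos from inside the cluster. A minimal sketch, assuming the operator's default secret key names and that mongosh is present in the mongos image:
kubectl -n mongodb get secret psmdb-default-sharded-secrets -o jsonpath='{.data.MONGODB_CLUSTER_ADMIN_USER}' | base64 -d; echo
kubectl -n mongodb get secret psmdb-default-sharded-secrets -o jsonpath='{.data.MONGODB_CLUSTER_ADMIN_PASSWORD}' | base64 -d; echo
kubectl -n mongodb exec -it psmdb-default-sharded-mongos-0 -- mongosh "mongodb://<user>:<password>@localhost:27017/admin" --eval "db.runCommand({ ping: 1 })"
(<user> and <password> are the decoded values from the two secret reads above.)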
kubectl get all -n mongodb
NAME READY STATUS RESTARTS AGE
pod/psmdb-default-sharded-cfg-0 2/2 Running 0 12m
pod/psmdb-default-sharded-cfg-1 2/2 Running 0 10m
pod/psmdb-default-sharded-cfg-2 2/2 Running 0 9m2s
pod/psmdb-default-sharded-mongos-0 1/1 Running 0 8m59s
pod/psmdb-default-sharded-mongos-1 1/1 Running 0 12m
pod/psmdb-default-sharded-mongos-2 1/1 Running 0 10m
pod/psmdb-default-sharded-rs0-0 2/2 Running 0 12m
pod/psmdb-default-sharded-rs0-1 2/2 Running 0 10m
pod/psmdb-default-sharded-rs0-2 2/2 Running 0 9m2s
pod/psmdb-default-sharded-rs1-0 2/2 Running 0 12m
pod/psmdb-default-sharded-rs1-1 2/2 Running 0 10m
pod/psmdb-default-sharded-rs1-2 2/2 Running 0 9m2s
pod/psmdb-default-sharded-rs2-0 2/2 Running 0 12m
pod/psmdb-default-sharded-rs2-1 2/2 Running 0 10m
pod/psmdb-default-sharded-rs2-2 1/2 Running 3 (2s ago) 9m17s
pod/psmdb-operator-b6b76bbdc-g746t 1/1 Running 0 16h
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/psmdb-default-sharded-cfg ClusterIP None <none> 27017/TCP 16h
service/psmdb-default-sharded-metrics ClusterIP 172.20.93.242 <none> 9216/TCP 16h
service/psmdb-default-sharded-mongos LoadBalancer 172.20.95.86 acme-prod-psmdb-default-sharded-572b8d37b9e51ccf.elb.us-east-2.amazonaws.com 27017:31270/TCP 16h
service/psmdb-default-sharded-rs0 ClusterIP None <none> 27017/TCP 16h
service/psmdb-default-sharded-rs1 ClusterIP None <none> 27017/TCP 16h
service/psmdb-default-sharded-rs2 ClusterIP None <none> 27017/TCP 16h
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/psmdb-operator 1/1 1 1 16h
NAME DESIRED CURRENT READY AGE
replicaset.apps/psmdb-operator-b6b76bbdc 1 1 1 16h
NAME READY AGE
statefulset.apps/psmdb-default-sharded-cfg 3/3 16h
statefulset.apps/psmdb-default-sharded-mongos 3/3 12h
statefulset.apps/psmdb-default-sharded-rs0 3/3 16h
statefulset.apps/psmdb-default-sharded-rs1 3/3 16h
statefulset.apps/psmdb-default-sharded-rs2 2/3 16h
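The only pod not fully ready is psmdb-default-sharded-rs2-2 (1/2, restarting). A sketch of how to pull its mongod container logs and the operator logs (the container name mongod is assumed from the operator's pod layout):
kubectl -n mongodb logs psmdb-default-sharded-rs2-2 -c mongod --previous --tail=100
kubectl -n mongodb logs deploy/psmdb-operator --tail=200 | grep -i rs2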
Steps to Reproduce:
Deploy the psmdb-db 1.19.1 chart with the values below (templated; ${db_name} is substituted at render time):
finalizers:
- delete-psmdb-pods-in-order
nameOverride: ${db_name}
upgradeOptions:
apply: disabled
image:
repository: percona/percona-server-mongodb
tag: 8.0.4-2
enableVolumeExpansion: true
replsets:
rs0:
name: rs0
size: 3
tolerations:
- key: "karpenter/mongodb-sharded"
operator: "Exists"
effect: "NoSchedule"
nodeSelector:
karpenter-node-pool: mongodb-sharded
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/replset: rs0
topologyKey: "kubernetes.io/hostname"
resources:
limits:
cpu: 2
memory: 4Gi
requests:
cpu: 300m
memory: 500M
expose:
enabled: false
volumeSpec:
pvc:
storageClassName: mongodb
resources:
requests:
storage: 100Gi
rs1:
name: rs1
size: 3
tolerations:
- key: "karpenter/mongodb-sharded"
operator: "Exists"
effect: "NoSchedule"
nodeSelector:
karpenter-node-pool: mongodb-sharded
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/replset: rs0
topologyKey: "kubernetes.io/hostname"
resources:
limits:
cpu: 2
memory: 4Gi
requests:
cpu: 300m
memory: 500M
expose:
enabled: false
volumeSpec:
pvc:
storageClassName: mongodb
resources:
requests:
storage: 100Gi
rs2:
name: rs2
size: 3
tolerations:
- key: "karpenter/mongodb-sharded"
operator: "Exists"
effect: "NoSchedule"
nodeSelector:
karpenter-node-pool: mongodb-sharded
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/replset: rs0
topologyKey: "kubernetes.io/hostname"
resources:
limits:
cpu: 2
memory: 4Gi
requests:
cpu: 300m
memory: 500M
expose:
enabled: false
volumeSpec:
pvc:
storageClassName: mongodb
resources:
requests:
storage: 100Gi
backup:
enabled: true
sharding:
enabled: true
balancer:
enabled: true
configrs:
size: 3
tolerations:
- key: "karpenter/mongodb-sharded"
operator: "Exists"
effect: "NoSchedule"
nodeSelector:
karpenter-node-pool: mongodb-sharded
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/component: cfg
mongos:
size: 3
expose:
enabled: true
type: LoadBalancer
annotations:
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: instance
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
service.beta.kubernetes.io/aws-load-balancer-name: acme-prod-psmdb-default-sharded
tolerations:
- key: "karpenter/mongodb-sharded"
operator: "Exists"
effect: "NoSchedule"
nodeSelector:
karpenter-node-pool: mongodb-sharded
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/component: mongos
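We apply these values through Argo CD; for completeness, the plain-Helm equivalent would be roughly the following (release and values-file names are illustrative):
helm repo add percona https://percona.github.io/percona-helm-charts/
helm upgrade --install psmdb-default-sharded percona/psmdb-db --version 1.19.1 -n mongodb -f values.yaml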
Version:
helm ls -n mongodb
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
psmdb-operator mongodb 1 2025-04-28 17:17:01.915939422 +0100 BST deployed psmdb-operator-1.19.1 1.19.1
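Note that helm ls only lists the operator chart; the database chart itself is managed as an Argo CD application (see the argocd.argoproj.io/instance label above), so its sync state would be checked with the argocd CLI instead, e.g. (app name assumed to match that label):
argocd app get psmdb-default-sharded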
Logs:
See the mongodb-healthcheck liveness error for psmdb-default-sharded-rs2-2 under Additional Information below.
Expected Result:
All replica sets (cfg, rs0, rs1, rs2) and mongos reach ready and the PerconaServerMongoDB resource reports STATUS ready.
Actual Result:
cfg, rs0, rs1 and mongos are ready, but rs2 is stuck initializing (2/3 members ready, psmdb-default-sharded-rs2-2 keeps restarting with authentication failures), so the resource stays in the error state.
Additional Information:
The issue only seems to affect one shard, rs2. What I've tried so far:
- I've beefed up the Karpenter node size in case it was a resource issue.
- I've restarted rs2 and even deleted its pods (roughly the commands sketched below).
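A minimal sketch of the restart attempts (StatefulSet and pod names taken from the outputs above):
kubectl -n mongodb rollout restart statefulset/psmdb-default-sharded-rs2
kubectl -n mongodb delete pod psmdb-default-sharded-rs2-2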
The liveness check on psmdb-default-sharded-rs2-2 keeps failing with the following authentication error:
> 2025-04-29T08:31:26.864Z ERROR Failed to perform check {"error": "member failed Kubernetes liveness check: connection error: filed to dial mongo: ping mongo: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism \"SCRAM-SHA-1\": (AuthenticationFailed) Authentication failed.", "errorVerbose": "connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism \"SCRAM-SHA-1\": (AuthenticationFailed) Authentication failed.\nping mongo\ngithub.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo.Dial\n\t/go/src/github.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo/mongo.go:123\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/db.Dial\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/db/db.go:48\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck.HealthCheckMongodLiveness\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck/health.go:62\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool.(*App).Run\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool/tool.go:95\nmain.main\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/main.go:67\nruntime.main\n\t/usr/local/go/src/runtime/proc.go:272\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_arm64.s:1223\nfiled to dial mongo\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/db.Dial\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/db/db.go:50\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck.HealthCheckMongodLiveness\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck/health.go:62\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool.(*App).Run\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool/tool.go:95\nmain.main\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/main.go:67\nruntime.main\n\t/usr/local/go/src/runtime/proc.go:272\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_arm64.s:1223\nconnection error\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck.HealthCheckMongodLiveness\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/healthcheck/health.go:64\ngithub.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool.(*App).Run\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/tool/tool.go:95\nmain.main\n\t/go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/main.go:67\nruntime.main\n\t/usr/local/go/src/runtime/proc.go:272\nruntime.goexit\n\t/usr/local/go/src/runtime/asm_arm64.s:1223\nmember failed Kubernetes liveness check"}
> main.main
> /go/src/github.com/percona/percona-server-mongodb-operator/cmd/mongodb-healthcheck/main.go:68
> runtime.main
> /usr/local/go/src/runtime/proc.go:272