Hello. I am building a 3x3 sharded replica set on EKS, and for days now I have been running into the same weird situation:
- coredns has records for all the pods in rs1 and rs2, but none of the pods in rs0
- rs0 fails to reconcile because the pods can’t reach one another, so they crashloop
- mongos fails to start because it’s waiting for rs0 (‘server selection timeout’)
- psmdb remains in error condition because it can’t find a primary.
ConfigRS, RS1 and RS2 are all stable, and the replica set configurations are identical. I have not been able to find any pattern related to AZ or node placement. It does not appear to be a network connectivity problem: coredns receives and logs the requests, but returns no answer.
I’ve destroyed and rebuilt the cluster five or six times and it’s always rs0 that fails. Close monitoring inside coredns suggests that the rs0 records are being created and taken down again. Sometimes I can briefly resolve rs0-0 but then it goes away.
Can anyone please suggest why that would happen?
In the mongod pod:
{"t":{"$date":"2025-09-01T23:20:09.097+00:00"},"s":"I", "c":"-", "id":4333222, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"RSM received error response","attr":{"host":"mdb-production-psmdb-mongodb-production-rs0-2.mdb-production-psmdb-mongodb-production-rs0.mongodb.svc.cluster.local:27018","error":"HostUnreachable: Error connecting to mdb-production-psmdb-mongodb-production-rs0-2.mdb-production-psmdb-mongodb-production-rs0.mongodb.svc.cluster.local:27018 :: caused by :: Could not find address for mdb-production-psmdb-mongodb-production-rs0-2.mdb-production-psmdb-mongodb-production-rs0.mongodb.svc.cluster.local:27018: SocketException: onInvoke :: caused by :: Host not found (authoritative)","replicaSet":"mongodb-production-rs0","response":{}}}
In the coredns pod:
/ # nslookup mdb-production-psmdb-mongodb-production-rs0-2.mdb-production-psmdb-mongodb-production-rs0.mongodb.svc.cluster.local localhost
Server: 127.0.0.1
Address 1: 127.0.0.1 localhost
nslookup: can't resolve 'mdb-production-psmdb-mongodb-production-rs0-2.mdb-production-psmdb-mongodb-production-rs0.mongodb.svc.cluster.local'
Name: mdb-production-psmdb-mongodb-production-rs1-2.mdb-production-psmdb-mongodb-production-rs1.mongodb.svc.cluster.local
Address 1: 10.10.2.194 mdb-production-psmdb-mongodb-production-rs1-2.mdb-production-psmdb-mongodb-production-rs1.mongodb.svc.cluster.local
The values file below is populated by Terraform, and the mongos NodePort addresses are captured for use by other services.
# Shard replica sets. rs0, rs1 and rs2 carry identical settings and differ
# only in `name`. `${...}` placeholders are filled in by the template renderer.
# NOTE(review): nesting was reconstructed from a paste that had lost its
# indentation -- confirm against the chart's values schema.
replsets:
  rs0:
    name: "mongodb-${stage}-rs0"
    size: ${replica_count}
    nodeSelector:
      Role: "mongodb"
    affinity:
      antiAffinityTopologyKey: "kubernetes.io/hostname"
    volumeSpec:
      pvc:
        storageClassName: "${storage_class}"
        resources:
          requests:
            storage: "${volume_size}"
    # Literal block passed through as the mongod configuration for this shard.
    configuration: |
      net:
        port: 27018
      security:
        enableEncryption: true
  rs1:
    name: "mongodb-${stage}-rs1"
    size: ${replica_count}
    nodeSelector:
      Role: "mongodb"
    affinity:
      antiAffinityTopologyKey: "kubernetes.io/hostname"
    volumeSpec:
      pvc:
        storageClassName: "${storage_class}"
        resources:
          requests:
            storage: "${volume_size}"
    configuration: |
      net:
        port: 27018
      security:
        enableEncryption: true
  rs2:
    name: "mongodb-${stage}-rs2"
    size: ${replica_count}
    nodeSelector:
      Role: "mongodb"
    affinity:
      antiAffinityTopologyKey: "kubernetes.io/hostname"
    volumeSpec:
      pvc:
        storageClassName: "${storage_class}"
        resources:
          requests:
            storage: "${volume_size}"
    configuration: |
      net:
        port: 27018
      security:
        enableEncryption: true
# Sharding settings: balancer, config server replica set, and mongos routers.
# NOTE(review): nesting was reconstructed from a paste that had lost its
# indentation -- confirm against the chart's values schema.
sharding:
  enabled: true
  balancer:
    enabled: true
  configrs:
    size: ${config_count}
    nodeSelector:
      Role: "mongodb"
    affinity:
      antiAffinityTopologyKey: "kubernetes.io/hostname"
    volumeSpec:
      pvc:
        storageClassName: "${storage_class}"
        resources:
          requests:
            storage: "${config_volume_size}"
    # Config servers use port 27019; the shard replica sets use 27018.
    configuration: |
      net:
        port: 27019
      security:
        enableEncryption: true
  mongos:
    size: ${mongos_count}
    # mongos is exposed through a NodePort service.
    expose:
      enabled: true
      type: "NodePort"
    nodeSelector:
      Role: "mongodb"
    affinity:
      antiAffinityTopologyKey: "kubernetes.io/hostname"
    configuration: |
      net:
        port: 27017
# NOTE(review): placed at the top level as a chart-level TLS mode. The paste
# lost its indentation, so this may instead have been `net.tls.mode` inside
# the mongos `configuration` block that precedes it -- confirm and move if so.
tls:
  mode: "preferTLS"
# Value is supplied by the template renderer via `${encryption_key}`.
# NOTE(review): presumably the name of a Kubernetes Secret rather than raw key
# material -- confirm against the chart's values schema.
secrets:
  encryptionKey: "${encryption_key}"
# PMM client settings.
# NOTE(review): nesting was reconstructed from a paste that had lost its
# indentation -- confirm against the chart's values schema.
pmm:
  enabled: true
  serverHost: monitoring-service
  # NOTE(review): hard-coded to "production" while the replica set names are
  # built from `${stage}`; confirm this is intended for non-production stages.
  customClusterName: "mongodb-production"
  image:
    repository: "percona/pmm-client"
    tag: "${pmm_client_version}"
# Custom role `pmmMonitor` on the `admin` database; it is assigned to the
# `pmm` user in the `users` list.
# NOTE(review): nesting was reconstructed from a paste that had lost its
# indentation -- confirm against the chart's values schema.
roles:
  - role: "pmmMonitor"
    db: "admin"
    privileges:
      # Empty db and collection strings.
      # NOTE(review): presumably "any database / any collection" -- confirm.
      - resource:
          db: ""
          collection: ""
        actions:
          - dbHash
          - find
          - listIndexes
          - listCollections
          - collStats
          - dbStats
          - indexStats
      - resource:
          db: ""
          collection: "system.version"
        actions:
          - find
      - resource:
          db: ""
          collection: "system.profile"
        actions:
          - dbStats
          - collStats
          - indexStats
# Application and monitoring users. Every password is read from the secret
# named by `${secretname}`, each user under its own key.
# NOTE(review): nesting was reconstructed from a paste that had lost its
# indentation -- confirm against the chart's values schema.
users:
  - name: "admin"
    db: "admin"
    passwordSecretRef:
      name: "${secretname}"
      key: "admin_password"
    roles:
      - name: "root"
        db: "admin"
  - name: "${username}"
    db: "admin"
    passwordSecretRef:
      name: "${secretname}"
      key: "aimos_password"
    roles:
      - name: "clusterAdmin"
        db: "admin"
      - name: "readWriteAnyDatabase"
        db: "admin"
      - name: "dbAdminAnyDatabase"
        db: "admin"
  # Monitoring user; `pmmMonitor` is the custom role from the `roles` list.
  - name: "pmm"
    db: "admin"
    passwordSecretRef:
      name: "${secretname}"
      key: "pmm_password"
    roles:
      - name: "pmmMonitor"
        db: "admin"
      - name: "read"
        db: "local"
      - name: "clusterMonitor"
        db: "admin"
      - name: "directShardOperations"
        db: "admin"