The former primary pod did not start as a replica in a few scenarios

In a few scenarios, either during network isolation or when its worker node was unavailable, the former primary pod did not start as a replica.

Below are the steps performed to start the former primary pod as a replica; throughout, the pod remained stuck at readiness 4/5:

  • Noticed in the replica pod logs that pg_rewind had started.
  • Replica recovery was blocked by a stale postmaster.pid file left over from the sudden node crash. Logged in to the replica pod and killed the postmaster.pid process, but there was no progress.
  • Tried deleting the replica pod, but it got stuck at the same point again.
  • Deleted the pod and then immediately deleted the PVC; the replica pod then bootstrapped from the primary and its readiness status became 5/5.

This behavior occurred in a couple of scenarios. Is this expected behavior, or is something in our configuration incorrect? Below is the error from the replica pod logs; please advise.

/tmp/postgres:5432 - rejecting connections
2026-05-18 13:58:14,219 INFO: Lock owner: pg-cluster-ha-pg-db-pgsa-hql8-0; I am pg-cluster-ha-pg-db-pgsa-jmhg-0
2026-05-18 13:58:14,219 INFO: Still starting up as a standby.
2026-05-18 13:58:14,220 INFO: Lock owner: pg-cluster-ha-pg-db-pgsa-hql8-0; I am pg-cluster-ha-pg-db-pgsa-jmhg-0
2026-05-18 13:58:14,220 INFO: establishing a new patroni heartbeat connection to postgres
2026-05-18 13:58:15,043 INFO: establishing a new patroni heartbeat connection to postgres
2026-05-18 13:58:15,044 WARNING: Retry got exception: connection problems
2026-05-18 13:58:15,045 WARNING: Failed to determine PostgreSQL state from the connection, falling back to cached role
2026-05-18 13:58:15,045 INFO: no action. I am (pg-cluster-ha-pg-db-pgsa-jmhg-0), a secondary, and following a leader (pg-cluster-ha-pg-db-pgsa-hql8-0)
2026-05-18 13:58:17,027 INFO: establishing a new patroni heartbeat connection to postgres
2026-05-18 13:58:18,500 INFO: establishing a new patroni heartbeat connection to postgres
2026-05-18 13:58:18,502 WARNING: Retry got exception: connection problems

Below is the values.yaml used to set up the primary and replica pods:

[root@bastion-node pg-db]# cat values.yaml

# Default values for pg-cluster.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

annotations:

test-annotation: value

finalizers:

# Set this if you want the operator to delete the PVCs on cluster deletion

- percona.com/delete-pvc

# Set this if you want the operator to delete the SSL objects on cluster deletion

- percona.com/delete-ssl

# Set this if you want the operator to delete the backups on cluster deletion

- percona.com/delete-backups

crVersion: 2.8.2

example-annotation: value

labels:

example-label: value

service:

# Valid types are NodePort or LoadBalancer. Defaults to NodePort.

type: LoadBalancer

# If the PostgresCluster has to be deployed in an OpenShift environment, set openshift: true.

openshift: true

repository: registry.connect.redhat.com/percona/percona-postgresql-operator-containers
image: registry.connect.redhat.com/percona/percona-postgresql-operator-containers:2.8.2-ppg-16-postgres
imagePullPolicy: Always
imagePullSecrets:

  • name: rh-catalog
    postgresVersion: 16

port: 5432

pause: false
unmanaged: false
standby:
enabled: false

host: ""

port: ""

repoName: repo1

customRootCATLSSecret:

name: cluster1-ca-cert

items:

- key: "tls.crt"

path: "root.crt"

- key: "tls.key"

path: "root.key"

customTLSSecret:
name: ""
customReplicationTLSSecret:
name: ""

volumes:

instances:

  • name: pgsa
    replicas: 2

    # Expose the primary via a ClusterIP service for streaming replication

expose:

type: ClusterIP

dataVolumeClaimSpec:
storageClassName: ocs-storagecluster-ceph-rbd
accessModes:

  • ReadWriteOnce
    resources:
    requests:
    storage: 5Gi

affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:

  • matchExpressions:
  • key: database
    operator: In # The "equals" logic
    values:
  • postgres # The label value on your node
podAntiAffinity:
  requiredDuringSchedulingIgnoredDuringExecution:
  - labelSelector:
      matchLabels:
        postgres-operator.crunchydata.com/data: postgres
    topologyKey: kubernetes.io/hostname

patroni:
dynamicConfiguration:
dcs:
maximum_lag_on_failover: 104857600
loop_wait: 10
retry_timeout: 10

proxy:
pgBouncer:
replicas: 0
image: docker.io/percona/percona-pgbouncer:1.25.0-1

affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
    - weight: 1
      podAffinityTerm:
        labelSelector:
          matchLabels:
            postgres-operator.crunchydata.com/role: pgbouncer
        topologyKey: kubernetes.io/hostname

backups:
enabled: true
trackLatestRestorableTime: true

pgbackrest:
image: registry.connect.redhat.com/percona/percona-postgresql-operator-containers:2.8.2-ppg-16-pgbackrest
env:
envFrom:

repos:
  - name: repo1
    schedules:
      full: "12 1 * * *"
      differential: "01 11 * * 1-6"
    volume:
      volumeClaimSpec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 5Gi
global:
  repo1-retention-full: "2"
  repo1-retention-diff: "7"
  repo1-retention-full-type: time
  repo1-cipher-type: none

repoHost:
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          podAffinityTerm:
            labelSelector:
              matchLabels:
                postgres-operator.crunchydata.com/data: pgbackrest
            topologyKey: kubernetes.io/hostname

backupaffinity:
enabled: true
nodeSelectorTerms:
key: database
value: postgres

pmm:
enabled: true
image:
repository: docker.io/percona/pmm-client
tag: 3.4.1

imagePullPolicy: IfNotPresent

secret: cluster1-pmm-secret

serverHost: "pmm-server.pmm.svc.cluster.local"
secret: pmm-secret
resources:
requests:
memory: 256Mi
cpu: 500m

secrets:
name:

# replication user password

primaryuser:

# superuser password

postgres: postgres

# pgbouncer user password

pgbouncer:

# pguser user password

pguser:
[root@bastion-node pg-db]#