How do I optimize PITR for a sensitive, high-traffic MongoDB replica set?

Hi folks. I inherited a > 500 GiB replica set with transatlantic traffic between nodes, using WiredTiger for storage. There’s one primary (EU), one secondary (US West), and one arbiter (US West). pbm (Percona Backup for MongoDB) is backing up to an S3-compatible API (US West). The oplog spans are large enough to make pbm-agent compete with one high-traffic writer when PITR is on. I attached a pretty-printed copy of two slow query warnings that came after enabling PITR.

The biggest problem (outside of transatlantic traffic to one secondary) is that this is being used as a production system, and there’s little appetite for building a parallel setup on our VPC due to cost. Changing the oplog span is on the table, but the traffic is high enough that even 10 minutes might capture gigabytes.

If enabling PITR under these conditions risks service interruption, is there any sane way to lower pbm-agent’s impact without touching the mongods or disabling PITR? I’m not good enough with these log entries to point to the most painful bottleneck, so I would be grateful for additional leads.

{
  "t": {
    "$date": "2022-10-06T04:43:49.407+00:00"
  },
  "s": "I",
  "c": "COMMAND",
  "id": 51803,
  "ctx": "conn24849",
  "msg": "Slow query",
  "attr": {
    "type": "command",
    "ns": "admin.$cmd",
    "appName": "pbm-agent",
    "command": {
      "update": "pbmAgents",
      "ordered": true,
      "writeConcern": {
        "w": "majority"
      },
      "lsid": {
        "id": {
          "$uuid": "REDACTED"
        }
      },
      "txnNumber": 2839,
      "$clusterTime": {
        "clusterTime": {
          "$timestamp": {
            "t": 1665031429,
            "i": 161
          }
        },
        "signature": {
          "hash": {
            "$binary": {
              "base64": "REDACTED",
              "subType": "0"
            }
          },
          "keyId": 0
        }
      },
      "$db": "admin"
    },
    "numYields": 0,
    "reslen": 245,
    "locks": {
      "ParallelBatchWriterMode": {
        "acquireCount": {
          "r": 2
        }
      },
      "ReplicationStateTransition": {
        "acquireCount": {
          "w": 3
        }
      },
      "Global": {
        "acquireCount": {
          "w": 2
        }
      },
      "Database": {
        "acquireCount": {
          "w": 2
        }
      },
      "Collection": {
        "acquireCount": {
          "w": 2
        }
      },
      "Mutex": {
        "acquireCount": {
          "r": 2
        }
      }
    },
    "flowControl": {
      "acquireCount": 1,
      "timeAcquiringMicros": 1
    },
    "readConcern": {
      "level": "local",
      "provenance": "implicitDefault"
    },
    "writeConcern": {
      "w": "majority",
      "wtimeout": 0,
      "provenance": "clientSupplied"
    },
    "storage": {},
    "remote": "REDACTED",
    "protocol": "op_msg",
    "durationMillis": 226
  }
}
{
  "t": {
    "$date": "2022-10-06T04:43:50.707+00:00"
  },
  "s": "I",
  "c": "COMMAND",
  "id": 51803,
  "ctx": "conn24849",
  "msg": "Slow query",
  "attr": {
    "type": "command",
    "ns": "admin.$cmd",
    "appName": "pbm-agent",
    "command": {
      "update": "pbmLock",
      "ordered": true,
      "writeConcern": {
        "w": "majority"
      },
      "lsid": {
        "id": {
          "$uuid": "REDACTED"
        }
      },
      "txnNumber": 2840,
      "$clusterTime": {
        "clusterTime": {
          "$timestamp": {
            "t": 1665031430,
            "i": 594
          }
        },
        "signature": {
          "hash": {
            "$binary": {
              "base64": "REDACTED",
              "subType": "0"
            }
          },
          "keyId": 0
        }
      },
      "$db": "admin"
    },
    "numYields": 0,
    "reslen": 245,
    "locks": {
      "ParallelBatchWriterMode": {
        "acquireCount": {
          "r": 2
        }
      },
      "ReplicationStateTransition": {
        "acquireCount": {
          "w": 3
        }
      },
      "Global": {
        "acquireCount": {
          "w": 2
        }
      },
      "Database": {
        "acquireCount": {
          "w": 2
        }
      },
      "Collection": {
        "acquireCount": {
          "w": 2
        }
      },
      "Mutex": {
        "acquireCount": {
          "r": 2
        }
      }
    },
    "flowControl": {
      "acquireCount": 1,
      "timeAcquiringMicros": 1
    },
    "readConcern": {
      "level": "local",
      "provenance": "implicitDefault"
    },
    "writeConcern": {
      "w": "majority",
      "wtimeout": 0,
      "provenance": "clientSupplied"
    },
    "storage": {},
    "remote": "REDACTED",
    "protocol": "op_msg",
    "durationMillis": 222
  }
}
1 Like