We have a MySQL master-master replication setup with mysql-mmm on Ubuntu 20.04. We recently upgraded from CentOS 6 and MySQL 5.6 and kept most of the settings.
Both machines have 48 GB of memory and 10 CPUs, and each contains 250 GB of data in more than 100,000 tables spread over more than 400 databases.
Since the upgrade, however, we have been experiencing a couple of freezes per day, on both the active and the passive server.
tcpdump shows that the queries (from the mysql-mmm control VPS) are still arriving and being answered, but the responses take much longer than usual, and then the 10-second timeout kicks in.
The slow log starts with a lot of “SHOW SLAVE STATUS” entries, followed by INSERTs and a few DELETE calls, but not a single ‘normal’ SELECT query.
How can we debug this? Any help is greatly appreciated.
MySQL ini file:
[mysqld]
auto_increment_increment = 3
auto_increment_offset = 2
basedir = /usr
bind-address = 0.0.0.0
binlog_format = ROW
binlog_gtid_simple_recovery = false
datadir = /var/lib/mysql
expire_logs_days = 10
innodb_buffer_pool_size = 32G
innodb_file_per_table
innodb_log_file_size = 8G
innodb_read_io_threads = 8
innodb_stats_on_metadata = 0
innodb_write_io_threads = 8
join_buffer_size = 2M
key_buffer_size = 32M
log-error = /var/log/mysql/error.log
log_bin = /var/log/mysql/mysql-bin.log
log_slave_updates
master_info_repository = table
max_allowed_packet = 64M
max_binlog_size = 2G
max_connections = 1000
open_files_limit = 5000
pid-file = /var/run/mysqld/mysqld.pid
port = 3306
query_cache_size = 0
query_cache_type = 0
read_buffer_size = 1M
read_rnd_buffer_size = 1M
relay-log = /var/log/mysql/mysql-relay-bin.log
relay_log_info_repository = table
server-id = 101
skip-external-locking
skip-name-resolve
slave_net_timeout = 3600
slow_launch_time = 2
slow_query_log
socket = /var/run/mysqld/mysqld.sock
sort_buffer_size = 1M
sql_mode = ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
ssl = false
ssl-ca = /etc/mysql/cacert.pem
ssl-cert = /etc/mysql/server-cert.pem
ssl-key = /etc/mysql/server-key.pem
table_open_cache = 1995
table_open_cache_instances = 6
thread_cache_size = 64
thread_stack = 256K
tmpdir = /tmp
user = mysql
wait_timeout = 360
slow log:
# Time: 2021-01-13T20:17:16.178603Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33069778 # Query_time: 113.954901 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.178651Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33070537 # Query_time: 105.950972 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.178754Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33070555 # Query_time: 105.389985 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.178895Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33071147 # Query_time: 97.946772 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179039Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33069831 # Query_time: 113.394795 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179071Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33071896 # Query_time: 89.943642 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179229Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33072722 # Query_time: 81.936055 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179250Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33071925 # Query_time: 89.382786 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179324Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33071181 # Query_time: 97.386881 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 
2021-01-13T20:17:16.179366Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33072771 # Query_time: 81.378021 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS; # Time: 2021-01-13T20:17:16.179433Z # User@Host: mmm_monitor[mmm_monitor] @ [10.0.0.2] Id: 33073331 # Query_time: 73.928755 Lock_time: 0.000000 Rows_sent: 0 Rows_examined: 0 SET timestamp=1610569036; SHOW SLAVE STATUS;
error file:
2021-01-13T20:17:16.182671Z 33075782 [Note] Got an error reading communication packets
2021-01-13T20:17:16.182910Z 33075785 [Note] Got an error reading communication packets
2021-01-13T20:17:16.182946Z 33075788 [Note] Got an error reading communication packets
2021-01-13T20:17:16.183181Z 33075793 [Note] Got an error reading communication packets
2021-01-13T20:17:16.184134Z 33075805 [Note] Got an error reading communication packets
2021-01-13T20:17:16.184501Z 33075812 [Note] Got an error reading communication packets
2021-01-13T20:17:16.184793Z 33075817 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185001Z 33075820 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185073Z 33075822 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185194Z 33075824 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185269Z 33075823 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185639Z 33075832 [Note] Got an error reading communication packets
2021-01-13T20:17:16.185684Z 33075835 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186003Z 33075841 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186025Z 33075842 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186073Z 33075843 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186135Z 33075844 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186224Z 33075845 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186296Z 33075846 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186412Z 33075847 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186460Z 33075848 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186583Z 33075849 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186631Z 33075850 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186717Z 33075851 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186805Z 33075852 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186834Z 33075853 [Note] Got an error reading communication packets
2021-01-13T20:17:16.186995Z 33075854 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187008Z 33075855 [Note] Got an error reading communication packets
2021-01-13T20:17:16.203678Z 33075753 [Note] Aborted connection 33075753 to db: ‘unconnected’ user: ‘zabbix’ host: ‘127.0.0.1’ (Got an error writing communication packets)
2021-01-13T20:17:16.187373Z 33075857 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187477Z 33075858 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187589Z 33075860 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187554Z 33075859 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187613Z 33075861 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187787Z 33075862 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187835Z 33075863 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187994Z 33075864 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188012Z 33075865 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188055Z 33075866 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188150Z 33075867 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188186Z 33075868 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188369Z 33075869 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188387Z 33075870 [Note] Got an error reading communication packets
2021-01-13T20:17:16.188402Z 33075871 [Note] Got an error reading communication packets
2021-01-13T20:17:16.187198Z 33075856 [Note] Got an error reading communication packets
2021-01-13T20:17:26.186038Z 33075791 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.186468Z 33075794 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.191292Z 33075755 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.191335Z 33075829 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.191391Z 33075757 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.191489Z 33075756 [Note] Got timeout reading communication packets
2021-01-13T20:17:26.191530Z 33075761 [Note] Got timeout reading communication packets