How to back up Alert Templates and Alert Rules

Hi @steve.hoffman

I tried to implement the backup and restore script for PMM3, but after restoring the backup inside the PMM3 server container, the PostgreSQL QAN agent is not working. I have posted all the details in this forum link.

I am attaching the script for PMM3. Please help me fix the PostgreSQL QAN agent issue inside the PMM3 server container.
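
In case it helps with troubleshooting, these are the kinds of commands I am running inside the PMM3 server container to inspect the agents after a restore (the container name pmm-server and the log path are assumptions, adjust them for your setup):

docker exec -t pmm-server supervisorctl status
docker exec -t pmm-server pmm-admin list
docker exec -t pmm-server tail -n 50 /srv/logs/pmm-agent.log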

NOTE: The script below only works with PMM3.

#!/bin/bash

####
# Still NEED:
# Run backups in parallel for speed? Have to monitor for all 3 being done before moving on
# Do we need to back up pmm logs? I think not, but asking anyway
# Args and help (assuming running the script by itself will back up with all defaults, but do we allow overrides? e.g. storage location of the backup?)
#
####

######################################
# Set Defaults
######################################
backup_version="pmm_backup_$(date +%Y%m%d_%H%M%S)"
backup_root="/srv/backups"
backup_dir=${backup_root}/${backup_version}
clickhouse_database="pmm"
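# pmm-managed writes its version info to stderr; grab the "Version" line and strip the label and anything after a dash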
pmm_version=$(pmm-managed --version 2> >(grep -Em1 ^Version) | sed 's/.*: //' | awk -F- '{print $1}')
restore=0
upgrade=false
logfile="${backup_root}/pmmBackup.log"



set -Eeuo pipefail
trap cleanup SIGINT SIGTERM ERR EXIT


#######################################
# Show script usage info.
#######################################
usage() {
  cat <<EOF
Usage: $(basename "${BASH_SOURCE[0]}") [-h] [-v] [-i] [-s] [-r] [-u]
This tool is used to take online backups and can be used to restore a backup as well.
Available options:
-h, --help          Print this help and exit
-i, --interactive   Interactive mode will prompt user for values instead of assuming defaults ${RED}Not Yet Implemented${NOFORMAT}
-v, --verbose       Print script debug info
-r, --restore YYYYMMDD_HHMMSS
       Restore backup with date/time code of YYYYMMDD_HHMMSS to a PMM server of the same version (.tar.gz file must be in ${backup_root} directory)
-s, --storage       Choose a different storage location (default: ${backup_root})
-u, --upgrade       Allow restore to a newer PMM server than the backup was taken from (backup and restore version should be 5 or fewer versions apart)
EOF
  exit
}

#######################################
# Accept and parse script's params.
#######################################
parse_params() {
  while :; do
    case "${1-}" in
    -h | --help) usage ;;
    -v | --verbose) set -x ;;
#    -i | --interactive) interactive=1 ;;
    -s | --storage)
      storage="${2-}"
      backup_root="${storage}"
      backup_dir=${backup_root}/${backup_version}
      logfile="${backup_root}/pmmBackup.log"
      msg "${BLUE}Storage override${NOFORMAT} to: ${backup_root}"
      shift
      ;;
    -r | --restore)
      restore="${2-}"
      msg "${BLUE}Restoring${NOFORMAT} ${restore}"
      shift
      ;;
    -u | --upgrade)
      upgrade=true
      msg "${BLUE}Restore${NOFORMAT} to upgraded instance"
      ;;
    -?*) die "Unknown option: ${1}" ;;
    *) break ;;
    esac
    shift
  done

  args=("${@}")

  return 0
}

#######################################
# Clean up traps on interrupt, error, or exit.
#######################################
cleanup() {
  trap - SIGINT SIGTERM ERR EXIT
}

#######################################
# Defines colours for output messages.
#######################################
setup_colors() {
  if [[ -t 2 ]] && [[ -z "${NO_COLOR-}" ]] && [[ "${TERM-}" != "dumb" ]]; then
    NOFORMAT='\033[0m' RED='\033[0;31m' GREEN='\033[0;32m' ORANGE='\033[0;33m'
    BLUE='\033[0;34m' PURPLE='\033[0;35m' CYAN='\033[0;36m' YELLOW='\033[1;33m'
  else
    NOFORMAT='' RED='' GREEN='' ORANGE='' BLUE='' PURPLE='' CYAN='' YELLOW=''
  fi
}

#######################################
# Prints message to stderr with new line at the end.
#######################################
msg() {
  echo >&2 -e "${1-}"
}

#######################################
# Prints message and exit with code.
# Arguments:
#   message string;
#   exit code.
# Outputs:
#   writes message to stderr.
#######################################
die() {
  local message=${1}
  local code=${2-1} # default exit status 1
  msg "${message}"
  exit "${code}"
}

#######################################
# Check if Command exists
#######################################

check_command() {
  command -v "${@}" 1>/dev/null
}

#######################################
# Runs command as root with multiple fallback options.
#######################################
run_root() {
  sh='sh -c'
  if [ "$(id -un)" != 'root' ]; then
    if check_command sudo; then
      sh='sudo -E sh -c'
    elif check_command su; then
      sh='su -c'
    else
      die "${RED}ERROR: root rights needed to run \"${*}\" command ${NOFORMAT}"
    fi
  fi
  ${sh} "${@}" &>>"${logfile}"
}

######################################
# Verify and satisfy prerequisites
######################################
check_prereqs() {

        msg "${ORANGE}Checking${NOFORMAT} for/installing prerequisite software...an internet connection is requried or you must install missing softare manually"
        touch "${logfile}"
        # Does backup location exist and will we be able to write to it

        if [ ! -d "${backup_root}" ] ; then
        echo "Bkp dir: ${backup_root}"
                mkdir -p ${backup_root}
        elif [ ! -w "${backup_root}" ] ; then
                die "${RED}${backup_root} is not writable${NOFORMAT}, please look at permissions for $(id -un)"
        else
            echo "Bkp dir exist: ${backup_root}"
        fi

        #set version
        if [ "${restore}" == 0 ] ; then
                mkdir -p "${backup_dir}"
                echo "${pmm_version}" > "${backup_dir}"/pmm_version.txt

                if ! check_command /tmp/vmbackup-prod; then
                        get_vm
                fi

        elif [ "${restore}" != 0 ] ; then
                msg "  Extracting Backup Archive"
                restore_from_dir="${backup_root}/pmm_backup_${restore}"
                #msg "restore from dir: ${restore_from_dir}"
                restore_from_file="${backup_root}/pmm_backup_${restore}.tar.gz"
                #msg "restore from file: ${restore_from_file}"
                mkdir -p "${restore_from_dir}"
                tar zxfm "${restore_from_file}" -C "${restore_from_dir}"
                backup_pmm_version=$(cat "${restore_from_dir}"/pmm_version.txt)
                restore_to_pmm_version=${pmm_version}
                #msg "from ${backup_pmm_version} to ${restore_to_pmm_version}"
                check_version "${backup_pmm_version}" "${restore_to_pmm_version}"
                #msg "${version_check} for restore action"
                # case eq: versions equal, just go
                # case lt: backup from older version of pmm, needs upgrade flag also
                # case gt: backup from newer version of pmm, not implemented
                if [[ ${version_check} == "eq" ]]; then
                        #good to go, do nothing
                        msg "${GREEN}Version Match${NOFORMAT} (${version_check}), proceeding"
                elif [[ ${version_check} == "lt" ]]; then
                        if $upgrade ; then
                                msg "${GREEN}Proceeding${NOFORMAT} with restore to upgraded version of PMM"
                        else
                                die "${RED}WARNING${NOFORMAT}: You must also pass the upgrade flag to restore to a newer version of PMM"
                        fi
                elif [[ ${version_check} == "gt" ]] ; then
                        die "${RED}ERROR${NOFORMAT}: Downgrades are not supported, you can only restore to $backup_pmm_version"
                fi

                if ! check_command /tmp/vmrestore-prod; then
                        get_vm
                fi
        fi


### nice to haves ###
        #will the backup fit on the filesystem (need >2x the size of the /srv directory)
}

#####################################################
# Determine the installed VictoriaMetrics version and
# download the matching vmutils release (provides the
# vmbackup-prod / vmrestore-prod binaries)
#####################################################

get_vm() {
        cd /tmp
        vm_version=$(victoriametrics --version | cut -d '-' -f7 | sed 's/v//')
        file_name="vmutils-linux-amd64-v${vm_version}.tar.gz"
        msg "  Downloading https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v${vm_version}/${file_name}"
        if ! curl -s -L -O https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v"${vm_version}"/"${file_name}" &>> "${logfile}" ; then
                die "${RED}ERROR ${NOFORMAT}: Could not download needed component...check internet?"
        fi
        tar zxf "${file_name}"
}

#############################################
# Check to see if version backed up is same,
# older, newer than version being restored to
#############################################
check_version() {
        #reset version_check to nothing for reuse
        version_check=false
        #msg "  Comparing version ${1} to ${2}"
        if [ "${1}" == "${2}" ] ; then
                #versions match, proceed
                version_check="eq"
                return 0
        fi
        local IFS=.
        local i ver1=(${1}) ver2=(${2})
        # fill empty fields in ver1 with zeros
        for ((i=${#ver1[@]}; i<${#ver2[@]}; i++))
          do
            ver1[i]=0
          done

        for ((i=0; i<${#ver1[@]}; i++))
        do
            if [[ -z ${ver2[i]} ]]
            then
                # fill empty fields in ver2 with zeros
                ver2[i]=0
            fi
            if ((10#${ver1[i]} < 10#${ver2[i]}))
            then
                version_check="lt"
                return 0
            fi
            if ((10#${ver1[i]} > 10#${ver2[i]}))
            then
                version_check="gt"
                return 0
            fi
        done
        version_check=false
        return 0


        #       if [ "${backup_pmm_version}" != "${restore}_to_pmm_version" ] ; then
        #               die "Cannot restore backup from PMM version ${backup_pmm_version} to PMM version ${restore}_to_pmm_version, install ${backup_pmm_version} on this host and retry."
        #       fi
}
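# Example (illustrative version strings): check_version "2.41.0" "2.41.2" sets
# version_check="lt" (backup is older than the restore target), swapping the
# arguments gives "gt", and identical strings give "eq".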

######################################
# Perform Backup of PMM
######################################
perform_backup() {


        #setup env
        msg "${ORANGE}Creating${NOFORMAT} backup directory structure"
        mkdir -p "${backup_root}"/"${backup_version}"/{postgres,vm,clickhouse,folders}


        #pg backup
        msg "${ORANGE}Starting${NOFORMAT} PostgreSQL backup"
        pg_dump -c -C -U pmm-managed > "${backup_dir}"/postgres/backup.sql
        if [ -f /etc/grafana/grafana.ini ]; then
                pg_dump -c -C -U grafana > "${backup_dir}"/postgres/grafana.sql
        fi

        msg "${GREEN}Completed${NOFORMAT} PostgreSQL backup"

        #vm backup
        msg "${ORANGE}Starting${NOFORMAT} VictoriaMetrics backup"
        /tmp/vmbackup-prod --storageDataPath=/srv/victoriametrics/data -snapshot.createURL=http://localhost:9090/prometheus/snapshot/create -dst=fs://"${backup_dir}"/vm/ -loggerOutput=stdout
        msg "${GREEN}Completed${NOFORMAT} VictoriaMetrics backup"

        #clickhouse Backup

        msg "${ORANGE}Starting${NOFORMAT} Clickhouse backup"
        mapfile -t ch_array < <(/bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse --query "select name from system.tables where database = '${clickhouse_database}'")
        #get engine type
        clickhouse_engine=$(/bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse --query "select engine from system.databases where name='${clickhouse_database}'")
        for table in "${ch_array[@]}"
        do
                if [ "${table}" == "schema_migrations" ] ; then
                        msg "  Backing up ${table} table"
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="SHOW CREATE TABLE ${table}" --format="TabSeparatedRaw" > "${backup_dir}"/clickhouse/"${table}".sql
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="SELECT * from ${table}" --format CSV > "${backup_dir}"/clickhouse/"${table}".data
                else
                        msg "  Backing up ${table} table"
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="SHOW CREATE TABLE ${table}" --format="TabSeparatedRaw" > "${backup_dir}"/clickhouse/"${table}".sql
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --query "alter table ${clickhouse_database}.${table} freeze"
                        if [ ${clickhouse_engine} == "Ordinary" ] ; then
                                msg "${ORANGE}    INFO: ${NOFORMAT}Engine = ${clickhouse_engine}"
                                mv /srv/clickhouse/shadow "${backup_dir}"/clickhouse/"${backup_version}"
                        elif [ ${clickhouse_engine} == "Atomic" ] ; then
                                msg "${ORANGE}    INFO: ${NOFORMAT}Engine = ${clickhouse_engine}"
                                increment=$(cat /srv/clickhouse/shadow/increment.txt)
                                table_uuid=$(/bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse --query "select uuid from system.tables where database = '${clickhouse_database}' and name = '${table}'")
                                prefix=$(echo "${table_uuid}" | cut -c1-3)
                                mkdir -p "${backup_dir}"/clickhouse/"${backup_version}"/"${increment}"/data/"${clickhouse_database}"/"${table}"
                                mv /srv/clickhouse/shadow/increment.txt "${backup_dir}"/clickhouse/"${backup_version}"
                                mv /srv/clickhouse/shadow/"${increment}"/store/"${prefix}"/"${table_uuid}"/* "${backup_dir}"/clickhouse/"${backup_version}"/"${increment}"/data/"${clickhouse_database}"/"${table}"
                                rm -rf /srv/clickhouse/shadow/
                        fi
                fi
        done

        msg "${GREEN}Completed${NOFORMAT} Clickhouse backup"

        #support files backup
        msg "${ORANGE}Starting${NOFORMAT} configuration and supporting files backup"

        #cp -af /srv/grafana "${backup_dir}"/folders/
        #cp -af /srv/nginx "${backup_dir}"/folders/
        #cp -af /srv/prometheus "${backup_dir}"/folders/
        #cp -af /srv/pmm-distribution "${backup_dir}"/folders/

        msg "${GREEN}Completed${NOFORMAT} configuration and supporting files backup"

        msg "${ORANGE}Compressing${NOFORMAT} backup artifact"
        cpus=$(grep -c ^processor /proc/cpuinfo)
        [ "${cpus}" -eq 1 ] && use_cpus=${cpus} || use_cpus=$((cpus/2))
        #msg "limiting to ${use_cpus}"
        #tar -cf "$backup_root"/"$backup_version".tar.gz -C "$backup_dir" .
        #tar --use-compress-program="pigz -5 -p${use_cpus}" -cf ${backup_root}/${backup_version}.tar.gz -C ${backup_dir} .
        tar -C "${backup_dir}" -cf - . | nice pigz -p "${use_cpus}" > "${backup_root}/${backup_version}.tar.gz"
        msg "  Cleaning up"
        rm -rf "${backup_dir}"
        msg "\n${GREEN}SUCCESS${NOFORMAT}: Backup Complete"
}


######################################
# Perform Restore of PMM
######################################
perform_restore() {

        #stop pmm-managed locally to restore data
        msg "${ORANGE}Stopping${NOFORMAT} services to begin restore"
        supervisorctl stop grafana nginx pmm-agent pmm-managed qan-api2
        sleep 5
        msg "  Services stopped, restore starting"

        #pg restore
        msg "${ORANGE}Starting${NOFORMAT} PostgreSQL restore"
        psql -U postgres -c 'DROP DATABASE grafana;'
        psql -U postgres -c 'DROP DATABASE "pmm-managed";'
		
        psql -U postgres -f "${restore_from_dir}"/postgres/grafana.sql &>>"${logfile}"
        psql -U postgres -f "${restore_from_dir}"/postgres/backup.sql &>>"${logfile}"

        msg "${GREEN}Completed${NOFORMAT} PostgreSQL restore"

        sleep 60

        #vm restore
        msg "${ORANGE}Starting${NOFORMAT} VictoriaMetrics restore"
        supervisorctl stop victoriametrics
        sleep 1
        /tmp/vmrestore-prod -src=fs:///"${restore_from_dir}"/vm/ -storageDataPath=/srv/victoriametrics/data
        chown -R pmm:pmm /srv/victoriametrics/data
        supervisorctl start victoriametrics
        msg "${GREEN}Completed${NOFORMAT} VictoriaMetrics restore"

        sleep 60
		
        #clickhouse restore
        msg "${ORANGE}Starting${NOFORMAT} Clickhouse restore"
        #stop qan api
        supervisorctl stop qan-api2
        sleep 1
        #will need to loop through ${table}
        mapfile -t ch_array < <(ls "${restore_from_dir}"/clickhouse | grep '\.sql$' | sed "s/\.sql//")
        for table in "${ch_array[@]}"; do
                if [ "${table}" == "schema_migrations" ] ; then
                        # schema_migrations only needs SQL replay, other tables need data copies and reattaching files
                        msg "  Restoring ${table} table"
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="drop table if exists ${table}"
                        cat "${restore_from_dir}"/clickhouse/"${table}".sql | /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}"
                        # this can be improved as all the data to form this statement is in ${table}.sql and will
                        # be a bit more future-proofed if table structure changes
                        cat "${restore_from_dir}"/clickhouse/"${table}".data | /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query "INSERT INTO ${clickhouse_database}.${table} SELECT version, dirty, sequence FROM input('version UInt32, dirty UInt8, sequence UInt64') FORMAT CSV"
                        #check that num rows in == num rows inserted
                        rows_in=$(/bin/wc -l "${restore_from_dir}"/clickhouse/"${table}".data | cut -d " " -f1)
                        rows_inserted=$(clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="select count(*) from ${table}")
                        if [ "${rows_in}" == "${rows_inserted}" ] ; then
                                msg "  Successfully restored ${table}"
                        else
                                msg "  There was a problem restoring ${table}, ${rows_in} rows backed up but ${rows_inserted} restored"
                        fi
                else
                        msg "  Restoring ${table} table"
                        /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="drop table if exists ${table}"
                        cat "${restore_from_dir}"/clickhouse/"${table}".sql | /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}"
                        [ ! -d "/srv/clickhouse/data/${clickhouse_database}/${table}/detached" ] && mkdir -p /srv/clickhouse/data/"${clickhouse_database}"/"${table}"/detached/
                        msg "  Copying files"
                        folder=$(cat "${restore_from_dir}"/clickhouse/pmm_backup_"${restore}"/increment.txt)
                        #if the source and destination folders are on the same physical drive, we can go MUCH faster by creating hard-links
                        #if not, we have to copy the files the slow way
                        if [ "$(stat -c %d "${backup_root}")" = "$(stat -c %d "/srv/clickhouse")" ]; then
                                cp -rlf "${restore_from_dir}"/clickhouse/pmm_backup_"${restore}"/"$folder"/data/"${clickhouse_database}"/"${table}"/* /srv/clickhouse/data/"${clickhouse_database}"/"${table}"/detached/
                        else
                                cp -rf "${restore_from_dir}"/clickhouse/pmm_backup_"${restore}"/"$folder"/data/"${clickhouse_database}"/"${table}"/* /srv/clickhouse/data/"${clickhouse_database}"/"${table}"/detached/
                        fi
                        msg "  Gathering partitions"
                        [[ ${UID} -ne 0 ]] && chmod -R o+rx /srv/clickhouse;
                        mapfile -t partitions < <(ls /srv/clickhouse/data/"${clickhouse_database}"/"${table}"/detached/ | cut -d "_" -f1 | uniq)
                        [[ ${UID} -ne 0 ]] && chmod -R o-rx /srv/clickhouse;
                        for partition in "${partitions[@]}"; do
                                msg "    Loading partition ${partition}"
                                /bin/clickhouse-client --host=127.0.0.1 --user=default --password=clickhouse  --database "${clickhouse_database}" --query="alter table ${table} attach partition ${partition}"
                        done
                fi
        done


        msg "${GREEN}Completed${NOFORMAT} Clickhouse restore"
        
        sleep 60

        msg "  Restarting services"
        if ${upgrade} ; then
                supervisorctl reload
                sleep 10
                supervisorctl reload
                sleep 1
        else
                #supervisorctl restart grafana nginx pmm-managed qan-api2
                supervisorctl stop grafana nginx pmm-agent pmm-managed qan-api2
                sleep 5
                supervisorctl start grafana nginx pmm-agent pmm-managed qan-api2
                sleep 5
                supervisorctl restart grafana nginx pmm-agent pmm-managed qan-api2
                sleep 5
                supervisorctl stop grafana nginx pmm-agent pmm-managed qan-api2
                sleep 5
                supervisorctl start grafana nginx pmm-agent pmm-managed qan-api2
                sleep 5
        fi
        msg "${GREEN}Completed${NOFORMAT} configuration and file restore"

        # cleanup
        rm -rf "${restore_from_dir}"
}

main() {
        check_prereqs
        if [ "${restore}" != 0 ]; then
                #do restore stuff here
                msg "  Restoring backup pmm_backup_${restore}.tar.gz"
                perform_restore
        else
                perform_backup
        fi

}

setup_colors
parse_params "${@}"
main
die "Thank you for using the PMM Backup Tool!" 0