Files
Jonathan Miller 6de15810f4 feat: add server auto-heal script with split backup management
- Boot check with safe-point detection for unclean restarts
- Split system (daily) and content (2h) backup schedules
- 4-phase auto-heal: filesystem repair, config restore, service restart, health verify
- Self-installing: creates cron jobs, systemd shutdown hook, and config
- Configurable via /etc/moko/autoheal.conf

Authored-by: Moko Consulting
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 20:52:27 -05:00

679 lines
24 KiB
Bash

#!/usr/bin/env bash
# server-autoheal.sh - Auto-heal on restart + split backup management
#
# Copyright (C) 2026 Moko Consulting <hello@mokoconsulting.tech>
# SPDX-License-Identifier: GPL-3.0-or-later
#
# DEFGROUP: MokoStandards.Automation.ServerAutoheal
# INGROUP: MokoStandards.Automation
# REPO: https://git.mokoconsulting.tech/MokoConsulting/moko-platform
# PATH: /automation/server-autoheal.sh
# BRIEF: Server auto-heal on unclean restart + split system/content backups
#
# Usage:
# server-autoheal.sh <command> [options]
#
# Commands:
# boot-check Run at boot — auto-heals if no safe point exists
# set-safepoint Mark current state as safe (call before planned shutdown)
# backup-system Run a system backup (configs, packages, services)
# backup-content Run a content backup (site files, databases, uploads)
# cleanup Prune expired backups per retention policy
# status Show safe point and backup status
#
# Scheduling (cron):
# @reboot server-autoheal.sh boot-check
# 0 3 * * * server-autoheal.sh backup-system (daily at 3am)
# 0 */2 * * * server-autoheal.sh backup-content (every 2 hours)
# 30 */2 * * * server-autoheal.sh cleanup (30 min after content backup)
set -euo pipefail
# ──────────────────────────────────────────────
# Configuration — override via /etc/moko/autoheal.conf
# ──────────────────────────────────────────────
CONF_FILE="/etc/moko/autoheal.conf"
[[ -f "$CONF_FILE" ]] && source "$CONF_FILE"
BACKUP_ROOT="${BACKUP_ROOT:-/var/backups/moko}"
SAFEPOINT_FILE="${SAFEPOINT_FILE:-/var/run/moko/safepoint}"
LOG_FILE="${LOG_FILE:-/var/log/moko/autoheal.log}"
LOCK_DIR="${LOCK_DIR:-/var/run/moko}"
# System backup: configs, package lists, service state, cron
SYSTEM_BACKUP_DIR="${BACKUP_ROOT}/system"
SYSTEM_BACKUP_RETAIN="${SYSTEM_BACKUP_RETAIN:-7}" # keep 7 daily system backups
# Content backup: web roots, databases, uploads
CONTENT_BACKUP_DIR="${BACKUP_ROOT}/content"
CONTENT_BACKUP_RETAIN_HOURS="${CONTENT_BACKUP_RETAIN_HOURS:-24}" # 1 day of content backups
# Paths to back up — override these in /etc/moko/autoheal.conf
SYSTEM_PATHS="${SYSTEM_PATHS:-/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system}"
CONTENT_PATHS="${CONTENT_PATHS:-/var/www}"
DB_NAMES="${DB_NAMES:-}" # space-separated list, empty = auto-detect all
# ──────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────
log() {
local level="$1"; shift
local ts
ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
local msg="[$ts] [$level] $*"
echo "$msg" | tee -a "$LOG_FILE" >&2
}
ensure_dirs() {
mkdir -p "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" \
"$LOCK_DIR" "$(dirname "$LOG_FILE")"
}
acquire_lock() {
local lockfile="${LOCK_DIR}/autoheal-${1}.lock"
if [[ -f "$lockfile" ]]; then
local pid
pid=$(<"$lockfile")
if kill -0 "$pid" 2>/dev/null; then
log WARN "Another $1 operation is running (PID $pid), skipping"
exit 0
fi
rm -f "$lockfile"
fi
echo $$ > "$lockfile"
trap "rm -f '$lockfile'" EXIT
}
timestamp() {
date -u '+%Y%m%d_%H%M%S'
}
# ──────────────────────────────────────────────
# Safe-point management
# ──────────────────────────────────────────────
cmd_set_safepoint() {
ensure_dirs
local ts
ts=$(timestamp)
cat > "$SAFEPOINT_FILE" <<EOF
timestamp=$ts
hostname=$(hostname)
kernel=$(uname -r)
uptime=$(uptime -s 2>/dev/null || echo "unknown")
set_by=${SUDO_USER:-$(whoami)}
EOF
log INFO "Safe point set at $ts by ${SUDO_USER:-$(whoami)}"
}
cmd_clear_safepoint() {
rm -f "$SAFEPOINT_FILE"
log INFO "Safe point cleared"
}
has_safepoint() {
[[ -f "$SAFEPOINT_FILE" ]]
}
# ──────────────────────────────────────────────
# System backup (daily)
# ──────────────────────────────────────────────
cmd_backup_system() {
ensure_dirs
acquire_lock "system-backup"
local ts
ts=$(timestamp)
local archive="${SYSTEM_BACKUP_DIR}/system_${ts}.tar.gz"
local manifest="${SYSTEM_BACKUP_DIR}/system_${ts}.manifest"
log INFO "Starting system backup → $archive"
# Collect existing paths only
local existing_paths=()
for p in $SYSTEM_PATHS; do
[[ -e "$p" ]] && existing_paths+=("$p")
done
if [[ ${#existing_paths[@]} -eq 0 ]]; then
log WARN "No system paths found to back up"
return 1
fi
# Archive configs and system files
tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || true
# Capture package list and service state as manifest
{
echo "=== PACKAGES ==="
if command -v dpkg &>/dev/null; then
dpkg --get-selections
elif command -v rpm &>/dev/null; then
rpm -qa --qf '%{NAME}\t%{VERSION}\n'
fi
echo ""
echo "=== ENABLED SERVICES ==="
if command -v systemctl &>/dev/null; then
systemctl list-unit-files --state=enabled --no-pager 2>/dev/null || true
fi
echo ""
echo "=== CRONTABS ==="
for user_home in /var/spool/cron/crontabs/*; do
[[ -f "$user_home" ]] && echo "--- $(basename "$user_home") ---" && cat "$user_home"
done 2>/dev/null || true
} > "$manifest"
local size
size=$(du -sh "$archive" 2>/dev/null | cut -f1)
log INFO "System backup complete: $archive ($size)"
# Prune old system backups (keep $SYSTEM_BACKUP_RETAIN)
local count
count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' | wc -l)
if [[ "$count" -gt "$SYSTEM_BACKUP_RETAIN" ]]; then
local to_remove=$((count - SYSTEM_BACKUP_RETAIN))
find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
| sort | head -n "$to_remove" | awk '{print $2}' \
| while read -r f; do
rm -f "$f" "${f%.tar.gz}.manifest"
log INFO "Pruned old system backup: $f"
done
fi
}
# ──────────────────────────────────────────────
# Content backup (every 2 hours)
# ──────────────────────────────────────────────
cmd_backup_content() {
ensure_dirs
acquire_lock "content-backup"
local ts
ts=$(timestamp)
local archive="${CONTENT_BACKUP_DIR}/content_${ts}.tar.gz"
local db_dump="${CONTENT_BACKUP_DIR}/content_${ts}.sql.gz"
log INFO "Starting content backup → $archive"
# Back up web content / uploads
local existing_paths=()
for p in $CONTENT_PATHS; do
[[ -e "$p" ]] && existing_paths+=("$p")
done
if [[ ${#existing_paths[@]} -gt 0 ]]; then
tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || true
local size
size=$(du -sh "$archive" 2>/dev/null | cut -f1)
log INFO "Content files archived: $archive ($size)"
else
log WARN "No content paths found to back up"
fi
# Database dump
if command -v mysqldump &>/dev/null || command -v mariadb-dump &>/dev/null; then
local dump_cmd="mysqldump"
command -v mariadb-dump &>/dev/null && dump_cmd="mariadb-dump"
local databases=()
if [[ -n "$DB_NAMES" ]]; then
read -ra databases <<< "$DB_NAMES"
else
# Auto-detect: dump all databases except system ones
databases=($(${dump_cmd%dump} -N -e \
"SELECT schema_name FROM information_schema.schemata
WHERE schema_name NOT IN ('information_schema','performance_schema','mysql','sys')" \
2>/dev/null | tr '\n' ' ')) || true
fi
if [[ ${#databases[@]} -gt 0 ]]; then
$dump_cmd --single-transaction --routines --triggers \
--databases "${databases[@]}" 2>/dev/null \
| gzip > "$db_dump"
local db_size
db_size=$(du -sh "$db_dump" 2>/dev/null | cut -f1)
log INFO "Database dump complete: $db_dump ($db_size)"
else
log WARN "No databases found to dump"
fi
fi
}
# ──────────────────────────────────────────────
# Cleanup — prune content backups older than retention
# ──────────────────────────────────────────────
cmd_cleanup() {
ensure_dirs
local before_count after_count
# Content: keep only last 24 hours (1 day)
before_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f \
-mmin +$((CONTENT_BACKUP_RETAIN_HOURS * 60)) -delete 2>/dev/null || true
after_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
local removed=$((before_count - after_count))
[[ "$removed" -gt 0 ]] && log INFO "Pruned $removed content backup(s) older than ${CONTENT_BACKUP_RETAIN_HOURS}h"
# System: keep N most recent (handled in backup-system, but double-check here)
before_count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*' -type f | wc -l)
local max_system_files=$((SYSTEM_BACKUP_RETAIN * 2)) # .tar.gz + .manifest
if [[ "$before_count" -gt "$max_system_files" ]]; then
local excess=$((before_count - max_system_files))
find "$SYSTEM_BACKUP_DIR" -name 'system_*' -type f -printf '%T+ %p\n' \
| sort | head -n "$excess" | awk '{print $2}' \
| xargs -r rm -f
log INFO "Pruned excess system backups"
fi
log INFO "Cleanup complete"
}
# ──────────────────────────────────────────────
# Boot check — the auto-heal entry point
# ──────────────────────────────────────────────
cmd_boot_check() {
ensure_dirs
acquire_lock "boot-check"
log INFO "=== Boot check started ==="
log INFO "Hostname: $(hostname), Kernel: $(uname -r)"
if has_safepoint; then
log INFO "Safe point found — server was shut down cleanly"
log INFO "Clearing safe point for next cycle"
cmd_clear_safepoint
log INFO "=== Boot check passed (clean restart) ==="
return 0
fi
log WARN "NO safe point found — server restarted without clean shutdown"
log WARN "Initiating auto-heal sequence..."
auto_heal
local rc=$?
# Set safe point after successful heal
if [[ $rc -eq 0 ]]; then
cmd_set_safepoint
log INFO "=== Boot check complete (healed successfully) ==="
else
log ERROR "=== Boot check FAILED — manual intervention required ==="
fi
return $rc
}
# ──────────────────────────────────────────────
# Auto-heal strategy
#
# TODO: This is the core decision point. Implement the recovery
# steps that match your server's architecture. See guidance below.
#
# Trade-offs to consider:
# - Restore-from-backup: safest, but content may be up to 2h stale
# - Service-restart-only: faster, keeps current data, but won't fix
# corrupted configs or broken filesystem state
# - Hybrid: restart services first, verify health, only restore if
# health checks fail — best of both worlds but more complex
#
# The function receives no arguments. Use the latest system + content
# backups to restore if needed. Return 0 on success, 1 on failure.
# ──────────────────────────────────────────────
auto_heal() {
log INFO "Phase 1: Verify and repair filesystem"
# Check for common post-crash issues
repair_filesystem
log INFO "Phase 2: Restore system configuration if corrupted"
restore_system_if_needed
log INFO "Phase 3: Restart core services"
restart_services
log INFO "Phase 4: Verify health"
if ! verify_health; then
log WARN "Health check failed after service restart — restoring from backup"
restore_from_backup
restart_services
if ! verify_health; then
log ERROR "Health check still failing after restore — giving up"
return 1
fi
fi
log INFO "Auto-heal completed successfully"
return 0
}
# ──────────────────────────────────────────────
# Heal sub-steps
# ──────────────────────────────────────────────
repair_filesystem() {
# Fix common post-crash filesystem issues
# Clear stale PID/lock/socket files that prevent services from starting
local stale_files=(
/var/run/nginx.pid
/var/run/mysqld/mysqld.pid
/var/run/php-fpm.pid
/var/lib/mysql/*.pid
)
for f in "${stale_files[@]}"; do
for expanded in $f; do
if [[ -f "$expanded" ]]; then
local pid
pid=$(<"$expanded") 2>/dev/null || true
if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
rm -f "$expanded"
log INFO "Removed stale PID file: $expanded"
fi
fi
done
done
# Fix permissions on critical dirs that may get mangled
[[ -d /var/run/mysqld ]] && chown mysql:mysql /var/run/mysqld 2>/dev/null || true
[[ -d /var/lib/php/sessions ]] && chmod 1733 /var/lib/php/sessions 2>/dev/null || true
# Repair tmp/cache dirs
for d in /tmp /var/tmp; do
[[ -d "$d" ]] && chmod 1777 "$d" 2>/dev/null || true
done
}
restore_system_if_needed() {
# Find latest system backup
local latest_system
latest_system=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1 | awk '{print $2}')
if [[ -z "$latest_system" ]]; then
log WARN "No system backup available to verify against"
return 0
fi
# Check if critical configs exist and are non-empty
local needs_restore=false
local critical_configs=("/etc/nginx/nginx.conf" "/etc/php" "/etc/mysql")
for cfg in "${critical_configs[@]}"; do
if [[ -e "$cfg" ]]; then
# Config exists — check if it's a file and non-empty, or a directory
if [[ -f "$cfg" && ! -s "$cfg" ]]; then
log WARN "Critical config is empty: $cfg"
needs_restore=true
break
fi
fi
done
if $needs_restore; then
log WARN "Restoring system config from $latest_system"
tar -xzf "$latest_system" -C / 2>/dev/null || {
log ERROR "System restore failed from $latest_system"
return 1
}
log INFO "System config restored"
else
log INFO "System configs look intact — skipping restore"
fi
}
restart_services() {
if ! command -v systemctl &>/dev/null; then
log WARN "systemctl not available — skipping service restart"
return 0
fi
local services=("mysql" "mariadb" "nginx" "apache2" "php-fpm" "php8.1-fpm" "php8.2-fpm" "php8.3-fpm")
for svc in "${services[@]}"; do
if systemctl is-enabled "$svc" &>/dev/null; then
log INFO "Restarting $svc..."
systemctl restart "$svc" 2>/dev/null && \
log INFO "$svc restarted OK" || \
log WARN "$svc restart failed"
fi
done
}
verify_health() {
local failures=0
# Check critical services are running
local services=("mysql" "mariadb" "nginx" "apache2")
for svc in "${services[@]}"; do
if systemctl is-enabled "$svc" &>/dev/null; then
if ! systemctl is-active "$svc" &>/dev/null; then
log WARN "Service not running: $svc"
((failures++))
fi
fi
done
# Check if web server responds
if command -v curl &>/dev/null; then
if ! curl -sf -o /dev/null --max-time 10 "http://localhost/" 2>/dev/null; then
log WARN "Local web server not responding"
((failures++))
fi
fi
# Check if database accepts connections
if command -v mysqladmin &>/dev/null; then
if ! mysqladmin ping --silent 2>/dev/null; then
log WARN "Database not responding to ping"
((failures++))
fi
fi
[[ $failures -eq 0 ]]
}
restore_from_backup() {
log WARN "=== Full restore from backup ==="
# Restore system config
local latest_system
latest_system=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1 | awk '{print $2}')
if [[ -n "$latest_system" ]]; then
log INFO "Restoring system from $latest_system"
tar -xzf "$latest_system" -C / 2>/dev/null || \
log ERROR "System restore failed"
fi
# Restore content
local latest_content
latest_content=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1 | awk '{print $2}')
if [[ -n "$latest_content" ]]; then
log INFO "Restoring content from $latest_content"
tar -xzf "$latest_content" -C / 2>/dev/null || \
log ERROR "Content restore failed"
fi
# Restore database
local latest_db
latest_db=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.sql.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1 | awk '{print $2}')
if [[ -n "$latest_db" ]]; then
log INFO "Restoring database from $latest_db"
local mysql_cmd="mysql"
command -v mariadb &>/dev/null && mysql_cmd="mariadb"
zcat "$latest_db" | $mysql_cmd 2>/dev/null || \
log ERROR "Database restore failed"
fi
}
# ──────────────────────────────────────────────
# Status
# ──────────────────────────────────────────────
cmd_status() {
echo "=== Moko Server Auto-Heal Status ==="
echo ""
# Safe point
if has_safepoint; then
echo "Safe point: SET"
cat "$SAFEPOINT_FILE" | sed 's/^/ /'
else
echo "Safe point: NOT SET (will auto-heal on next boot)"
fi
echo ""
# System backups
echo "System backups (${SYSTEM_BACKUP_DIR}):"
local sys_count
sys_count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' 2>/dev/null | wc -l)
echo " Count: $sys_count (retain $SYSTEM_BACKUP_RETAIN)"
local latest_sys
latest_sys=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1)
if [[ -n "$latest_sys" ]]; then
echo " Latest: $(echo "$latest_sys" | awk '{print $2}')"
echo " Timestamp: $(echo "$latest_sys" | awk '{print $1}')"
else
echo " Latest: (none)"
fi
echo ""
# Content backups
echo "Content backups (${CONTENT_BACKUP_DIR}):"
local cnt_count
cnt_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' 2>/dev/null | wc -l)
echo " Count: $cnt_count (retain ${CONTENT_BACKUP_RETAIN_HOURS}h)"
local latest_cnt
latest_cnt=$(find "$CONTENT_BACKUP_DIR" -name 'content_*.tar.gz' -printf '%T+ %p\n' \
2>/dev/null | sort -r | head -1)
if [[ -n "$latest_cnt" ]]; then
echo " Latest: $(echo "$latest_cnt" | awk '{print $2}')"
echo " Timestamp: $(echo "$latest_cnt" | awk '{print $1}')"
else
echo " Latest: (none)"
fi
echo ""
# Disk usage
echo "Backup disk usage:"
du -sh "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" 2>/dev/null | sed 's/^/ /'
}
# ──────────────────────────────────────────────
# Install helper — sets up cron + systemd
# ──────────────────────────────────────────────
cmd_install() {
local script_path
script_path=$(readlink -f "$0")
echo "Installing Moko Auto-Heal..."
# Create config directory
mkdir -p /etc/moko "$(dirname "$LOG_FILE")" "$LOCK_DIR"
# Write example config if none exists
if [[ ! -f "$CONF_FILE" ]]; then
cat > "$CONF_FILE" <<'CONF'
# /etc/moko/autoheal.conf — Server auto-heal configuration
# Uncomment and modify as needed
# BACKUP_ROOT="/var/backups/moko"
# SAFEPOINT_FILE="/var/run/moko/safepoint"
# LOG_FILE="/var/log/moko/autoheal.log"
# System backup paths (space-separated)
# SYSTEM_PATHS="/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system"
# Content backup paths (space-separated)
# CONTENT_PATHS="/var/www"
# Database names (space-separated, empty = auto-detect all)
# DB_NAMES=""
# Retention
# SYSTEM_BACKUP_RETAIN=7 # daily backups to keep
# CONTENT_BACKUP_RETAIN_HOURS=24 # hours of content backups to keep
CONF
echo " Created config: $CONF_FILE"
fi
# Install cron jobs
local cron_file="/etc/cron.d/moko-autoheal"
cat > "$cron_file" <<CRON
# Moko Server Auto-Heal — managed by server-autoheal.sh install
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Boot check — auto-heal if no safe point
@reboot root ${script_path} boot-check
# System backup — daily at 3:00 AM
0 3 * * * root ${script_path} backup-system
# Content backup — every 2 hours
0 */2 * * * root ${script_path} backup-content
# Cleanup expired backups — 30 min after each content backup
30 */2 * * * root ${script_path} cleanup
CRON
echo " Installed cron: $cron_file"
# Install shutdown hook to set safe point on clean shutdown
local shutdown_hook="/etc/systemd/system/moko-safepoint.service"
cat > "$shutdown_hook" <<UNIT
[Unit]
Description=Moko Safe Point — mark clean shutdown
DefaultDependencies=no
Before=shutdown.target reboot.target halt.target
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/true
ExecStop=${script_path} set-safepoint
[Install]
WantedBy=multi-user.target
UNIT
systemctl daemon-reload
systemctl enable moko-safepoint.service
echo " Installed systemd hook: $shutdown_hook"
echo ""
echo "Done! Edit $CONF_FILE to configure paths for your server."
echo "Run '${script_path} status' to verify."
}
# ──────────────────────────────────────────────
# Main dispatcher
# ──────────────────────────────────────────────
main() {
local cmd="${1:-help}"
case "$cmd" in
boot-check) cmd_boot_check ;;
set-safepoint) cmd_set_safepoint ;;
clear-safepoint) cmd_clear_safepoint ;;
backup-system) cmd_backup_system ;;
backup-content) cmd_backup_content ;;
cleanup) cmd_cleanup ;;
status) cmd_status ;;
install) cmd_install ;;
help|--help|-h)
sed -n '2,/^$/s/^# //p' "$0"
echo ""
echo "Commands: boot-check, set-safepoint, clear-safepoint,"
echo " backup-system, backup-content, cleanup, status, install"
;;
*)
echo "Unknown command: $cmd" >&2
echo "Run '$0 help' for usage" >&2
exit 1
;;
esac
}
main "$@"