moko-platform/automation/server-autoheal.sh

#!/usr/bin/env bash
# server-autoheal.sh - Auto-heal on restart + split backup management
#
# Copyright (C) 2026 Moko Consulting <hello@mokoconsulting.tech>
# SPDX-License-Identifier: GPL-3.0-or-later
#
# DEFGROUP: MokoStandards.Automation.ServerAutoheal
# INGROUP:  MokoStandards.Automation
# REPO:     https://git.mokoconsulting.tech/MokoConsulting/moko-platform
# PATH:     /automation/server-autoheal.sh
# BRIEF:    Server auto-heal on unclean restart + split system/content backups
#
# Usage:
#   server-autoheal.sh <command> [options]
#
# Commands:
#   boot-check       Run at boot — auto-heals if no safe point exists
#   set-safepoint    Mark current state as safe (call before planned shutdown)
#   backup-system    Run a system backup (configs, packages, services)
#   backup-content   Run a content backup (site files, databases, uploads)
#   cleanup          Prune expired backups per retention policy
#   status           Show safe point and backup status
#
# Scheduling (cron):
#   @reboot          server-autoheal.sh boot-check
#   0 3 * * *        server-autoheal.sh backup-system    (daily at 3am)
#   0 */2 * * *      server-autoheal.sh backup-content   (every 2 hours)
#   30 */2 * * *     server-autoheal.sh cleanup           (30 min after content backup)

set -euo pipefail

# ──────────────────────────────────────────────
# Configuration
#
# Every setting below keeps a value that is already present and non-empty
# (from the environment or from /etc/moko/autoheal.conf) and otherwise
# falls back to the default shown here.
# ──────────────────────────────────────────────
CONF_FILE="/etc/moko/autoheal.conf"
if [[ -f "$CONF_FILE" ]]; then
    source "$CONF_FILE"
fi

# Core locations
: "${BACKUP_ROOT:=/var/backups/moko}"
# NOTE(review): /var/run is commonly a tmpfs that is emptied on every boot.
# If that is the case on the target hosts, a safe point written at shutdown
# cannot be seen by boot-check — confirm, and consider a persistent path.
: "${SAFEPOINT_FILE:=/var/run/moko/safepoint}"
: "${LOG_FILE:=/var/log/moko/autoheal.log}"
: "${LOCK_DIR:=/var/run/moko}"

# System backups (configs, package lists, service state, cron):
# a fixed number of archives is retained.
SYSTEM_BACKUP_DIR="${BACKUP_ROOT}/system"
: "${SYSTEM_BACKUP_RETAIN:=7}"

# Content backups (web roots, databases, uploads):
# retention is expressed in hours.
CONTENT_BACKUP_DIR="${BACKUP_ROOT}/content"
: "${CONTENT_BACKUP_RETAIN_HOURS:=24}"

# What to back up. Path lists are space-separated; an empty DB_NAMES
# means "auto-detect all non-system databases".
: "${SYSTEM_PATHS:=/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system}"
: "${CONTENT_PATHS:=/var/www}"
: "${DB_NAMES:=}"

# ──────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────
# Emit one timestamped log line.
#   $1   - severity label (e.g. INFO, WARN, ERROR)
#   $2.. - message words, joined with spaces
# The line is always written to stderr. Appending it to $LOG_FILE is
# best-effort: a missing or unwritable log file must never abort the
# caller (the script runs under `set -e` / `pipefail`), so that failure
# is deliberately ignored and the function returns 0.
log() {
    local level="$1"; shift
    local ts msg
    ts=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
    msg="[$ts] [$level] $*"
    printf '%s\n' "$msg" >&2
    # The outer redirect also hides the shell's own "cannot open" message.
    { printf '%s\n' "$msg" >> "$LOG_FILE"; } 2>/dev/null || true
}

# Create every directory the script writes into: both backup directories,
# the lock directory, and the parent directories of the log file and of
# the safe-point file (which may be configured outside $LOCK_DIR).
ensure_dirs() {
    mkdir -p "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" \
             "$LOCK_DIR" "$(dirname "$LOG_FILE")" \
             "$(dirname "$SAFEPOINT_FILE")"
}

# Take the per-operation lock ${LOCK_DIR}/autoheal-<name>.lock.
#   $1 - operation name used in the lock file name and in log messages
# The lock file holds the PID of the owning script and is removed by an
# EXIT trap. Behaviour:
#   - lock obtained                      → returns 0
#   - lock held by a live process        → logs a warning and exits 0
#   - lock left behind by a dead process → removed, then re-acquired
#   - lock cannot be created at all      → logs an error, returns 1
acquire_lock() {
    local lockfile="${LOCK_DIR}/autoheal-${1}.lock"
    local holder attempt

    for attempt in 1 2; do
        # noclobber makes "create if absent" a single atomic step, so two
        # concurrent runs cannot both believe they own the lock.
        if ( set -o noclobber; printf '%s\n' "$$" > "$lockfile" ) 2>/dev/null; then
            # %q keeps the trap command valid for any path (quotes, spaces).
            trap "rm -f -- $(printf '%q' "$lockfile")" EXIT
            return 0
        fi

        holder=$(cat "$lockfile" 2>/dev/null) || holder=""
        if [[ "$holder" =~ ^[0-9]+$ ]] && kill -0 "$holder" 2>/dev/null; then
            log WARN "Another $1 operation is running (PID $holder), skipping"
            exit 0
        fi

        # Owner is gone (or the file is garbage): drop it and retry once.
        rm -f -- "$lockfile"
    done

    log ERROR "Could not acquire lock: $lockfile"
    return 1
}

# Print the current UTC time as YYYYMMDD_HHMMSS (used in backup file names).
timestamp() {
    local fmt='%Y%m%d_%H%M%S'
    date -u "+${fmt}"
}

# ──────────────────────────────────────────────
# Safe-point management
# ──────────────────────────────────────────────
# Write the safe-point marker file ($SAFEPOINT_FILE).
# The file records, one key=value pair per line: the marker timestamp,
# host name, kernel release, boot time ("unknown" if `uptime -s` fails)
# and the invoking user ($SUDO_USER when set, otherwise `whoami`).
cmd_set_safepoint() {
    ensure_dirs

    local stamp
    stamp=$(timestamp)

    printf '%s\n' \
        "timestamp=$stamp" \
        "hostname=$(hostname)" \
        "kernel=$(uname -r)" \
        "uptime=$(uptime -s 2>/dev/null || echo "unknown")" \
        "set_by=${SUDO_USER:-$(whoami)}" \
        > "$SAFEPOINT_FILE"

    log INFO "Safe point set at $stamp by ${SUDO_USER:-$(whoami)}"
}

# Delete the safe-point marker (no error if it is already absent) and
# record that in the log.
cmd_clear_safepoint() {
    local marker="$SAFEPOINT_FILE"
    rm -f "$marker"
    log INFO "Safe point cleared"
}

# Succeed (status 0) only when $SAFEPOINT_FILE exists as a regular file.
has_safepoint() {
    test -f "$SAFEPOINT_FILE"
}

# ──────────────────────────────────────────────
# System backup (daily)
# ──────────────────────────────────────────────
# Create a system backup: a tar.gz of every existing entry in
# $SYSTEM_PATHS plus a text manifest (package list, enabled services,
# user crontabs), then keep only the $SYSTEM_BACKUP_RETAIN newest archives.
# Returns 1 when nothing exists to back up or when tar produced no archive.
cmd_backup_system() {
    ensure_dirs
    acquire_lock "system-backup"

    local ts
    ts=$(timestamp)
    local archive="${SYSTEM_BACKUP_DIR}/system_${ts}.tar.gz"
    local manifest="${SYSTEM_BACKUP_DIR}/system_${ts}.manifest"

    log INFO "Starting system backup → $archive"

    # SYSTEM_PATHS is a space-separated list, so the unquoted expansion
    # (word splitting) is intentional. Only paths that exist are archived.
    local p
    local existing_paths=()
    for p in $SYSTEM_PATHS; do
        if [[ -e "$p" ]]; then
            existing_paths+=("$p")
        fi
    done

    if [[ ${#existing_paths[@]} -eq 0 ]]; then
        log WARN "No system paths found to back up"
        return 1
    fi

    # Archive configs and system files. tar's stderr stays suppressed (it is
    # noisy about leading '/'), but its exit status is no longer discarded.
    local tar_rc=0
    tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || tar_rc=$?
    if [[ ! -s "$archive" ]]; then
        log ERROR "System backup failed: tar exited $tar_rc and produced no archive"
        rm -f "$archive"
        return 1
    fi
    if [[ $tar_rc -ne 0 ]]; then
        log WARN "tar exited with status $tar_rc — $archive may be incomplete"
    fi

    # Capture package list and service state as manifest
    local cron_entry
    {
        echo "=== PACKAGES ==="
        if command -v dpkg &>/dev/null; then
            dpkg --get-selections
        elif command -v rpm &>/dev/null; then
            rpm -qa --qf '%{NAME}\t%{VERSION}\n'
        fi
        echo ""
        echo "=== ENABLED SERVICES ==="
        if command -v systemctl &>/dev/null; then
            systemctl list-unit-files --state=enabled --no-pager 2>/dev/null || true
        fi
        echo ""
        echo "=== CRONTABS ==="
        for cron_entry in /var/spool/cron/crontabs/*; do
            if [[ -f "$cron_entry" ]]; then
                echo "--- $(basename "$cron_entry") ---"
                cat "$cron_entry" || true
            fi
        done 2>/dev/null || true
    } > "$manifest"

    local size
    size=$(du -sh "$archive" 2>/dev/null | cut -f1) || size="unknown"
    log INFO "System backup complete: $archive ($size)"

    # Prune old system backups (keep $SYSTEM_BACKUP_RETAIN), oldest mtime
    # first. The "%T+" timestamp contains no blanks, so everything after
    # the first blank is the path — this keeps paths with spaces intact.
    local count
    count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' | wc -l)
    if [[ "$count" -gt "$SYSTEM_BACKUP_RETAIN" ]]; then
        local to_remove=$((count - SYSTEM_BACKUP_RETAIN))
        local old
        while IFS= read -r old; do
            rm -f "$old" "${old%.tar.gz}.manifest"
            log INFO "Pruned old system backup: $old"
        done < <(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
                    | sort | head -n "$to_remove" | cut -d' ' -f2-)
    fi
}

# ──────────────────────────────────────────────
# Content backup (every 2 hours)
# ──────────────────────────────────────────────
# Create a content backup: a tar.gz of every existing entry in
# $CONTENT_PATHS and, when a dump tool is installed, a gzipped SQL dump of
# the databases in $DB_NAMES (or of all non-system databases when
# $DB_NAMES is empty).
# Returns 1 when the database dump fails; the partial dump is removed so a
# later restore cannot select it as the newest one.
cmd_backup_content() {
    ensure_dirs
    acquire_lock "content-backup"

    local ts
    ts=$(timestamp)
    local archive="${CONTENT_BACKUP_DIR}/content_${ts}.tar.gz"
    local db_dump="${CONTENT_BACKUP_DIR}/content_${ts}.sql.gz"

    log INFO "Starting content backup → $archive"

    # Back up web content / uploads. CONTENT_PATHS is a space-separated
    # list, so the unquoted expansion (word splitting) is intentional.
    local p
    local existing_paths=()
    for p in $CONTENT_PATHS; do
        if [[ -e "$p" ]]; then
            existing_paths+=("$p")
        fi
    done

    if [[ ${#existing_paths[@]} -gt 0 ]]; then
        local tar_rc=0
        tar -czf "$archive" "${existing_paths[@]}" 2>/dev/null || tar_rc=$?
        if [[ ! -s "$archive" ]]; then
            log ERROR "Content archive failed: tar exited $tar_rc and produced no archive"
            rm -f "$archive"
        else
            if [[ $tar_rc -ne 0 ]]; then
                log WARN "tar exited with status $tar_rc — $archive may be incomplete"
            fi
            local size
            size=$(du -sh "$archive" 2>/dev/null | cut -f1) || size="unknown"
            log INFO "Content files archived: $archive ($size)"
        fi
    else
        log WARN "No content paths found to back up"
    fi

    # Database dump — skipped silently when no dump tool is installed.
    local dump_cmd=""
    if command -v mariadb-dump &>/dev/null; then
        dump_cmd="mariadb-dump"
    elif command -v mysqldump &>/dev/null; then
        dump_cmd="mysqldump"
    else
        return 0
    fi

    # Pick the SQL client by name. (Stripping "dump" from the dump tool's
    # name turns "mariadb-dump" into the non-existent command "mariadb-".)
    local client_cmd="" candidate
    local client_candidates=(mysql mariadb)
    if [[ "$dump_cmd" == "mariadb-dump" ]]; then
        client_candidates=(mariadb mysql)
    fi
    for candidate in "${client_candidates[@]}"; do
        if command -v "$candidate" &>/dev/null; then
            client_cmd="$candidate"
            break
        fi
    done

    local databases=()
    if [[ -n "$DB_NAMES" ]]; then
        read -ra databases <<< "$DB_NAMES"
    elif [[ -n "$client_cmd" ]]; then
        # Auto-detect: dump all databases except system ones
        local db_name
        while IFS= read -r db_name; do
            if [[ -n "$db_name" ]]; then
                databases+=("$db_name")
            fi
        done < <("$client_cmd" -N -e \
            "SELECT schema_name FROM information_schema.schemata
                 WHERE schema_name NOT IN ('information_schema','performance_schema','mysql','sys')" \
            2>/dev/null)
    else
        log WARN "No SQL client found — cannot auto-detect databases"
    fi

    if [[ ${#databases[@]} -eq 0 ]]; then
        log WARN "No databases found to dump"
        return 0
    fi

    # With `pipefail` (set at the top of the script) a failing dump makes
    # the whole pipeline fail even though gzip itself succeeds.
    if "$dump_cmd" --single-transaction --routines --triggers \
            --databases "${databases[@]}" 2>/dev/null \
            | gzip > "$db_dump"; then
        local db_size
        db_size=$(du -sh "$db_dump" 2>/dev/null | cut -f1) || db_size="unknown"
        log INFO "Database dump complete: $db_dump ($db_size)"
    else
        rm -f "$db_dump"
        log ERROR "Database dump failed — removed partial $db_dump"
        return 1
    fi
}

# ──────────────────────────────────────────────
# Cleanup — prune content backups older than retention
# ──────────────────────────────────────────────
# Apply the retention policy to both backup directories.
#   - Content: delete every content_* file whose mtime is older than
#     $CONTENT_BACKUP_RETAIN_HOURS hours.
#   - System:  keep the $SYSTEM_BACKUP_RETAIN newest system_*.tar.gz
#     archives; each pruned archive is removed together with its
#     .manifest so pairs are never split.
cmd_cleanup() {
    ensure_dirs
    local before_count after_count

    # Content: age-based pruning (find errors are tolerated on purpose —
    # cleanup is best-effort).
    before_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
    find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f \
        -mmin +$((CONTENT_BACKUP_RETAIN_HOURS * 60)) -delete 2>/dev/null || true
    after_count=$(find "$CONTENT_BACKUP_DIR" -name 'content_*' -type f | wc -l)
    local removed=$((before_count - after_count))
    if [[ "$removed" -gt 0 ]]; then
        log INFO "Pruned $removed content backup(s) older than ${CONTENT_BACKUP_RETAIN_HOURS}h"
    fi

    # System: count-based pruning (also done by backup-system; repeated
    # here as a safety net). Oldest mtime first; the "%T+" timestamp has no
    # blanks, so cutting at the first blank preserves paths with spaces.
    local archive_count
    archive_count=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -type f | wc -l)
    if [[ "$archive_count" -gt "$SYSTEM_BACKUP_RETAIN" ]]; then
        local excess=$((archive_count - SYSTEM_BACKUP_RETAIN))
        local old
        while IFS= read -r old; do
            rm -f "$old" "${old%.tar.gz}.manifest"
        done < <(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -type f -printf '%T+ %p\n' \
                    | sort | head -n "$excess" | cut -d' ' -f2-)
        log INFO "Pruned excess system backups"
    fi

    log INFO "Cleanup complete"
}

# ──────────────────────────────────────────────
# Boot check — the auto-heal entry point
# ──────────────────────────────────────────────
# Boot-time entry point.
#   - Safe point present: the previous shutdown is treated as clean; the
#     marker is cleared for the next cycle and 0 is returned.
#   - Safe point absent: auto_heal is run. On success a new safe point is
#     written; on failure an error is logged. Returns auto_heal's status.
cmd_boot_check() {
    ensure_dirs
    acquire_lock "boot-check"

    log INFO "=== Boot check started ==="
    log INFO "Hostname: $(hostname), Kernel: $(uname -r)"

    if has_safepoint; then
        log INFO "Safe point found — server was shut down cleanly"
        log INFO "Clearing safe point for next cycle"
        cmd_clear_safepoint
        log INFO "=== Boot check passed (clean restart) ==="
        return 0
    fi

    log WARN "NO safe point found — server restarted without clean shutdown"
    log WARN "Initiating auto-heal sequence..."

    # Capture the status with `||`: a bare call followed by `rc=$?` never
    # reaches the failure branch below, because `set -e` terminates the
    # script as soon as auto_heal returns non-zero. Note that `||` also
    # suspends errexit inside auto_heal, so its phases run best-effort and
    # the final health check decides the outcome.
    local rc=0
    auto_heal || rc=$?

    # Set safe point after successful heal
    # NOTE(review): a marker written here is still present if the machine
    # later crashes, so the next boot-check would treat that crash as a
    # clean restart — confirm this is intended.
    if [[ $rc -eq 0 ]]; then
        cmd_set_safepoint
        log INFO "=== Boot check complete (healed successfully) ==="
    else
        log ERROR "=== Boot check FAILED — manual intervention required ==="
    fi

    return "$rc"
}

# ──────────────────────────────────────────────
# Auto-heal strategy
#
# TODO: This is the core decision point. Implement the recovery
# steps that match your server's architecture. See guidance below.
#
# Trade-offs to consider:
#   - Restore-from-backup: safest, but content may be up to 2h stale
#   - Service-restart-only: faster, keeps current data, but won't fix
#     corrupted configs or broken filesystem state
#   - Hybrid: restart services first, verify health, only restore if
#     health checks fail — best of both worlds but more complex
#
# The function receives no arguments. Use the latest system + content
# backups to restore if needed. Return 0 on success, 1 on failure.
# ──────────────────────────────────────────────
# Run the recovery sequence: repair filesystem leftovers, restore system
# configuration if it looks corrupted, restart services, then verify
# health. If the first health check fails, do a full restore from backup,
# restart services again and re-check once.
# Returns 0 when a health check passes, 1 when both attempts fail.
auto_heal() {
    log INFO "Phase 1: Verify and repair filesystem"
    repair_filesystem

    log INFO "Phase 2: Restore system configuration if corrupted"
    restore_system_if_needed

    log INFO "Phase 3: Restart core services"
    restart_services

    log INFO "Phase 4: Verify health"
    if verify_health; then
        log INFO "Auto-heal completed successfully"
        return 0
    fi

    # First check failed: fall back to the backups and try once more.
    log WARN "Health check failed after service restart — restoring from backup"
    restore_from_backup
    restart_services

    if verify_health; then
        log INFO "Auto-heal completed successfully"
        return 0
    fi

    log ERROR "Health check still failing after restore — giving up"
    return 1
}

# ──────────────────────────────────────────────
# Heal sub-steps
# ──────────────────────────────────────────────
# Best-effort cleanup of leftovers that can stop services from starting:
#   1. remove PID files whose recorded process is no longer alive,
#   2. re-apply ownership / mode on a few runtime directories.
# Failures of chown/chmod are ignored on purpose.
repair_filesystem() {
    # --- 1. stale PID files -------------------------------------------
    # An unmatched glob stays literal and is rejected by the -f test.
    local pid_file recorded_pid
    for pid_file in \
        /var/run/nginx.pid \
        /var/run/mysqld/mysqld.pid \
        /var/run/php-fpm.pid \
        /var/lib/mysql/*.pid
    do
        [[ -f "$pid_file" ]] || continue

        recorded_pid=$(<"$pid_file") 2>/dev/null || true
        # Only remove when a PID was read and that process does not exist.
        if [[ -n "$recorded_pid" ]] && ! kill -0 "$recorded_pid" 2>/dev/null; then
            rm -f "$pid_file"
            log INFO "Removed stale PID file: $pid_file"
        fi
    done

    # --- 2. ownership / permissions -----------------------------------
    if [[ -d /var/run/mysqld ]]; then
        chown mysql:mysql /var/run/mysqld 2>/dev/null || true
    fi
    if [[ -d /var/lib/php/sessions ]]; then
        chmod 1733 /var/lib/php/sessions 2>/dev/null || true
    fi

    # Temp directories: world-writable with the sticky bit.
    local tmp_dir
    for tmp_dir in /tmp /var/tmp; do
        if [[ -d "$tmp_dir" ]]; then
            chmod 1777 "$tmp_dir" 2>/dev/null || true
        fi
    done
}

# Restore system configuration from the newest system archive, but only
# when one of the critical config paths is an existing, zero-length
# regular file. Directories and missing paths never trigger a restore.
# Returns 0 when no archive exists or no restore is needed, 1 when the
# extraction fails.
restore_system_if_needed() {
    # Newest archive by modification time (empty when there is none).
    local newest_archive
    newest_archive=$(find "$SYSTEM_BACKUP_DIR" -name 'system_*.tar.gz' -printf '%T+ %p\n' \
        2>/dev/null | sort | tail -n 1 | awk '{print $2}')

    if [[ -z "$newest_archive" ]]; then
        log WARN "No system backup available to verify against"
        return 0
    fi

    # Look for the first critical config that is an empty regular file.
    local empty_config=""
    local candidate
    for candidate in "/etc/nginx/nginx.conf" "/etc/php" "/etc/mysql"; do
        if [[ -f "$candidate" && ! -s "$candidate" ]]; then
            log WARN "Critical config is empty: $candidate"
            empty_config="$candidate"
            break
        fi
    done

    if [[ -z "$empty_config" ]]; then
        log INFO "System configs look intact — skipping restore"
    else
        log WARN "Restoring system config from $newest_archive"
        if ! tar -xzf "$newest_archive" -C / 2>/dev/null; then
            log ERROR "System restore failed from $newest_archive"
            return 1
        fi
        log INFO "System config restored"
    fi
}

# Restart every known database / web / PHP service that systemd reports as
# enabled, logging the outcome of each restart. A failed restart is logged
# but does not stop the loop. Does nothing (beyond a warning) when
# systemctl is not installed. Always returns 0.
restart_services() {
    if ! command -v systemctl &>/dev/null; then
        log WARN "systemctl not available — skipping service restart"
        return 0
    fi

    local services=("mysql" "mariadb" "nginx" "apache2" "php-fpm" "php8.1-fpm" "php8.2-fpm" "php8.3-fpm")
    local svc

    for svc in "${services[@]}"; do
        if ! systemctl is-enabled "$svc" &>/dev/null; then
            continue
        fi

        log INFO "Restarting $svc..."
        # Explicit if/else: with `restart && log OK || log WARN` the
        # failure message would also fire whenever the OK log call itself
        # returned non-zero.
        if systemctl restart "$svc" 2>/dev/null; then
            log INFO "$svc restarted OK"
        else
            log WARN "$svc restart failed"
        fi
    done

    return 0
}

# Run the post-heal health checks and log each failure:
#   - every enabled service among mysql/mariadb/nginx/apache2 is active,
#   - http://localhost/ answers within 10 s (only when curl is installed),
#   - the database answers `mysqladmin ping` (only when it is installed).
# Returns 0 when no check failed, 1 otherwise.
verify_health() {
    local failures=0

    # Check critical services are running
    local svc
    for svc in mysql mariadb nginx apache2; do
        if systemctl is-enabled "$svc" &>/dev/null \
                && ! systemctl is-active "$svc" &>/dev/null; then
            log WARN "Service not running: $svc"
            # Plain assignment instead of ((failures++)): the latter
            # returns status 1 when the old value is 0, which aborts the
            # script under `set -e` when called outside a conditional.
            failures=$((failures + 1))
        fi
    done

    # Check if web server responds
    if command -v curl &>/dev/null; then
        if ! curl -sf -o /dev/null --max-time 10 "http://localhost/" 2>/dev/null; then
            log WARN "Local web server not responding"
            failures=$((failures + 1))
        fi
    fi

    # Check if database accepts connections
    if command -v mysqladmin &>/dev/null; then
        if ! mysqladmin ping --silent 2>/dev/null; then
            log WARN "Database not responding to ping"
            failures=$((failures + 1))
        fi
    fi

    [[ $failures -eq 0 ]]
}

# Print the path of the most recently modified file under directory $1
# whose name matches the find(1) pattern $2; prints nothing if none match.
_newest_backup() {
    find "$1" -name "$2" -printf '%T+ %p\n' 2>/dev/null \
        | sort -r | head -1 | awk '{print $2}'
}

# Full restore: unpack the newest system archive and the newest content
# archive into /, then replay the newest SQL dump through the mariadb
# client (or mysql when mariadb is not installed). Each step is skipped
# when no matching backup exists; a failing step is logged and the
# remaining steps still run.
restore_from_backup() {
    log WARN "=== Full restore from backup ==="

    # --- system configuration ---
    local system_archive
    system_archive=$(_newest_backup "$SYSTEM_BACKUP_DIR" 'system_*.tar.gz')
    if [[ -n "$system_archive" ]]; then
        log INFO "Restoring system from $system_archive"
        if ! tar -xzf "$system_archive" -C / 2>/dev/null; then
            log ERROR "System restore failed"
        fi
    fi

    # --- site content ---
    local content_archive
    content_archive=$(_newest_backup "$CONTENT_BACKUP_DIR" 'content_*.tar.gz')
    if [[ -n "$content_archive" ]]; then
        log INFO "Restoring content from $content_archive"
        if ! tar -xzf "$content_archive" -C / 2>/dev/null; then
            log ERROR "Content restore failed"
        fi
    fi

    # --- databases ---
    local db_archive
    db_archive=$(_newest_backup "$CONTENT_BACKUP_DIR" 'content_*.sql.gz')
    if [[ -n "$db_archive" ]]; then
        log INFO "Restoring database from $db_archive"
        local db_client="mysql"
        if command -v mariadb &>/dev/null; then
            db_client="mariadb"
        fi
        if ! zcat "$db_archive" | "$db_client" 2>/dev/null; then
            log ERROR "Database restore failed"
        fi
    fi
}

# ──────────────────────────────────────────────
# Status
# ──────────────────────────────────────────────
# Print a human-readable report to stdout: whether a safe point is set
# (with its contents), then for system and content backups the archive
# count, retention setting and newest archive, and finally the disk usage
# of both backup directories.
cmd_status() {
    printf '%s\n' "=== Moko Server Auto-Heal Status ===" ""

    # Safe point
    if has_safepoint; then
        echo "Safe point:  SET"
        sed 's/^/  /' "$SAFEPOINT_FILE"
    else
        echo "Safe point:  NOT SET (will auto-heal on next boot)"
    fi
    echo ""

    # Both backup sections share one layout; only the inputs differ.
    local section title dir pattern retain total newest stamp path rest
    for section in system content; do
        if [[ "$section" == system ]]; then
            title="System backups"
            dir="$SYSTEM_BACKUP_DIR"
            pattern='system_*.tar.gz'
            retain="$SYSTEM_BACKUP_RETAIN"
        else
            title="Content backups"
            dir="$CONTENT_BACKUP_DIR"
            pattern='content_*.tar.gz'
            retain="${CONTENT_BACKUP_RETAIN_HOURS}h"
        fi

        echo "${title} (${dir}):"
        total=$(find "$dir" -name "$pattern" 2>/dev/null | wc -l)
        echo "  Count:     $total (retain $retain)"

        # Newest entry as "<mtime> <path>"; empty when nothing matches.
        newest=$(find "$dir" -name "$pattern" -printf '%T+ %p\n' \
            2>/dev/null | sort -r | head -1)
        if [[ -n "$newest" ]]; then
            # First two whitespace-separated fields: mtime and path.
            read -r stamp path rest <<< "$newest"
            echo "  Latest:    $path"
            echo "  Timestamp: $stamp"
        else
            echo "  Latest:    (none)"
        fi
        echo ""
    done

    # Disk usage
    echo "Backup disk usage:"
    du -sh "$SYSTEM_BACKUP_DIR" "$CONTENT_BACKUP_DIR" 2>/dev/null | sed 's/^/  /'
}

# ──────────────────────────────────────────────
# Install helper — sets up cron + systemd
# ──────────────────────────────────────────────
# One-time installer. Writes, using the resolved absolute path of this
# script:
#   - $CONF_FILE with a fully commented-out example (only if it is absent),
#   - /etc/cron.d/moko-autoheal with the boot-check, backup and cleanup jobs,
#   - /etc/systemd/system/moko-safepoint.service, whose ExecStop runs
#     `set-safepoint`; the unit is then enabled.
# Existing cron and unit files are overwritten. Progress goes to stdout.
cmd_install() {
    local script_path
    # Absolute path so the cron entries and the unit do not depend on $PWD
    script_path=$(readlink -f "$0")

    echo "Installing Moko Auto-Heal..."

    # Create the config, log and lock directories
    mkdir -p /etc/moko "$(dirname "$LOG_FILE")" "$LOCK_DIR"

    # Write example config if none exists (quoted delimiter: the text is
    # written literally, nothing is expanded)
    if [[ ! -f "$CONF_FILE" ]]; then
        cat > "$CONF_FILE" <<'CONF'
# /etc/moko/autoheal.conf — Server auto-heal configuration
# Uncomment and modify as needed

# BACKUP_ROOT="/var/backups/moko"
# SAFEPOINT_FILE="/var/run/moko/safepoint"
# LOG_FILE="/var/log/moko/autoheal.log"

# System backup paths (space-separated)
# SYSTEM_PATHS="/etc/nginx /etc/php /etc/mysql /etc/cron.d /etc/systemd/system"

# Content backup paths (space-separated)
# CONTENT_PATHS="/var/www"

# Database names (space-separated, empty = auto-detect all)
# DB_NAMES=""

# Retention
# SYSTEM_BACKUP_RETAIN=7        # daily backups to keep
# CONTENT_BACKUP_RETAIN_HOURS=24  # hours of content backups to keep
CONF
        echo "  Created config: $CONF_FILE"
    fi

    # Install cron jobs (unquoted delimiter: ${script_path} is expanded
    # into the generated file)
    local cron_file="/etc/cron.d/moko-autoheal"
    cat > "$cron_file" <<CRON
# Moko Server Auto-Heal — managed by server-autoheal.sh install
SHELL=/bin/bash
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Boot check — auto-heal if no safe point
@reboot root ${script_path} boot-check

# System backup — daily at 3:00 AM
0 3 * * * root ${script_path} backup-system

# Content backup — every 2 hours
0 */2 * * * root ${script_path} backup-content

# Cleanup expired backups — 30 min after each content backup
30 */2 * * * root ${script_path} cleanup
CRON
    echo "  Installed cron: $cron_file"

    # Install shutdown hook to set safe point on clean shutdown: the unit's
    # ExecStart is a no-op and its ExecStop runs `set-safepoint`.
    # NOTE(review): the unit is only enabled below, not started. Presumably
    # its ExecStop cannot run at a shutdown that happens before the unit has
    # ever been started — confirm whether `enable --now` is wanted.
    local shutdown_hook="/etc/systemd/system/moko-safepoint.service"
    cat > "$shutdown_hook" <<UNIT
[Unit]
Description=Moko Safe Point — mark clean shutdown
DefaultDependencies=no
Before=shutdown.target reboot.target halt.target

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/bin/true
ExecStop=${script_path} set-safepoint

[Install]
WantedBy=multi-user.target
UNIT
    # Make systemd pick up the new unit file, then enable it
    systemctl daemon-reload
    systemctl enable moko-safepoint.service
    echo "  Installed systemd hook: $shutdown_hook"

    echo ""
    echo "Done! Edit $CONF_FILE to configure paths for your server."
    echo "Run '${script_path} status' to verify."
}

# ──────────────────────────────────────────────
# Main dispatcher
# ──────────────────────────────────────────────
# Command-line dispatcher.
#   $1 - command name; defaults to "help" when omitted.
# Known commands run the matching cmd_* function. Help prints the comment
# header of this script followed by the command list. Anything else
# prints an error to stderr and exits with status 1.
main() {
    local cmd="${1:-help}"

    case "$cmd" in
        boot-check|set-safepoint|clear-safepoint|backup-system|backup-content|cleanup|status|install)
            # Each command maps onto cmd_<name>, with dashes turned into
            # underscores (e.g. boot-check → cmd_boot_check).
            "cmd_${cmd//-/_}"
            ;;
        help|--help|-h)
            sed -n '2,/^$/s/^# //p' "$0"
            printf '%s\n' \
                "" \
                "Commands: boot-check, set-safepoint, clear-safepoint," \
                "          backup-system, backup-content, cleanup, status, install"
            ;;
        *)
            printf '%s\n' \
                "Unknown command: $cmd" \
                "Run '$0 help' for usage" >&2
            exit 1
            ;;
    esac
}

main "$@"