linux-sre-handbook

05-实战脚本案例

1. 服务健康检查

#!/bin/bash
# Service health check: probes a fixed list of TCP endpoints and one HTTP URL,
# printing one "OK: ..." or "FAIL: ..." line per target.
# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# Probe a TCP port through bash's /dev/tcp pseudo-device.
# Arguments:
#   $1 - host name or address
#   $2 - TCP port
#   $3 - timeout in seconds (optional, default 3)
# Outputs:
#   "OK: <host>:<port>" or "FAIL: <host>:<port>" on stdout.
# Returns:
#   The status of the final echo, so a failed probe does not abort a caller
#   running under `set -e`.
check_port() {
    local host=$1 port=$2 timeout=${3:-3}
    # Host and port are handed to the child shell as positional parameters
    # instead of being spliced into the command string, so metacharacters in
    # them are never evaluated as shell code.
    if timeout "$timeout" bash -c 'echo >"/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null; then
        echo "OK: $host:$port"
    else
        echo "FAIL: $host:$port"
    fi
}

# Request a URL with curl and report whether it answered with HTTP 200.
# Arguments:
#   $1 - URL to request
# Outputs:
#   "OK: <url> (<code>)" when the status code is 200,
#   "FAIL: <url> (<code>)" otherwise.
check_http() {
    local url=$1
    local status
    # curl's own exit status is deliberately ignored; only the captured
    # status code decides the outcome.
    status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url") || true
    if [[ $status == "200" ]]; then
        echo "OK: $url ($status)"
    else
        echo "FAIL: $url ($status)"
    fi
}

# Checklist of TCP endpoints to probe, one "host:port" entry each.
SERVICES=(
    "localhost:22"
    "localhost:80"
    "localhost:3306"
)

for endpoint in "${SERVICES[@]}"; do
    # Split the entry at its colon: host on the left, port on the right.
    host=${endpoint%%:*}
    port=${endpoint#*:}
    check_port "$host" "$port"
done

# Finally probe the HTTP health endpoint.
check_http "http://localhost/health"

2. 日志轮转与清理

#!/bin/bash
# Log rotation and cleanup for LOG_DIR:
#   1. gzip every *.log file older than RETENTION_DAYS into ARCHIVE_DIR and
#      delete the original;
#   2. delete archives older than ARCHIVE_RETENTION_DAYS.
set -euo pipefail

LOG_DIR="/var/log/myapp"
RETENTION_DAYS=30
ARCHIVE_RETENTION_DAYS=90
ARCHIVE_DIR="$LOG_DIR/archive"

mkdir -p "$ARCHIVE_DIR"

# -type f keeps directories that happen to be named *.log away from gzip;
# the archive directory is pruned so it is never re-scanned.
# The pipeline form is kept on purpose: with pipefail a failing find still
# aborts the script.
find "$LOG_DIR" -path "$ARCHIVE_DIR" -prune -o \
    -type f -name "*.log" -mtime +"$RETENTION_DAYS" -print0 |
    while IFS= read -r -d '' f; do
        dest="$ARCHIVE_DIR/$(basename -- "$f")-$(date +%Y%m%d).gz"
        tmp="$dest.tmp"
        # Compress under a temporary name and rename afterwards, so a failed
        # gzip never leaves a truncated file under the final archive name.
        if ! gzip -c -- "$f" > "$tmp"; then
            rm -f -- "$tmp"
            echo "ERROR: failed to archive $f" >&2
            exit 1
        fi
        mv -f -- "$tmp" "$dest"
        # The original is removed only after the archive is in place.
        rm -- "$f"
        echo "Archived: $f"
    done

# Remove archives older than ARCHIVE_RETENTION_DAYS.
find "$ARCHIVE_DIR" -type f -name "*.gz" -mtime +"$ARCHIVE_RETENTION_DAYS" -delete
echo "Cleanup done."

3. 批量 SSH 执行

#!/bin/bash
# Batch SSH: run one command on every host listed in a file.
# Usage: <script> [hosts_file] [command]
#   hosts_file - one host per line; blank lines and lines starting with '#'
#                are skipped (default: hosts.txt)
#   command    - command to run on each host (default: uptime)
set -euo pipefail

HOSTS_FILE="${1:-hosts.txt}"
COMMAND="${2:-uptime}"

if [[ ! -f "$HOSTS_FILE" ]]; then
    echo "Hosts file not found" >&2
    exit 1
fi

# `|| [[ -n "$host" ]]` also processes a last line that lacks a trailing newline.
while IFS= read -r host || [[ -n "$host" ]]; do
    if [[ -z "$host" || "$host" =~ ^# ]]; then
        continue
    fi
    echo "=== $host ==="
    rc=0
    # -n detaches ssh from stdin; without it ssh swallows the rest of the
    # hosts file and the loop stops after the first host.
    # `--` stops a host entry beginning with '-' from being parsed as an option.
    # NOTE(review): StrictHostKeyChecking=no disables host key verification;
    # confirm this is acceptable for the target environment.
    ssh -n -o ConnectTimeout=5 -o StrictHostKeyChecking=no -- "$host" "$COMMAND" 2>&1 || rc=$?
    if (( rc == 255 )); then
        # ssh reserves 255 for its own errors (connection, authentication).
        echo "FAILED to connect"
    elif (( rc != 0 )); then
        echo "FAILED: command exited with status $rc"
    fi
    echo
done < "$HOSTS_FILE"

4. 磁盘告警脚本

#!/bin/bash
# Disk usage alert: mails ALERT_EMAIL once for every mounted filesystem whose
# usage percentage is above THRESHOLD.
set -euo pipefail

THRESHOLD=85
ALERT_EMAIL="sre@example.com"

# Scan `df -P` formatted lines and send an alert for each over-full filesystem.
# Globals:
#   THRESHOLD (read), ALERT_EMAIL (read)
# Inputs:
#   stdin - lines of six fields: filesystem, blocks, used, available,
#           capacity, mount point
# Outputs:
#   One mail per filesystem above THRESHOLD; a diagnostic on stderr when
#   sending fails.
check_disk_usage() {
    local fs blocks used_blocks avail used mount pct
    # `mount` is the last variable, so it receives the rest of the line and
    # mount points containing spaces stay intact.
    while read -r fs blocks used_blocks avail used mount; do
        pct=${used%\%}
        # Skips the header line and non-numeric capacities such as "-".
        [[ $pct =~ ^[0-9]+$ ]] || continue
        # 10# forces base 10 so a leading zero is not read as octal.
        if (( 10#$pct > THRESHOLD )); then
            echo "WARNING: $mount is ${pct}% full" |
                mail -s "Disk Alert: $mount" "$ALERT_EMAIL" ||
                echo "ERROR: failed to send alert for $mount" >&2
        fi
    done
}

# -P selects the POSIX output format: one line per filesystem, never wrapped.
# A df error is reported but does not discard the lines df did print.
df -P | check_disk_usage ||
    echo "WARNING: df reported an error; results may be incomplete" >&2

5. 进程守护 (Watchdog)

#!/bin/bash
# Process watchdog: every CHECK_INTERVAL seconds, checks that a process named
# PROCESS_NAME exists and runs RESTART_CMD when it does not. After MAX_RESTART
# restart attempts with no healthy check in between, mails an alert and exits 1.
set -u

PROCESS_NAME="myapp"
RESTART_CMD="systemctl restart myapp"
MAX_RESTART=3
CHECK_INTERVAL=10

# Split the command string into words once, without glob expansion.
read -r -a restart_cmd <<< "$RESTART_CMD"

restart_count=0
while true; do
    if pgrep -x "$PROCESS_NAME" > /dev/null; then
        # Healthy again: forget earlier attempts so restarts that happen far
        # apart do not add up to a fatal exit.
        restart_count=0
    else
        echo "[$(date)] $PROCESS_NAME is DOWN"
        if (( restart_count < MAX_RESTART )); then
            restart_count=$((restart_count + 1))
            # Report success only when the restart command itself succeeded.
            if "${restart_cmd[@]}"; then
                echo "Restarted (attempt $restart_count/$MAX_RESTART)"
            else
                echo "Restart command failed (attempt $restart_count/$MAX_RESTART)" >&2
            fi
        else
            echo "FATAL: Max restart attempts reached" | \
                mail -s "CRITICAL: $PROCESS_NAME failed" sre@example.com
            exit 1
        fi
    fi
    sleep "$CHECK_INTERVAL"
done

延伸阅读