# 健康检查脚本

## 脚本说明

健康检查脚本用于检查系统、服务、网络等各方面的健康状况,并提供详细的检查报告。

## 脚本代码

#!/bin/bash
#
# Health check script.
# Purpose: inspect the health of the system, its services and its network.
# Author:  System Admin
# Date:    2024-01-01

# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# File that log() appends every log line to.
LOG_FILE="/var/log/health_check.log"

# Default report destination; main() overrides it when -r is given.
REPORT_FILE="/tmp/health_check_report.txt"

# Usage percentage above which the memory, disk and CPU checks warn.
ALERT_THRESHOLD=80

# systemd units inspected by check_services().
CHECK_SERVICES=(
    "nginx"
    "mysql"
    "redis"
    "ssh"
)

# Ports that check_ports() expects to find in the listening-socket list.
CHECK_PORTS=(
    80
    443
    22
    3306
)

# Endpoints that check_http() expects to answer with HTTP 200.
CHECK_URLS=(
    "http://localhost/health"
    "http://localhost/api/status"
)

# ---------------------------------------------------------------------------
# ANSI color escape sequences
# ---------------------------------------------------------------------------
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'

# Logging helper.
# Prints "[timestamp] [LEVEL] message" on stdout and appends the same line to
# $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label (e.g. INFO); remaining arguments - message words
# Outputs:   the formatted line on stdout
# Returns:   0, even when the log file cannot be written
log() {
    local level=$1
    shift

    # Declared separately so a failing date is not masked by `local`.
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # "$*" joins the message words with single spaces into one string.
    local line="[$timestamp] [$level] $*"

    printf '%s\n' "$line"

    # Best-effort append, intentionally silent: an unwritable log file (for
    # example /var/log when not running as root) must not make every log call
    # fail, which under `set -e` would abort the whole health check.
    { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
}

# Log the given message words at INFO level.
log_info() { log "INFO" "$@"; }

# Log the given message words at ERROR level.
log_error() { log "ERROR" "$@"; }

# Log the given message words at WARNING level.
log_warning() { log "WARNING" "$@"; }

# Check the system load average.
# Prints the 1/5/15-minute load averages taken from `uptime` and compares the
# 1-minute value with 80% of the CPU count reported by `nproc`.
# Outputs: a short load summary on stdout
# Returns: 0 when the load is within the threshold; 1 when it is too high or
#          the load could not be determined
check_system_load() {
    log_info "检查系统负载"

    local load load_1min load_5min load_15min
    # uptime separates the three averages with ", ". The commas have to be
    # removed, otherwise the 1-minute value is "0.15," and every numeric
    # comparison on it fails. LC_ALL=C pins "." as the decimal separator so
    # stripping commas cannot damage the numbers themselves.
    load=$(LC_ALL=C uptime \
        | awk -F'load average:' '{ gsub(/,/, "", $2); print $2 }') || load=""
    read -r load_1min load_5min load_15min _ <<< "$load"

    echo "系统负载"
    echo "========"
    echo "1分钟负载: $load_1min"
    echo "5分钟负载: $load_5min"
    echo "15分钟负载: $load_15min"

    # Refuse to compare something that is not a number instead of silently
    # reporting a healthy system.
    local number_re='^[0-9]+([.][0-9]+)?$'
    if [[ ! "$load_1min" =~ $number_re ]]; then
        log_error "无法获取系统负载"
        return 1
    fi

    local cpu_count load_threshold
    cpu_count=$(nproc)
    load_threshold=$(awk -v n="$cpu_count" 'BEGIN { printf "%.2f", n * 0.8 }')

    # awk performs the floating-point comparison; "+ 0" forces numeric context.
    if awk -v l="$load_1min" -v t="$load_threshold" \
        'BEGIN { exit !(l + 0 > t + 0) }'; then
        log_warning "系统负载过高: $load_1min"
        return 1
    fi

    log_info "系统负载正常"
    return 0
}

# Check memory usage.
# Reads the "Mem:" row of `free -m` and compares used/total (in percent) with
# ALERT_THRESHOLD.
# Globals: ALERT_THRESHOLD (read)
# Outputs: total / used / free megabytes and the usage percentage on stdout
# Returns: 0 when usage is within the threshold; 1 when it is too high or the
#          figures could not be read
check_memory() {
    log_info "检查内存使用"

    local mem_line total used free_mb usage_percent
    # LC_ALL=C keeps the "Mem:" row label untranslated so the match below does
    # not depend on the user's locale.
    mem_line=$(LC_ALL=C free -m | awk '/^Mem:/ { print $2, $3, $4 }') || mem_line=""
    read -r total used free_mb <<< "$mem_line"

    echo "内存使用"
    echo "========"

    # Without a positive numeric total the percentage would be a division by
    # zero / empty value that silently compares as "not above the threshold".
    local int_re='^[0-9]+$' zero_re='^0+$'
    if [[ ! "$total" =~ $int_re ]] || [[ ! "$used" =~ $int_re ]] \
        || [[ "$total" =~ $zero_re ]]; then
        log_error "无法获取内存信息"
        return 1
    fi

    usage_percent=$(awk -v u="$used" -v t="$total" \
        'BEGIN { printf "%.2f", u * 100 / t }')

    echo "总内存: ${total}MB"
    echo "已使用: ${used}MB"
    echo "空闲: ${free_mb}MB"
    echo "使用率: ${usage_percent}%"

    if awk -v p="$usage_percent" -v limit="$ALERT_THRESHOLD" \
        'BEGIN { exit !(p + 0 > limit + 0) }'; then
        log_warning "内存使用率过高: ${usage_percent}%"
        return 1
    fi

    log_info "内存使用正常"
    return 0
}

# Check disk usage.
# Lists every filesystem reported by df (tmpfs and cdrom rows excluded) and
# warns for each one whose used capacity exceeds ALERT_THRESHOLD percent.
# Globals: ALERT_THRESHOLD (read)
# Outputs: the df rows that were examined, on stdout
# Returns: 0 when every filesystem is within the threshold, 1 otherwise
check_disk() {
    log_info "检查磁盘使用"

    echo "磁盘使用"
    echo "========"

    local line usage mount
    local failed=0
    local int_re='^[0-9]+$'

    # The loop is fed through process substitution rather than a pipe so that
    # `failed` is updated in this shell and can drive the return status.
    # -P forces one row per filesystem (no wrapping of long device names) and
    # LC_ALL=C keeps the header and layout locale-independent.
    while IFS= read -r line; do
        # POSIX df columns: filesystem, size, used, available, capacity, mount.
        # `mount` receives the remainder of the row.
        read -r _ _ _ _ usage mount <<< "$line"
        usage=${usage%\%}

        echo "$line"

        # Skip rows whose capacity is not a plain number (for example "-").
        [[ "$usage" =~ $int_re ]] || continue

        # 10# forces base 10 so a value such as "08" is not read as octal.
        if (( 10#$usage > ALERT_THRESHOLD )); then
            log_warning "磁盘使用率过高: $mount ${usage}%"
            failed=$((failed + 1))
        fi
    done < <(LC_ALL=C df -hP | awk 'NR > 1 && !/tmpfs|cdrom/')

    if [ "$failed" -gt 0 ]; then
        return 1
    fi

    log_info "磁盘使用正常"
    return 0
}

# Check CPU usage.
# Takes the user-time ("us") percentage from the "Cpu(s)" summary line of one
# batch iteration of top and compares it with ALERT_THRESHOLD.
# Globals: ALERT_THRESHOLD (read)
# Outputs: the CPU usage percentage on stdout
# Returns: 0 when usage is within the threshold; 1 when it is too high or the
#          value could not be parsed
check_cpu() {
    log_info "检查CPU使用"

    local cpu_usage
    # The summary line appears as "%Cpu(s):  1.2 us, ...", as
    # "%Cpu(s):100.0 us, ..." (no space once the value is 100.0) or, on older
    # top versions, as "Cpu(s):  1.2%us, ...". Splitting on commas and keeping
    # only digits and dots of the "us" field handles all three layouts.
    # The awk program deliberately reads all input (no early `exit`) so top is
    # never killed by SIGPIPE, which would fail the pipeline under pipefail.
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && !found {
            sub(/^[^:]*:/, "")
            n = split($0, parts, ",")
            for (i = 1; i <= n; i++) {
                if (parts[i] ~ /us/) {
                    gsub(/[^0-9.]/, "", parts[i])
                    print parts[i]
                    found = 1
                    break
                }
            }
        }') || cpu_usage=""

    echo "CPU使用"
    echo "======="
    echo "CPU使用率: ${cpu_usage}%"

    # A non-numeric value must not be reported as a healthy CPU.
    local number_re='^[0-9]+([.][0-9]+)?$'
    if [[ ! "$cpu_usage" =~ $number_re ]]; then
        log_error "无法获取CPU使用率"
        return 1
    fi

    if awk -v p="$cpu_usage" -v limit="$ALERT_THRESHOLD" \
        'BEGIN { exit !(p + 0 > limit + 0) }'; then
        log_warning "CPU使用率过高: ${cpu_usage}%"
        return 1
    fi

    log_info "CPU使用正常"
    return 0
}

# Check the state of every unit listed in CHECK_SERVICES.
# Globals: CHECK_SERVICES (read)
# Outputs: one "<service>: <state> (开机启动: <enablement>)" line per service
# Returns: 0 when every service is "active", 1 otherwise
check_services() {
    log_info "检查服务状态"

    echo "服务状态"
    echo "========"

    local failed=0
    local service status enabled

    for service in "${CHECK_SERVICES[@]}"; do
        # systemctl prints the state ("inactive", "disabled", ...) AND exits
        # non-zero for every state other than active/enabled. The exit status
        # is therefore ignored, and "unknown" is substituted only when nothing
        # was printed; appending it on failure would yield "inactive\nunknown".
        status=$(systemctl is-active "$service" 2>/dev/null) || true
        enabled=$(systemctl is-enabled "$service" 2>/dev/null) || true
        status=${status:-unknown}
        enabled=${enabled:-unknown}

        echo "$service: $status (开机启动: $enabled)"

        if [ "$status" != "active" ]; then
            log_error "服务未运行: $service"
            failed=$((failed + 1))
        fi
    done

    if [ "$failed" -gt 0 ]; then
        return 1
    fi

    log_info "所有服务运行正常"
    return 0
}

# Check that every port in CHECK_PORTS appears in the listening-socket list.
# Globals: CHECK_PORTS (read)
# Outputs: one "端口 <port>: 开放|关闭" line per port
# Returns: 0 when every port is listening; 1 when at least one is not, or when
#          the socket list could not be obtained
check_ports() {
    log_info "检查端口状态"

    echo "端口状态"
    echo "========"

    # Capture the socket list once instead of once per port. Matching against
    # a variable also avoids `netstat | grep -q`, where grep quitting early can
    # kill netstat with SIGPIPE and, under pipefail, make an open port look
    # closed. ss is tried when netstat is missing or prints nothing.
    local listening
    if ! listening=$(netstat -tuln 2>/dev/null) || [[ -z "$listening" ]]; then
        listening=$(ss -tuln 2>/dev/null) || listening=""
    fi

    if [[ -z "$listening" ]]; then
        log_error "无法获取端口信息(需要 netstat 或 ss)"
        return 1
    fi

    local failed=0
    local port

    for port in "${CHECK_PORTS[@]}"; do
        # The trailing whitespace keeps ":22" from matching ":220".
        if grep -q ":${port}[[:space:]]" <<< "$listening"; then
            echo "端口 $port: 开放"
        else
            echo "端口 $port: 关闭"
            log_warning "端口未开放: $port"
            failed=$((failed + 1))
        fi
    done

    if [ "$failed" -gt 0 ]; then
        return 1
    fi

    log_info "所有端口状态正常"
    return 0
}

# Print an overview of the network configuration: non-loopback interfaces with
# their IPv4 addresses, the number of established connections, the default
# gateway(s) and the configured DNS servers. Purely informational.
# Outputs: the overview on stdout
# Returns: 0
check_network() {
    log_info "检查网络连接"

    echo "网络连接"
    echo "========"

    # Interfaces. Only the loopback interface named exactly "lo" is skipped; a
    # substring filter would also hide names such as "wlo1". A "@peer" suffix
    # ("eth0@if5") is display-only and must be removed before the name is
    # passed back to `ip addr show dev`.
    local interface addrs
    echo "网络接口:"
    while IFS= read -r interface; do
        [[ -n "$interface" ]] || continue
        # Several addresses on one interface are joined with spaces.
        addrs=$(ip -o -4 addr show dev "$interface" \
            | awk '{ printf "%s%s", sep, $4; sep = " " }') || addrs=""
        echo "  $interface: $addrs"
    done < <(ip -o link show \
        | awk -F': ' '{ sub(/@.*/, "", $2); if ($2 != "lo") print $2 }')

    # Established connections. grep -c prints 0 and exits 1 when there is no
    # match (or when netstat is unavailable), hence the `|| true`.
    local connections
    connections=$(netstat -an 2>/dev/null | grep -c ESTABLISHED) || true
    echo "活动连接数: ${connections:-0}"

    # Default gateway(s), space-separated when there is more than one.
    local gateway
    gateway=$(ip route \
        | awk '/default/ { printf "%s%s", sep, $3; sep = " " }') || gateway=""
    echo "默认网关: $gateway"

    # DNS servers. Only real "nameserver" directives are taken, so commented
    # lines that merely mention the word are ignored.
    local dns=""
    if [[ -r /etc/resolv.conf ]]; then
        dns=$(awk '$1 == "nameserver" { printf "%s%s", sep, $2; sep = " " }' \
            /etc/resolv.conf) || dns=""
    fi
    echo "DNS服务器: $dns"

    return 0
}

# Request every URL in CHECK_URLS and expect HTTP 200.
# Globals: CHECK_URLS (read); HTTP_TIMEOUT (read, optional) - per-request limit
#          in seconds, 10 when unset
# Outputs: one "<url>: HTTP <code> (<seconds>s)" line per URL
# Returns: 0 when every URL answered 200, 1 otherwise
check_http() {
    log_info "检查HTTP服务"

    echo "HTTP服务"
    echo "======="

    local failed=0
    local url result http_code response_time

    for url in "${CHECK_URLS[@]}"; do
        # A single request supplies both the status code and the timing, so
        # the two numbers describe the same response. --max-time keeps an
        # unresponsive endpoint from hanging the whole check.
        # When the request fails curl still prints "000" for %{http_code} and
        # exits non-zero; the exit status is ignored so that no second "000"
        # is appended to the value.
        result=$(curl -s -o /dev/null --max-time "${HTTP_TIMEOUT:-10}" \
            -w '%{http_code} %{time_total}' "$url" 2>/dev/null) || true
        read -r http_code response_time <<< "$result"

        # Defaults for the case where curl could not run at all.
        http_code=${http_code:-000}
        response_time=${response_time:-0.000}

        echo "$url: HTTP $http_code (${response_time}s)"

        if [ "$http_code" != "200" ]; then
            log_error "HTTP服务异常: $url (HTTP $http_code)"
            failed=$((failed + 1))
        fi
    done

    if [ "$failed" -gt 0 ]; then
        return 1
    fi

    log_info "HTTP服务正常"
    return 0
}

# Count processes and look for zombies.
# Outputs: the total process count and the zombie count on stdout
# Returns: 0 when there are no zombie processes, 1 otherwise
check_processes() {
    log_info "检查进程状态"

    echo "进程状态"
    echo "========"

    local states total_processes zombie_processes
    # "-o stat=" prints only the state column and, because the header text is
    # empty, no header line that would be counted as a process. One snapshot
    # serves both counts so they are consistent with each other.
    states=$(ps -eo stat=) || states=""
    total_processes=$(awk 'NF { n++ } END { print n + 0 }' <<< "$states")
    # A state beginning with "Z" marks a zombie (defunct) process.
    zombie_processes=$(awk '$1 ~ /^Z/ { n++ } END { print n + 0 }' <<< "$states")

    echo "总进程数: $total_processes"
    echo "僵尸进程: $zombie_processes"

    if [ "$zombie_processes" -gt 0 ]; then
        log_warning "发现僵尸进程: $zombie_processes"
        return 1
    fi

    log_info "进程状态正常"
    return 0
}

# Count recent journal entries of priority "err" and of priority "warning".
# At most the latest 100 entries per priority are considered (-n 100).
# Outputs: both counts on stdout
# Returns: 0 when at most 10 error entries were found, 1 otherwise
# NOTE(review): -n 100 is not bounded in time, so on a long-running host the
# error count may simply reflect the last 100 errors ever logged - consider
# adding a --since window.
check_logs() {
    log_info "检查系统日志"

    echo "系统日志"
    echo "========"

    local error_count warning_count
    # -q drops journalctl's informational lines (such as "-- No entries --")
    # and -o json emits exactly one line per entry, so the line count equals
    # the entry count even when a message spans several lines.
    # "-p err" / "-p warning" select that priority and everything more severe.
    error_count=$(journalctl -q -p err -n 100 -o json --no-pager 2>/dev/null \
        | awk 'END { print NR }') || error_count=${error_count:-0}
    warning_count=$(journalctl -q -p warning -n 100 -o json --no-pager 2>/dev/null \
        | awk 'END { print NR }') || warning_count=${warning_count:-0}

    echo "最近错误数: $error_count"
    echo "最近警告数: $warning_count"

    if [ "$error_count" -gt 10 ]; then
        log_warning "系统错误较多: $error_count"
        return 1
    fi

    log_info "系统日志正常"
    return 0
}

# Print the current time and uptime and, when timedatectl is available, verify
# that the system clock is reported as synchronized.
# Outputs: current time, uptime and the synchronization state on stdout
# Returns: 0 when the clock is synchronized or timedatectl is not installed,
#          1 when timedatectl does not report "yes"
check_time() {
    log_info "检查系统时间"

    echo "系统时间"
    echo "========"

    local current_time uptime_text
    current_time=$(date)
    # Fall back to the plain uptime line where the -p option is unsupported.
    uptime_text=$(uptime -p 2>/dev/null) || uptime_text=$(uptime) || uptime_text=""

    echo "当前时间: $current_time"
    echo "系统运行时间: $uptime_text"

    # NTP synchronization
    if command -v timedatectl &> /dev/null; then
        local ntp_status
        # Preferred: the machine-readable property, which does not depend on
        # the wording or language of the human-readable status output.
        ntp_status=$(timedatectl show -p NTPSynchronized --value 2>/dev/null) \
            || ntp_status=""

        # Fallback for timedatectl versions without "show": parse the status
        # text, accepting both the "System clock synchronized" and the older
        # "NTP synchronized" label. LC_ALL=C keeps the labels in English.
        if [[ -z "$ntp_status" ]]; then
            ntp_status=$(LC_ALL=C timedatectl status 2>/dev/null \
                | awk -F': *' '/(System clock|NTP) synchronized/ && !seen { print $2; seen = 1 }') \
                || ntp_status=""
        fi

        echo "NTP同步: $ntp_status"

        if [ "$ntp_status" != "yes" ]; then
            log_warning "系统时间未同步"
            return 1
        fi
    fi

    log_info "系统时间正常"
    return 0
}

# 生成健康检查报告
generate_report() {
    log_info "生成健康检查报告"
    
    {
        echo "健康检查报告"
        echo "============"
        echo "检查时间: $(date)"
        echo "主机名: $(hostname)"
        echo "操作系统: $(cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '\"')"
        echo "内核版本: $(uname -r)"
        echo ""
        
        echo "检查结果"
        echo "========"
        echo ""
        
        check_system_load
        echo ""
        
        check_memory
        echo ""
        
        check_disk
        echo ""
        
        check_cpu
        echo ""
        
        check_services
        echo ""
        
        check_ports
        echo ""
        
        check_network
        echo ""
        
        check_http
        echo ""
        
        check_processes
        echo ""
        
        check_logs
        echo ""
        
        check_time
        echo ""
        
        echo "检查完成"
        echo "========"
        echo "报告生成时间: $(date)"
    } > "$REPORT_FILE"
    
    log_info "健康检查报告已生成: $REPORT_FILE"
}

# Raise an alert for the given message.
# Arguments: $1 - alert text
# Outputs:   two ERROR-level log lines (announcement, then the alert text)
send_alert() {
    local message="${1}"

    log_error "发送告警: ${message}"

    # Hook for real notification channels (mail, SMS, ...), for example:
    #   echo "$message" | mail -s "Health check alert" admin@example.com

    log_error "告警消息: ${message}"
}

# Run every check, write the report and raise an alert when anything failed.
# Outputs: the output of each check, plus log lines
# Returns: 0 when every check passed, 1 otherwise
run_full_check() {
    log_info "开始完整健康检查"

    local failed=0
    local check

    # Each check is called on the left of `||`, so a non-zero status is counted
    # instead of aborting the script under `set -e`.
    for check in \
        check_system_load \
        check_memory \
        check_disk \
        check_cpu \
        check_services \
        check_ports \
        check_network \
        check_http \
        check_processes \
        check_logs \
        check_time; do
        "$check" || failed=$((failed + 1))
    done

    # Report generation is guarded as well: calling it in a condition keeps a
    # non-zero status inside generate_report from terminating the script under
    # `set -e` before the alert below can be sent.
    if ! generate_report; then
        log_warning "健康检查报告生成失败: $REPORT_FILE"
    fi

    if [ "$failed" -gt 0 ]; then
        send_alert "健康检查发现 $failed 个问题"
        return 1
    fi

    log_info "健康检查完成,所有检查项正常"
    return 0
}

# Print the usage text (options, check items and examples) to stdout.
show_help() {
    # Unquoted here-document: $0 expands to the script name.
    cat <<EOF
用法: $0 [选项] [检查项]

选项:
  -r <文件>        报告文件(默认: /tmp/health_check_report.txt)
  -h               显示帮助信息

检查项:
  all               执行完整检查(默认)
  load             检查系统负载
  memory           检查内存使用
  disk             检查磁盘使用
  cpu              检查CPU使用
  services         检查服务状态
  ports            检查端口状态
  network          检查网络连接
  http             检查HTTP服务
  processes        检查进程状态
  logs             检查系统日志
  time             检查系统时间

示例:
  $0
  $0 all
  $0 load
  $0 services
EOF
}

# Entry point: parse the command-line options, then run the requested check.
# Arguments: [-r report_file] [-h] [check_item]   (check_item defaults to "all")
# Globals:   REPORT_FILE (written when -r is given)
# Returns:   the status of the selected check; exits 0 after -h and exits 1 on
#            an invalid option, a missing option argument or an unknown item
main() {
    local opt

    # The leading ":" selects getopts' silent mode: the offending option
    # letter is delivered in OPTARG, so the error can name it instead of
    # reporting a bare "?".
    while getopts ":r:h" opt; do
        case "$opt" in
            r)
                REPORT_FILE="$OPTARG"
                log_info "报告文件: $REPORT_FILE"
                ;;
            h)
                show_help
                exit 0
                ;;
            :)
                # Option that requires an argument was given without one.
                log_error "选项 -$OPTARG 缺少参数"
                show_help
                exit 1
                ;;
            *)
                log_error "无效选项: -$OPTARG"
                show_help
                exit 1
                ;;
        esac
    done

    shift $((OPTIND - 1))

    # First remaining argument selects the check; default is the full run.
    local check_item=${1:-all}

    case "$check_item" in
        all)       run_full_check ;;
        load)      check_system_load ;;
        memory)    check_memory ;;
        disk)      check_disk ;;
        cpu)       check_cpu ;;
        services)  check_services ;;
        ports)     check_ports ;;
        network)   check_network ;;
        http)      check_http ;;
        processes) check_processes ;;
        logs)      check_logs ;;
        time)      check_time ;;
        *)
            log_error "无效的检查项: $check_item"
            show_help
            exit 1
            ;;
    esac
}

# Run main with all command-line arguments passed through unchanged.
main "$@"

## 使用说明

  1. 添加执行权限:

    chmod +x health_check.sh
  2. 基本用法:

    # 执行完整检查
    ./health_check.sh
    
    # 检查系统负载
    ./health_check.sh load
    
    # 检查内存使用
    ./health_check.sh memory
    
    # 检查服务状态
    ./health_check.sh services
  3. 高级用法:

    # 指定报告文件
    ./health_check.sh -r /tmp/custom_report.txt
    
    # 检查网络连接
    ./health_check.sh network
    
    # 检查HTTP服务
    ./health_check.sh http

## 功能特点

  • 系统负载检查
  • 内存使用检查
  • 磁盘使用检查
  • CPU使用检查
  • 服务状态检查
  • 端口状态检查
  • 网络连接检查
  • HTTP服务检查
  • 进程状态检查
  • 系统日志检查
  • 系统时间检查
  • 详细检查报告
  • 告警功能

## 依赖项

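  • bc: 用于数值计算
  • curl: 用于HTTP检查
  • netstat: 用于网络检查(由 net-tools 软件包提供)
  • systemctl: 用于服务检查
  • ip: 用于网络接口和路由检查(由 iproute2 软件包提供)
  • journalctl、timedatectl: 用于系统日志和时间同步检查
  • top、free、df、nproc、awk: 用于CPU、内存、磁盘和负载检查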

## 注意事项

  1. 某些检查需要root权限(例如默认日志文件 /var/log/health_check.log 通常只有 root 可写)
  2. 告警功能需要配置邮件等
  3. 检查间隔建议不要太频繁
  4. 报告文件路径需要有写权限
  5. 可以根据实际情况调整阈值