# 健康检查脚本
## 脚本说明
健康检查脚本用于检查系统、服务、网络等各方面的健康状况,并提供详细的检查报告。
## 脚本代码
#!/bin/bash
# Health check script
# Purpose: check the health of the system, services, network, etc.
# Author: System Admin
# Date: 2024-01-01
# Strict mode: stop on an unhandled command failure (-e), on use of an unset
# variable (-u), and treat a pipeline as failed when any stage fails (pipefail).
set -euo pipefail
# Configuration variables
# File that log() appends every log line to.
LOG_FILE="/var/log/health_check.log"
# Default report destination; main() overwrites it when -r <file> is given.
REPORT_FILE="/tmp/health_check_report.txt"
# Percentage above which memory, disk and CPU usage are reported as too high.
ALERT_THRESHOLD=80
# Service units inspected by check_services via systemctl.
CHECK_SERVICES=("nginx" "mysql" "redis" "ssh")
# Ports that check_ports expects to be listening.
CHECK_PORTS=(80 443 22 3306)
# URLs that check_http expects to answer with HTTP 200.
CHECK_URLS=("http://localhost/health" "http://localhost/api/status")
# Color definitions. Single quotes keep the backslash sequences as literal
# text; they only become escape codes if printed with echo -e / printf '%b'.
# NOTE(review): none of these are referenced in the visible part of this file.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
NC='\033[0m'
#######################################
# Write one timestamped log line to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1 - severity label (e.g. INFO); remaining args - message words
# Outputs:   "[YYYY-mm-dd HH:MM:SS] [LEVEL] message" on stdout
# Returns:   0, even when the log file cannot be written
#######################################
log() {
  local level=$1
  shift
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # "$*" joins the message words into a single string; "$@" embedded in a
  # larger quoted string would expand to several separate arguments.
  local entry="[$timestamp] [$level] $*"
  printf '%s\n' "$entry"
  # Best effort on purpose: an unwritable LOG_FILE (e.g. running without root)
  # must not fail the caller, because with `set -e` and pipefail active that
  # would terminate the whole health check at its first log line.
  { printf '%s\n' "$entry" >>"$LOG_FILE"; } 2>/dev/null || true
}
# Log a message at INFO severity; every argument is handed to log unchanged.
log_info() {
  local -r severity="INFO"
  log "$severity" "$@"
}
# Log a message at ERROR severity; every argument is handed to log unchanged.
log_error() {
  local -r severity="ERROR"
  log "$severity" "$@"
}
# Log a message at WARNING severity; every argument is handed to log unchanged.
log_warning() {
  local -r severity="WARNING"
  log "$severity" "$@"
}
#######################################
# Check the system load averages against 80% of the CPU count.
# Globals:   none
# Arguments: none
# Outputs:   the 1/5/15-minute load averages on stdout, plus log lines
# Returns:   0 when the 1-minute load is within the limit, 1 when it is
#            above the limit or the load could not be determined
#######################################
check_system_load() {
  log_info "检查系统负载"
  local load_1min="" load_5min="" load_15min=""
  # uptime separates the averages with ", "; the commas are turned into
  # blanks so each value is a plain number (a trailing comma makes any numeric
  # comparison fail). LC_ALL=C keeps "." as the decimal separator.
  read -r load_1min load_5min load_15min < <(
    LC_ALL=C uptime | awk -F'load average: *' '{ gsub(/,/, " ", $2); print $2 }'
  ) || true
  echo "系统负载"
  echo "========"
  echo "1分钟负载: $load_1min"
  echo "5分钟负载: $load_5min"
  echo "15分钟负载: $load_15min"
  if ! [[ "$load_1min" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    log_error "无法获取系统负载"
    return 1
  fi
  local cpu_count
  cpu_count=$(nproc 2>/dev/null) || cpu_count=1
  # Floating-point comparison is delegated to awk, which the function already
  # relies on for parsing; exit status 0 means "load is above the limit".
  if LC_ALL=C awk -v load="$load_1min" -v cpus="$cpu_count" \
    'BEGIN { exit !(load + 0 > cpus * 0.8) }'; then
    log_warning "系统负载过高: $load_1min"
    return 1
  fi
  log_info "系统负载正常"
  return 0
}
#######################################
# Check memory usage (used / total from `free -m`) against ALERT_THRESHOLD.
# Globals:   ALERT_THRESHOLD (read) - integer percentage limit
# Arguments: none
# Outputs:   total/used/free megabytes and the usage percentage on stdout
# Returns:   0 when usage is within the limit, 1 when it is above the limit
#            or the memory figures could not be read
#######################################
check_memory() {
  log_info "检查内存使用"
  local total="" used="" free_mb=""
  # LC_ALL=C pins the row label to "Mem:"; localized builds of free translate it.
  read -r total used free_mb < <(
    LC_ALL=C free -m | awk '/^Mem:/ { print $2, $3, $4 }'
  ) || true
  # Refuse to divide by an empty or zero total instead of silently passing.
  if ! [[ "$total" =~ ^[0-9]+$ && "$used" =~ ^[0-9]+$ ]] || ((total == 0)); then
    log_error "无法获取内存信息"
    return 1
  fi
  # Integer arithmetic in hundredths of a percent: two decimals, truncated.
  local hundredths=$((used * 10000 / total))
  local usage_percent
  printf -v usage_percent '%d.%02d' "$((hundredths / 100))" "$((hundredths % 100))"
  echo "内存使用"
  echo "========"
  echo "总内存: ${total}MB"
  echo "已使用: ${used}MB"
  echo "空闲: ${free_mb}MB"
  echo "使用率: ${usage_percent}%"
  if ((hundredths > ALERT_THRESHOLD * 100)); then
    log_warning "内存使用率过高: ${usage_percent}%"
    return 1
  fi
  log_info "内存使用正常"
  return 0
}
#######################################
# Check the usage of every mounted filesystem against ALERT_THRESHOLD.
# Globals:   ALERT_THRESHOLD (read) - integer percentage limit
# Arguments: none
# Outputs:   one df line per inspected filesystem on stdout, plus log lines
# Returns:   0 when every filesystem is within the limit, 1 otherwise
#######################################
check_disk() {
  log_info "检查磁盘使用"
  echo "磁盘使用"
  echo "========"
  local failed=0
  local line usage mount
  # The loop reads from a process substitution rather than a pipeline so it
  # runs in the current shell and `failed` is still visible afterwards.
  # -P keeps each filesystem on one line; LC_ALL=C pins the header text.
  while IFS= read -r line; do
    # Same filter as before: header line, tmpfs and cdrom entries are skipped.
    if [[ "$line" == Filesystem* || "$line" == *tmpfs* || "$line" == *cdrom* ]]; then
      continue
    fi
    echo "$line"
    # Fields: filesystem, size, used, available, use%, mount point (the last
    # variable takes the rest of the line, so mount points may contain spaces).
    read -r _ _ _ _ usage mount <<<"$line"
    usage=${usage%\%}
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if ((10#$usage > ALERT_THRESHOLD)); then
      log_warning "磁盘使用率过高: $mount ${usage}%"
      failed=$((failed + 1))
    fi
  done < <(LC_ALL=C df -hP)
  if ((failed > 0)); then
    return 1
  fi
  log_info "磁盘使用正常"
  return 0
}
#######################################
# Check the user-space CPU percentage reported by top against ALERT_THRESHOLD.
# Globals:   ALERT_THRESHOLD (read) - percentage limit
# Arguments: none
# Outputs:   the CPU usage figure on stdout, plus log lines
# Returns:   0 when usage is within the limit, 1 when it is above the limit
#            or the figure could not be parsed
# NOTE(review): as before, only the "us" (user) share is measured; system,
# iowait etc. are not included - confirm this is the intended metric.
#######################################
check_cpu() {
  log_info "检查CPU使用"
  local cpu_usage
  # The number in front of "us" is extracted by pattern instead of by field
  # position, so "%Cpu(s):100.0 us" (no blank after the colon) and the older
  # "Cpu(s):  1.2%us," layout both parse. No `exit` inside awk: closing the
  # pipe early could fail the pipeline under pipefail.
  cpu_usage=$(
    LC_ALL=C top -bn1 | awk '
      !found && /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*us/) {
        value = substr($0, RSTART, RLENGTH)
        sub(/[ %]*us$/, "", value)
        print value
        found = 1
      }'
  ) || cpu_usage=""
  echo "CPU使用"
  echo "======="
  echo "CPU使用率: ${cpu_usage}%"
  if [[ -z "$cpu_usage" ]]; then
    log_error "无法获取CPU使用率"
    return 1
  fi
  # awk does the floating-point comparison; status 0 means "above the limit".
  if LC_ALL=C awk -v usage="$cpu_usage" -v limit="$ALERT_THRESHOLD" \
    'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    log_warning "CPU使用率过高: ${cpu_usage}%"
    return 1
  fi
  log_info "CPU使用正常"
  return 0
}
#######################################
# Check that every unit in CHECK_SERVICES is active.
# Globals:   CHECK_SERVICES (read) - array of systemd unit names
# Arguments: none
# Outputs:   one "<unit>: <state> (开机启动: <enablement>)" line per unit
# Returns:   0 when all units are active, 1 otherwise
#######################################
check_services() {
  log_info "检查服务状态"
  echo "服务状态"
  echo "========"
  local failed=0
  local service status enabled
  for service in "${CHECK_SERVICES[@]}"; do
    # systemctl prints the state (e.g. "inactive", "disabled") AND exits
    # non-zero for anything that is not active/enabled, so the exit status is
    # ignored and "unknown" is used only when nothing was printed at all.
    status=$(systemctl is-active "$service" 2>/dev/null) || true
    enabled=$(systemctl is-enabled "$service" 2>/dev/null) || true
    status=${status:-unknown}
    enabled=${enabled:-unknown}
    echo "$service: $status (开机启动: $enabled)"
    if [[ "$status" != "active" ]]; then
      log_error "服务未运行: $service"
      failed=$((failed + 1))
    fi
  done
  if ((failed > 0)); then
    return 1
  fi
  log_info "所有服务运行正常"
  return 0
}
#######################################
# Check that every port in CHECK_PORTS has a listening TCP/UDP socket.
# Globals:   CHECK_PORTS (read) - array of port numbers
# Arguments: none
# Outputs:   one "端口 <port>: 开放|关闭" line per port on stdout
# Returns:   0 when all ports are listening, 1 when any is not or when the
#            socket list could not be obtained
#######################################
check_ports() {
  log_info "检查端口状态"
  echo "端口状态"
  echo "========"
  # The socket list is taken once instead of once per port. netstat is tried
  # first (as before); ss is the fallback for hosts without net-tools.
  local listening="" tool
  for tool in netstat ss; do
    command -v "$tool" &>/dev/null || continue
    listening=$("$tool" -tuln 2>/dev/null) || listening=""
    if [[ -n "$listening" ]]; then
      break
    fi
  done
  if [[ -z "$listening" ]]; then
    log_error "无法获取端口监听信息(需要 netstat 或 ss)"
    return 1
  fi
  local failed=0 port
  for port in "${CHECK_PORTS[@]}"; do
    # Matching on the captured text avoids `producer | grep -q`, which under
    # pipefail can report failure when grep exits before the producer is done.
    if [[ "$listening" =~ :${port}[[:space:]] ]]; then
      echo "端口 $port: 开放"
    else
      echo "端口 $port: 关闭"
      log_warning "端口未开放: $port"
      failed=$((failed + 1))
    fi
  done
  if ((failed > 0)); then
    return 1
  fi
  log_info "所有端口状态正常"
  return 0
}
#######################################
# Print a summary of the network configuration: interfaces with their IPv4
# addresses, established connection count, default gateway and DNS servers.
# Globals:   none
# Arguments: none
# Outputs:   the summary on stdout
# Returns:   0 (informational only; missing data is printed as empty)
#######################################
check_network() {
  log_info "检查网络连接"
  echo "网络连接"
  echo "========"
  # Network interfaces
  echo "网络接口:"
  local interface ip_addr
  while IFS= read -r interface; do
    [[ -n "$interface" ]] || continue
    # Several addresses on one interface are joined with blanks on one line.
    ip_addr=$(
      ip -o -4 addr show dev "$interface" 2>/dev/null |
        awk '{ printf "%s%s", sep, $4; sep = " " }'
    ) || ip_addr=""
    echo " $interface: $ip_addr"
  done < <(
    # Only the interface named exactly "lo" is skipped (a substring filter
    # would also drop names such as wlo1), and the "@peer" suffix shown for
    # veth/vlan links is removed because it is not part of the device name.
    ip -o link show | awk -F': ' '{ name = $2; sub(/@.*/, "", name); if (name != "lo") print name }'
  )
  # Established connections; grep -c prints 0 and exits 1 when nothing matches.
  local connections
  connections=$(netstat -an 2>/dev/null | grep -c ESTABLISHED) || true
  echo "活动连接数: ${connections:-0}"
  # Default gateway
  local gateway
  gateway=$(ip route | awk '$1 == "default" { print $3 }') || gateway=""
  echo "默认网关: $gateway"
  # DNS servers: only real "nameserver" directives, not commented-out ones.
  local dns=""
  if [[ -r /etc/resolv.conf ]]; then
    dns=$(awk '$1 == "nameserver" { printf "%s%s", sep, $2; sep = " " }' /etc/resolv.conf) || dns=""
  fi
  echo "DNS服务器: $dns"
}
#######################################
# Request every URL in CHECK_URLS and require an HTTP 200 answer.
# Globals:   CHECK_URLS (read)   - array of URLs to probe
#            HTTP_TIMEOUT (read) - optional per-request limit in seconds
#                                  (default 10)
# Arguments: none
# Outputs:   one "<url>: HTTP <code> (<seconds>s)" line per URL on stdout
# Returns:   0 when every URL answered 200, 1 otherwise
#######################################
check_http() {
  log_info "检查HTTP服务"
  echo "HTTP服务"
  echo "======="
  local failed=0
  local timeout="${HTTP_TIMEOUT:-10}"
  local url result http_code response_time
  for url in "${CHECK_URLS[@]}"; do
    # One request yields both the status code and the timing. curl itself
    # prints "000" when the transfer fails, so its exit status is ignored and
    # defaults are applied only when nothing was printed (e.g. curl missing).
    # --max-time keeps an unresponsive endpoint from hanging the whole check.
    result=$(curl -s -o /dev/null --max-time "$timeout" \
      -w '%{http_code} %{time_total}' "$url" 2>/dev/null) || true
    http_code=""
    response_time=""
    read -r http_code response_time <<<"$result" || true
    http_code=${http_code:-000}
    response_time=${response_time:-0.000}
    echo "$url: HTTP $http_code (${response_time}s)"
    if [[ "$http_code" != "200" ]]; then
      log_error "HTTP服务异常: $url (HTTP $http_code)"
      failed=$((failed + 1))
    fi
  done
  if ((failed > 0)); then
    return 1
  fi
  log_info "HTTP服务正常"
  return 0
}
#######################################
# Count running processes and report zombies.
# Globals:   none
# Arguments: none
# Outputs:   total and zombie process counts on stdout
# Returns:   0 when there are no zombie processes, 1 otherwise
#######################################
check_processes() {
  log_info "检查进程状态"
  echo "进程状态"
  echo "========"
  local total_processes="" zombie_processes=""
  # A single ps snapshot feeds both counters. NR > 1 skips the header line so
  # it is not counted as a process; a STAT value starting with Z is a zombie.
  read -r total_processes zombie_processes < <(
    ps aux | awk 'NR > 1 { total++; if ($8 ~ /^Z/) zombies++ }
                  END { printf "%d %d\n", total, zombies }'
  ) || true
  total_processes=${total_processes:-0}
  zombie_processes=${zombie_processes:-0}
  echo "总进程数: $total_processes"
  echo "僵尸进程: $zombie_processes"
  if ((zombie_processes > 0)); then
    log_warning "发现僵尸进程: $zombie_processes"
    return 1
  fi
  log_info "进程状态正常"
  return 0
}
#######################################
# Count recent journal lines at error and warning priority.
# Globals:   LOG_CHECK_SINCE (read) - optional journalctl --since value
#                                     (default "1 hour ago")
# Arguments: none
# Outputs:   the two counts on stdout
# Returns:   0 when at most 10 error lines were found, 1 otherwise
# NOTE(review): "-p warning" selects warning and every more severe priority,
# so the warning figure includes the error lines as well.
#######################################
check_logs() {
  log_info "检查系统日志"
  echo "系统日志"
  echo "========"
  # Without a time window "-n 100" returns the newest 100 entries no matter
  # how old they are, so a host that ever logged more than 10 errors would
  # alert on every run. -q drops journalctl's own informational lines
  # (e.g. "-- No entries --") so they are not counted.
  local since="${LOG_CHECK_SINCE:-1 hour ago}"
  local error_count warning_count
  error_count=$(journalctl -q -p err --since "$since" -n 100 --no-pager 2>/dev/null | wc -l) || true
  warning_count=$(journalctl -q -p warning --since "$since" -n 100 --no-pager 2>/dev/null | wc -l) || true
  # Some wc implementations pad the number with blanks.
  error_count=${error_count//[[:space:]]/}
  warning_count=${warning_count//[[:space:]]/}
  error_count=${error_count:-0}
  warning_count=${warning_count:-0}
  echo "最近错误数: $error_count"
  echo "最近警告数: $warning_count"
  if ((error_count > 10)); then
    log_warning "系统错误较多: $error_count"
    return 1
  fi
  log_info "系统日志正常"
  return 0
}
#######################################
# Print the current time and uptime and verify clock synchronization.
# Globals:   none
# Arguments: none
# Outputs:   current time, uptime and (when timedatectl exists) sync state
# Returns:   0 when the clock is synchronized or timedatectl is unavailable,
#            1 when timedatectl does not report "yes"
#######################################
check_time() {
  log_info "检查系统时间"
  echo "系统时间"
  echo "========"
  local current_time uptime_text
  current_time=$(date)
  # `uptime -p` is not available everywhere; an empty value is printed then.
  uptime_text=$(uptime -p 2>/dev/null) || uptime_text=""
  echo "当前时间: $current_time"
  echo "系统运行时间: $uptime_text"
  # NTP synchronization
  if command -v timedatectl &>/dev/null; then
    local ntp_status
    # LC_ALL=C pins the label text. Both spellings of the label are accepted:
    # "System clock synchronized:" and the older "NTP synchronized:".
    ntp_status=$(
      LC_ALL=C timedatectl status 2>/dev/null | awk -F': *' '
        !seen && /(System clock|NTP) synchronized:/ {
          value = $2
          sub(/[[:space:]]+$/, "", value)
          print value
          seen = 1
        }'
    ) || ntp_status=""
    echo "NTP同步: $ntp_status"
    if [[ "$ntp_status" != "yes" ]]; then
      log_warning "系统时间未同步"
      return 1
    fi
  fi
  log_info "系统时间正常"
  return 0
}
#######################################
# Run every check and write the combined output to the report file.
# Globals:   REPORT_FILE (read) - path the report is written to
# Arguments: none
# Outputs:   log lines on stdout; the report itself goes to REPORT_FILE
# Returns:   0 when the report was written, 1 when REPORT_FILE is not writable
#######################################
generate_report() {
  log_info "生成健康检查报告"
  local os_name
  os_name=$(grep '^PRETTY_NAME=' /etc/os-release 2>/dev/null |
    cut -d= -f2 | tr -d '\"') || os_name=""
  local -a checks=(
    check_system_load check_memory check_disk check_cpu check_services
    check_ports check_network check_http check_processes check_logs
    check_time
  )
  local check
  {
    echo "健康检查报告"
    echo "============"
    echo "检查时间: $(date)"
    echo "主机名: $(hostname)"
    echo "操作系统: $os_name"
    echo "内核版本: $(uname -r)"
    echo ""
    echo "检查结果"
    echo "========"
    for check in "${checks[@]}"; do
      echo ""
      # A check signals a problem by returning non-zero. That result belongs
      # in the report; it must not trip `set -e` and leave the file truncated.
      "$check" || true
    done
    echo ""
    echo "检查完成"
    echo "========"
    echo "报告生成时间: $(date)"
  } >"$REPORT_FILE" || {
    log_error "无法写入报告文件: $REPORT_FILE"
    return 1
  }
  log_info "健康检查报告已生成: $REPORT_FILE"
}
#######################################
# Raise an alert. Currently the alert is only written to the log twice.
# Arguments: $1 - alert message
# Outputs:   two ERROR log lines via log_error
#######################################
send_alert() {
  local -r alert_text="$1"
  local announce="发送告警: ${alert_text}"
  local detail="告警消息: ${alert_text}"
  log_error "$announce"
  # Extension point: add mail, SMS or other notification channels here,
  # e.g.: echo "$alert_text" | mail -s "健康检查告警" admin@example.com
  log_error "$detail"
}
#######################################
# Run every check, generate the report and alert when anything failed.
# Globals:   REPORT_FILE (read) - only used in the warning message
# Arguments: none
# Outputs:   the output of every check plus log lines on stdout
# Returns:   0 when all checks passed, 1 when at least one failed
# NOTE(review): generate_report executes the checks again on its own, so a
# full run performs each check twice.
#######################################
run_full_check() {
  log_info "开始完整健康检查"
  local failed=0
  local check
  # Run each check; a non-zero return counts as one failed item.
  for check in check_system_load check_memory check_disk check_cpu \
    check_services check_ports check_network check_http check_processes \
    check_logs check_time; do
    "$check" || failed=$((failed + 1))
  done
  # Generate the report. Placing the call on the left of `||` suspends
  # `set -e` inside it, so a failing check during report generation cannot
  # terminate the script before the alert below has been sent.
  generate_report || log_warning "健康检查报告生成失败: $REPORT_FILE"
  if ((failed > 0)); then
    send_alert "健康检查发现 $failed 个问题"
    return 1
  fi
  log_info "健康检查完成,所有检查项正常"
  return 0
}
#######################################
# Print the usage text: options, available check items and examples.
# Arguments: none
# Outputs:   the help text on stdout ($0 is expanded to the script name)
#######################################
show_help() {
  cat <<EOF
用法: $0 [选项] [检查项]

选项:
 -r <文件> 报告文件(默认: /tmp/health_check_report.txt)
 -h 显示帮助信息

检查项:
 all 执行完整检查(默认)
 load 检查系统负载
 memory 检查内存使用
 disk 检查磁盘使用
 cpu 检查CPU使用
 services 检查服务状态
 ports 检查端口状态
 network 检查网络连接
 http 检查HTTP服务
 processes 检查进程状态
 logs 检查系统日志
 time 检查系统时间

示例:
 $0
 $0 all
 $0 load
 $0 services
EOF
}
#######################################
# Entry point: parse the options, then run the requested check item.
# Globals:   REPORT_FILE (written when -r is given)
# Arguments: [-r <file>] [-h] [check item]; the item defaults to "all"
# Returns:   the status of the selected check; exits 0 after -h and 1 on an
#            invalid option or an unknown check item
#######################################
main() {
  # Option parsing
  local flag
  while getopts "r:h" flag; do
    case "$flag" in
      h)
        show_help
        exit 0
        ;;
      r)
        REPORT_FILE="$OPTARG"
        log_info "报告文件: $REPORT_FILE"
        ;;
      *)
        log_error "无效选项: $flag"
        show_help
        exit 1
        ;;
    esac
  done
  shift $((OPTIND - 1))
  # First remaining argument names the check item.
  local check_item=${1:-all}
  # Map the item to its function name, then invoke it as the last command so
  # its status becomes the status of main.
  local handler
  case "$check_item" in
    all)
      handler=run_full_check
      ;;
    load)
      handler=check_system_load
      ;;
    memory | disk | cpu | services | ports | network | http | processes | logs | time)
      handler="check_${check_item}"
      ;;
    *)
      log_error "无效的检查项: $check_item"
      show_help
      exit 1
      ;;
  esac
  "$handler"
}
# 执行主函数
main "$@"

## 使用说明
添加执行权限:
chmod +x health_check.sh
基本用法:
# 执行完整检查
./health_check.sh
# 检查系统负载
./health_check.sh load
# 检查内存使用
./health_check.sh memory
# 检查服务状态
./health_check.sh services
高级用法:
# 指定报告文件
./health_check.sh -r /tmp/custom_report.txt
# 检查网络连接
./health_check.sh network
# 检查HTTP服务
./health_check.sh http
## 功能特点
- 系统负载检查
- 内存使用检查
- 磁盘使用检查
- CPU使用检查
- 服务状态检查
- 端口状态检查
- 网络连接检查
- HTTP服务检查
- 进程状态检查
- 系统日志检查
- 系统时间检查
- 详细检查报告
- 告警功能
## 依赖项
- bc: 用于数值计算
- curl: 用于HTTP检查
- netstat: 用于网络检查
- systemctl: 用于服务检查
## 注意事项
- 某些检查需要root权限
- 告警功能需要配置邮件等
- 检查间隔建议不要太频繁
- 报告文件路径需要有写权限
- 可以根据实际情况调整阈值