ci: 增强部署诊断能力
Some checks failed
Deploy to Production / Build and Test (push) Has been cancelled
Deploy to Production / Deploy to Server (push) Has been cancelled

- 添加容器状态和端口占用检查
- 添加容器内部监听情况诊断
- 增加详细的健康检查日志(100行)
- 健康检查重试次数从5次增加到10次
- 第5次失败时执行深度诊断
- 添加独立的部署健康检查脚本

改进点:
1. 诊断端口冲突问题
2. 检查容器内部监听配置
3. 增加详细的错误日志输出
4. SSH回连获取实时状态
This commit is contained in:
你的用户名
2025-11-04 21:23:33 +08:00
parent c5dd72c68c
commit faafcf926a
2 changed files with 157 additions and 14 deletions

View File

@@ -127,15 +127,35 @@ jobs:
echo "⏳ 等待服务启动..."
sleep 10
# 检查容器状态
# 1. 检查容器状态
echo "📊 容器状态:"
sudo docker-compose ps
# 检查容器日志
echo "📝 容器日志:"
sudo docker-compose logs --tail=20
# 2. 检查端口占用情况
echo ""
echo "🔍 检查端口8080占用:"
sudo lsof -i :8080 || echo "端口8080未被占用"
# 3. 检查容器内部监听情况
echo ""
echo "🔍 检查容器内部监听:"
CONTAINER_ID=$(sudo docker-compose ps -q kt-financial 2>/dev/null || echo "")
if [ -n "$CONTAINER_ID" ]; then
sudo docker exec $CONTAINER_ID ss -tlnp | grep ':80' || echo "容器内无80端口监听"
fi
# 4. 检查容器详细日志(增加行数)
echo ""
echo "📝 容器日志最近100行:"
sudo docker-compose logs --tail=100
# 5. 检查容器健康状态
echo ""
echo "🏥 容器健康检查:"
sudo docker inspect --format='{{.State.Health.Status}}' $CONTAINER_ID 2>/dev/null || echo "未配置健康检查"
# 清理旧镜像和悬空镜像
echo ""
echo "🧹 清理旧镜像..."
sudo docker image prune -f
@@ -146,23 +166,52 @@ jobs:
run: |
echo "🔍 执行健康检查..."
# 等待服务完全启动
sleep 15
# 等待服务完全启动(延长等待时间)
sleep 20
# 健康检查
for i in {1..5}; do
echo "尝试 $i/5: 检查服务..."
# 健康检查(增加重试次数和诊断信息)
for i in {1..10}; do
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "尝试 $i/10: 检查服务 ${{ env.HEALTH_CHECK_URL }}"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
if curl -f -s -o /dev/null -w "%{http_code}" ${{ env.HEALTH_CHECK_URL }} | grep -q "200\|301\|302"; then
echo "✅ 服务健康检查通过!"
# 详细的curl诊断
HTTP_CODE=$(curl -v -s -o /dev/null -w "%{http_code}" --connect-timeout 5 --max-time 10 ${{ env.HEALTH_CHECK_URL }} 2>&1)
echo "响应: $HTTP_CODE"
if echo "$HTTP_CODE" | grep -q "200\|301\|302"; then
echo "✅ 服务健康检查通过HTTP状态码正常"
echo ""
echo "🎉 部署成功!服务已正常运行"
exit 0
fi
echo "⏳ 等待5秒后重试..."
sleep 5
# 如果失败,显示更多诊断信息
if [ $i -eq 5 ]; then
echo ""
echo "⚠️ 第5次尝试失败执行深度诊断..."
echo ""
echo "🔍 检查容器运行状态:"
ssh -o StrictHostKeyChecking=no ${{ secrets.SERVER_USER || 'atai' }}@${{ secrets.SERVER_HOST || '172.16.74.149' }} "cd /home/atai/kt-financial-system && sudo docker-compose ps" || true
echo ""
echo "📝 最新容器日志:"
ssh -o StrictHostKeyChecking=no ${{ secrets.SERVER_USER || 'atai' }}@${{ secrets.SERVER_HOST || '172.16.74.149' }} "cd /home/atai/kt-financial-system && sudo docker-compose logs --tail=50" || true
fi
if [ $i -lt 10 ]; then
echo "⏳ 等待6秒后重试..."
sleep 6
fi
done
echo "❌ 健康检查失败,服务可能未正常启动"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "❌ 健康检查失败10次尝试均未成功"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "🔍 最终诊断信息:"
ssh -o StrictHostKeyChecking=no ${{ secrets.SERVER_USER || 'atai' }}@${{ secrets.SERVER_HOST || '172.16.74.149' }} "cd /home/atai/kt-financial-system && sudo docker-compose ps && echo '---' && sudo docker-compose logs --tail=100" || true
exit 1
- name: Send notification on success

94
scripts/check-deployment.sh Executable file
View File

@@ -0,0 +1,94 @@
#!/bin/bash
# Deployment health check for the KT financial system.
#
# Runs four checks, in order, against http://TARGET_HOST:TARGET_PORT:
#   1. host reachability (ping)
#   2. TCP port connectivity (nc, retried up to MAX_RETRIES times)
#   3. HTTP status code of "/" (curl)
#   4. total response time of "/" (curl)
#
# Optional environment overrides (defaults are the previously hard-coded
# values, so running the script with no environment set behaves as before):
#   TARGET_HOST     host to probe                      (default 172.16.74.149)
#   TARGET_PORT     TCP port to probe                  (default 8080)
#   MAX_RETRIES     number of port-connection attempts (default 10)
#   RETRY_INTERVAL  seconds to sleep between attempts  (default 5)
#
# Exit status:
#   0  all checks ran (an unexpected HTTP status or a slow response only warns)
#   1  host unreachable, port never opened, or no HTTP response was received

set -u

readonly TARGET_HOST="${TARGET_HOST:-172.16.74.149}"
readonly TARGET_PORT="${TARGET_PORT:-8080}"
readonly MAX_RETRIES="${MAX_RETRIES:-10}"
readonly RETRY_INTERVAL="${RETRY_INTERVAL:-5}"
readonly BASE_URL="http://${TARGET_HOST}:${TARGET_PORT}"

echo "================================================"
echo "KT财务系统部署健康检查"
echo "================================================"
echo ""
echo "🔍 检查目标: ${BASE_URL}"
echo ""

# 1. Network reachability check.
echo "1⃣ 检查网络连通性..."
if ping -c 3 "$TARGET_HOST" > /dev/null 2>&1; then
  echo " ✅ 主机 $TARGET_HOST 可达"
else
  echo " ❌ 主机 $TARGET_HOST 不可达"
  exit 1
fi

# 2. TCP port check with retries.
echo ""
echo "2⃣ 检查端口连接 (${MAX_RETRIES}次重试)..."
PORT_OPEN=false
for ((i = 1; i <= MAX_RETRIES; i++)); do
  echo " 尝试 $i/$MAX_RETRIES..."
  # Decide on nc's exit status rather than on its human-readable output:
  # the success message text differs between nc implementations.
  if nc -z -w 3 "$TARGET_HOST" "$TARGET_PORT" > /dev/null 2>&1; then
    echo " ✅ 端口 $TARGET_PORT 已开放"
    PORT_OPEN=true
    break
  fi
  # No point sleeping after the final attempt.
  if ((i < MAX_RETRIES)); then
    echo " ⏳ 等待 ${RETRY_INTERVAL}秒后重试..."
    sleep "$RETRY_INTERVAL"
  fi
done

if [[ "$PORT_OPEN" != "true" ]]; then
  echo " ❌ 端口 $TARGET_PORT 无法连接"
  echo ""
  echo "⚠️ 可能的原因:"
  echo " - Docker容器未启动"
  echo " - 端口映射配置错误"
  echo " - 防火墙阻止连接"
  echo " - 服务启动失败"
  exit 1
fi

# 3. HTTP service check.
echo ""
echo "3⃣ 检查HTTP服务..."
# --max-time bounds the whole transfer so a stalled response cannot hang
# the script; --connect-timeout alone only bounds the connection phase.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
  --connect-timeout 5 --max-time 10 "${BASE_URL}/" 2> /dev/null)
# curl prints "000" (not an empty string) when no HTTP response was
# received, so both values have to be treated as "no response".
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" ]]; then
  echo " ❌ 无法获取HTTP响应"
  exit 1
fi
echo " HTTP状态码: $HTTP_CODE"
case "$HTTP_CODE" in
  200 | 301 | 302)
    echo " ✅ HTTP服务正常"
    ;;
  *)
    # Any other status is reported as a warning only; the script continues.
    echo " ⚠️ HTTP状态码异常: $HTTP_CODE"
    ;;
esac

# 4. Response time check.
echo ""
echo "4⃣ 检查响应时间..."
RESPONSE_TIME=$(curl -s -o /dev/null -w "%{time_total}" \
  --connect-timeout 5 --max-time 10 "${BASE_URL}/" 2> /dev/null)
if [[ -n "$RESPONSE_TIME" ]]; then
  echo " 响应时间: ${RESPONSE_TIME}"
  # time_total is a fractional number of seconds, which shell arithmetic
  # cannot compare. awk does the floating-point comparison; the value is
  # passed with -v and coerced with "+ 0" instead of being interpolated
  # into the awk program text.
  if awk -v t="$RESPONSE_TIME" 'BEGIN { exit !(t + 0 < 3) }'; then
    echo " ✅ 响应时间正常"
  else
    echo " ⚠️ 响应时间较慢"
  fi
fi

echo ""
echo "================================================"
echo "✅ 部署健康检查完成"
echo "================================================"
echo ""
echo "🌐 访问地址: ${BASE_URL}"