From 3f858257ea9c11567fabb048ad9c5fa85543338e Mon Sep 17 00:00:00 2001 From: seaislee1209 Date: Mon, 27 Apr 2026 17:40:42 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20v0.19.6=20CI=20deploy.yaml=20retry=20?= =?UTF-8?q?=E5=BE=AA=E7=8E=AF=E5=A4=B1=E8=B4=A5=E6=97=B6=E6=AD=A3=E7=A1=AE?= =?UTF-8?q?=20exit=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因 deploy.yaml 6 处 retry 循环用 `for ... do command && break; done` 模式, bash for 循环的 exit code 取循环体内最后执行的那条命令: 所有 attempt 都失败时, 最后执行的是重试前的 sleep (返回 0), 所以循环整体仍然 exit 0。CI 看 step exit 0 -> 误判绿色。 实际事故 v0.19.5 (85aa024) push dev 后 Gitea Actions 显示绿色对勾, 但测试服 K8s 上没有创建对应的 ReplicaSet, web pod 仍跑 v0.19.4。 查 K8s ReplicaSet 历史发现自 4-24 12:12 之后没有任何新 RS, 说明 deploy step 的 kubectl apply 没把新 image tag 提交到 etcd (或某处中间静默失败被吞)。SWR 上镜像已经推上去, 是 deploy 这步后续操作出了问题但 CI 没察觉。 修复 6 处 retry 全部加 `ok=0/ok=1/break` flag, 循环结束后 `[ $ok -eq 1 ] || exit 1` 守卫, 真失败时 step exit 非 0 -> CI 红色: - backend build (3 次) - backend push (3 次) - web build (3 次) - web push (3 次) - kubectl download (3 次) - deploy to K3s (5 次, 含 kubectl apply / rollout restart) 以后再遇到部署失败, Gitea Actions 会真正显示红色, 不再"假绿色"骗人。同时已有的 Report-failure-to-Log-Center step (if: failure()) 会被触发, 飞书 / log-center 收到告警。 Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/deploy.yaml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml index f408c57..701fcb5 100644 --- a/.gitea/workflows/deploy.yaml +++ b/.gitea/workflows/deploy.yaml @@ -49,45 +49,55 @@ jobs: id: build_backend run: | set -o pipefail + ok=0 for attempt in 1 2 3; do echo "Build backend attempt $attempt/3..."
DOCKER_BUILDKIT=0 docker build \ --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} \ --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest \ - ./backend 2>&1 | tee /tmp/build.log && break + ./backend 2>&1 | tee /tmp/build.log && { ok=1; break; } echo "Attempt $attempt failed, retrying in 10s..." && sleep 10 done + [ $ok -eq 1 ] || { echo "ERROR: backend build failed after 3 attempts"; exit 1; } + ok=0 for attempt in 1 2 3; do docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} && \ - docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest && break + docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest && { ok=1; break; } echo "Push attempt $attempt failed, retrying in 10s..." && sleep 10 done + [ $ok -eq 1 ] || { echo "ERROR: backend push failed after 3 attempts"; exit 1; } - name: Build and Push Web id: build_web run: | set -o pipefail + ok=0 for attempt in 1 2 3; do echo "Build web attempt $attempt/3..." DOCKER_BUILDKIT=0 docker build \ --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} \ --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest \ - ./web 2>&1 | tee -a /tmp/build.log && break + ./web 2>&1 | tee -a /tmp/build.log && { ok=1; break; } echo "Attempt $attempt failed, retrying in 10s..." && sleep 10 done + [ $ok -eq 1 ] || { echo "ERROR: web build failed after 3 attempts"; exit 1; } + ok=0 for attempt in 1 2 3; do docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} && \ - docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest && break + docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest && { ok=1; break; } echo "Push attempt $attempt failed, retrying in 10s..." 
&& sleep 10 done + [ $ok -eq 1 ] || { echo "ERROR: web push failed after 3 attempts"; exit 1; } - name: Setup Kubectl run: | if ! command -v kubectl &>/dev/null; then + ok=0 for attempt in 1 2 3; do - curl -LO "https://files.m.daocloud.io/dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" && break + curl -LO "https://files.m.daocloud.io/dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" && { ok=1; break; } echo "Download attempt $attempt failed, retrying in 5s..." && sleep 5 done + [ $ok -eq 1 ] || { echo "ERROR: kubectl download failed after 3 attempts"; exit 1; } chmod +x kubectl && mv kubectl /usr/bin/kubectl fi kubectl version --client @@ -135,6 +145,7 @@ jobs: # All kubectl operations with retry (K3s 内网连接可能抖动) export KUBECTL_TIMEOUT="--request-timeout=4s" + ok=0 for attempt in 1 2 3 4 5; do echo "Deploy attempt $attempt/5..." { @@ -169,10 +180,11 @@ jobs: kubectl $KUBECTL_TIMEOUT rollout restart deployment/video-backend kubectl $KUBECTL_TIMEOUT rollout restart deployment/celery-worker kubectl $KUBECTL_TIMEOUT rollout restart deployment/video-web - } 2>&1 | tee /tmp/deploy.log && break + } 2>&1 | tee /tmp/deploy.log && { ok=1; break; } echo "Attempt $attempt failed, retrying in 30s..." sleep 30 done + [ $ok -eq 1 ] || { echo "ERROR: deploy to K3s failed after 5 attempts — check /tmp/deploy.log"; exit 1; } # ===== Log Center: failure reporting ===== - name: Report failure to Log Center