diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml
index abec937..d61ac1c 100644
--- a/.gitea/workflows/deploy.yaml
+++ b/.gitea/workflows/deploy.yaml
@@ -49,28 +49,57 @@ jobs:
         id: build_backend
         run: |
           set -o pipefail
-          DOCKER_BUILDKIT=0 docker build \
-            --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} \
-            --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest \
-            ./backend 2>&1 | tee /tmp/build.log
-          docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }}
-          docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest
+          # Retry transient failures up to 3 times. The explicit exit on the last attempt
+          # is needed because the loop would otherwise end on a successful sleep and pass.
+          for attempt in 1 2 3; do
+            echo "Build backend attempt $attempt/3..."
+            DOCKER_BUILDKIT=0 docker build \
+              --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} \
+              --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest \
+              ./backend 2>&1 | tee /tmp/build.log && break
+            [ "$attempt" -lt 3 ] || { echo "Backend build failed after 3 attempts" >&2; exit 1; }
+            echo "Attempt $attempt failed, retrying in 10s..." && sleep 10
+          done
+          # Push both tags with the same retry policy.
+          for attempt in 1 2 3; do
+            docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} && \
+              docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-backend:latest && break
+            [ "$attempt" -lt 3 ] || { echo "Backend push failed after 3 attempts" >&2; exit 1; }
+            echo "Push attempt $attempt failed, retrying in 10s..." && sleep 10
+          done
 
       - name: Build and Push Web
         id: build_web
         run: |
           set -o pipefail
-          DOCKER_BUILDKIT=0 docker build \
-            --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} \
-            --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest \
-            ./web 2>&1 | tee -a /tmp/build.log
-          docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }}
-          docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest
+          # Retry transient failures up to 3 times. The explicit exit on the last attempt
+          # is needed because the loop would otherwise end on a successful sleep and pass.
+          for attempt in 1 2 3; do
+            echo "Build web attempt $attempt/3..."
+            DOCKER_BUILDKIT=0 docker build \
+              --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} \
+              --tag ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest \
+              ./web 2>&1 | tee -a /tmp/build.log && break
+            [ "$attempt" -lt 3 ] || { echo "Web build failed after 3 attempts" >&2; exit 1; }
+            echo "Attempt $attempt failed, retrying in 10s..." && sleep 10
+          done
+          # Push both tags with the same retry policy.
+          for attempt in 1 2 3; do
+            docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} && \
+              docker push ${{ env.CR_SERVER_ACTIVE }}/${{ env.CR_ORG }}/video-web:latest && break
+            [ "$attempt" -lt 3 ] || { echo "Web push failed after 3 attempts" >&2; exit 1; }
+            echo "Push attempt $attempt failed, retrying in 10s..." && sleep 10
+          done
 
       - name: Setup Kubectl
         run: |
           if ! command -v kubectl &>/dev/null; then
-            curl -LO "https://mirrors.aliyun.com/kubernetes/kubectl/v1.28.0/bin/linux/amd64/kubectl"
+            # -f makes curl fail on HTTP errors instead of saving the error page as "kubectl".
+            for attempt in 1 2 3; do
+              curl -fLO "https://files.m.daocloud.io/dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" && break
+              [ "$attempt" -lt 3 ] || { echo "kubectl download failed after 3 attempts" >&2; exit 1; }
+              echo "Download attempt $attempt failed, retrying in 5s..." && sleep 5
+            done
             chmod +x kubectl && mv kubectl /usr/local/bin/
           fi
           kubectl version --client
@@ -79,11 +108,14 @@ jobs:
         run: |
           mkdir -p $HOME/.kube
           if [[ "${{ github.ref_name }}" == "master" ]]; then
-            echo "${{ secrets.VOLCANO_PROD_KUBE_CONFIG }}" > $HOME/.kube/config
+            printf '%s\n' '${{ secrets.VOLCANO_PROD_KUBE_CONFIG }}' > $HOME/.kube/config
           elif [[ "${{ github.ref_name }}" == "dev" ]]; then
-            echo "${{ secrets.VOLCANO_TEST_KUBE_CONFIG }}" > $HOME/.kube/config
+            printf '%s\n' '${{ secrets.VOLCANO_TEST_KUBE_CONFIG }}' > $HOME/.kube/config
           fi
           chmod 600 $HOME/.kube/config
+          # Sanity-check the file without printing the API server address into the job log.
+          echo "kubeconfig lines: $(wc -l < $HOME/.kube/config)"
+          grep -q server $HOME/.kube/config || echo "WARNING: no server found in kubeconfig"
 
       - name: Deploy to K3s
         id: deploy
@@ -113,38 +145,48 @@ jobs:
           sed -i "s|redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0|${{ env.REDIS_URL }}|g" k8s/backend-deployment.yaml
           sed -i "s|redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0|${{ env.REDIS_URL }}|g" k8s/celery-deployment.yaml
 
-          # Create/update image pull secret for CR
-          kubectl create secret docker-registry cr-pull-secret \
-            --docker-server="${{ env.CR_SERVER_ACTIVE }}" \
-            --docker-username="${{ env.CR_USERNAME_ACTIVE }}" \
-            --docker-password="${{ env.CR_PASSWORD_ACTIVE }}" \
-            --dry-run=client -o yaml | kubectl apply -f -
+          # All kubectl operations with retry (the K3s intranet connection can be flaky).
+          # pipefail makes the `{ ... } | tee` pipeline report the group's status, and the
+          # commands are chained with && because bash ignores `set -e` inside a group that
+          # is followed by `&& break`; without both, a failed kubectl call goes unnoticed.
+          set -o pipefail
+          for attempt in 1 2 3; do
+            echo "Deploy attempt $attempt/3..."
+            {
+              # Create/update image pull secret for CR
+              kubectl create secret docker-registry cr-pull-secret \
+                --docker-server="${{ env.CR_SERVER_ACTIVE }}" \
+                --docker-username="${{ env.CR_USERNAME_ACTIVE }}" \
+                --docker-password="${{ env.CR_PASSWORD_ACTIVE }}" \
+                --dry-run=client -o yaml | kubectl apply -f - &&
 
-          # Create/update secrets (业务密钥,DB 已写在 yaml 里)
-          kubectl create secret generic video-backend-secrets \
-            --from-literal=ARK_API_KEY='${{ secrets.ARK_API_KEY }}' \
-            --from-literal=TOS_ACCESS_KEY='${{ secrets.TOS_ACCESS_KEY }}' \
-            --from-literal=TOS_SECRET_KEY='${{ secrets.TOS_SECRET_KEY }}' \
-            --from-literal=DJANGO_SECRET_KEY='${{ secrets.DJANGO_SECRET_KEY }}' \
-            --from-literal=ALIYUN_SMS_ACCESS_KEY='${{ secrets.ALIYUN_SMS_ACCESS_KEY }}' \
-            --from-literal=ALIYUN_SMS_ACCESS_SECRET='${{ secrets.ALIYUN_SMS_ACCESS_SECRET }}' \
-            --dry-run=client -o yaml | kubectl apply -f -
+              # Create/update secrets (business secrets; DB settings already live in the yaml)
+              kubectl create secret generic video-backend-secrets \
+                --from-literal=ARK_API_KEY='${{ secrets.ARK_API_KEY }}' \
+                --from-literal=TOS_ACCESS_KEY='${{ secrets.TOS_ACCESS_KEY }}' \
+                --from-literal=TOS_SECRET_KEY='${{ secrets.TOS_SECRET_KEY }}' \
+                --from-literal=DJANGO_SECRET_KEY='${{ secrets.DJANGO_SECRET_KEY }}' \
+                --from-literal=ALIYUN_SMS_ACCESS_KEY='${{ secrets.ALIYUN_SMS_ACCESS_KEY }}' \
+                --from-literal=ALIYUN_SMS_ACCESS_SECRET='${{ secrets.ALIYUN_SMS_ACCESS_SECRET }}' \
+                --dry-run=client -o yaml | kubectl apply -f - &&
 
-          # Apply manifests
-          set -o pipefail
-          {
-            kubectl apply -f k8s/backend-deployment.yaml
-            kubectl apply -f k8s/celery-deployment.yaml
-            kubectl apply -f k8s/web-deployment.yaml
-            kubectl apply -f k8s/ingress.yaml
+              # Apply manifests
+              kubectl apply -f k8s/backend-deployment.yaml &&
+              kubectl apply -f k8s/celery-deployment.yaml &&
+              kubectl apply -f k8s/web-deployment.yaml &&
+              kubectl apply -f k8s/ingress.yaml &&
 
-            # Preserve real client IP
-            kubectl patch svc traefik -n kube-system -p '{"spec":{"externalTrafficPolicy":"Local"}}' 2>/dev/null || true
+              # Preserve real client IP (best effort: a failed patch must not abort the deploy)
+              { kubectl patch svc traefik -n kube-system -p '{"spec":{"externalTrafficPolicy":"Local"}}' 2>/dev/null || true; } &&
 
-            kubectl rollout restart deployment/video-backend
-            kubectl rollout restart deployment/celery-worker
-            kubectl rollout restart deployment/video-web
-          } 2>&1 | tee /tmp/deploy.log
+              kubectl rollout restart deployment/video-backend &&
+              kubectl rollout restart deployment/celery-worker &&
+              kubectl rollout restart deployment/video-web
+            } 2>&1 | tee /tmp/deploy.log && break
+            [ "$attempt" -lt 3 ] || { echo "Deploy failed after 3 attempts" >&2; exit 1; }
+            echo "Attempt $attempt failed, retrying in 10s..."
+            sleep 10
+          done
 
       # ===== Log Center: failure reporting =====
       - name: Report failure to Log Center
@@ -204,3 +246,13 @@ jobs:
                 \"run_url\": \"https://gitea.airlabs.art/${{ github.repository }}/actions/runs/${{ github.run_number }}\"
               }
             }" || true
+
+      # ===== Cleanup: remove unused Docker resources =====
+      - name: Docker Cleanup
+        if: always()
+        run: |
+          docker container prune -f
+          docker image prune -a -f
+          docker builder prune -a -f
+          echo "Disk usage after cleanup:"
+          df -h / | tail -1
diff --git a/backend/Dockerfile b/backend/Dockerfile
index c271d13..dedbfe7 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -30,4 +30,4 @@ RUN chmod +x /app/entrypoint.sh
 EXPOSE 8000
 
 ENTRYPOINT ["/app/entrypoint.sh"]
-CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "2", "--timeout", "120", "--access-logfile", "-", "--error-logfile", "-", "config.wsgi:application"]
+CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "2", "--worker-class", "gevent", "--worker-connections", "200", "--timeout", "120", "--access-logfile", "-", "--error-logfile", "-", "config.wsgi:application"]
diff --git a/backend/apps/generation/tasks.py b/backend/apps/generation/tasks.py
index f42832f..0448aee 100644
--- a/backend/apps/generation/tasks.py
+++ b/backend/apps/generation/tasks.py
@@ -21,19 +21,29 @@ def poll_video_task(self, record_id):
     from apps.generation.models import GenerationRecord
     from utils.airdrama_client import query_task, map_status
 
+    # De-duplicate: only one poll may run for a given record at any moment
+    from django.core.cache import cache
+    lock_key = f'poll_lock:{record_id}'
+    if not cache.add(lock_key, '1', timeout=POLL_INTERVAL * 3):
+        logger.info('poll_video_task: record %s already being polled, skipping', record_id)
+        return
+
     try:
         record = GenerationRecord.objects.get(pk=record_id)
     except GenerationRecord.DoesNotExist:
         logger.warning('poll_video_task: record %s not found', record_id)
+        cache.delete(lock_key)
         return
 
     ark_task_id = record.ark_task_id
     if not ark_task_id:
         logger.warning('poll_video_task: record %s has no ark_task_id', record_id)
+        cache.delete(lock_key)
         return
 
     if record.status not in ('queued', 'processing'):
         logger.info('poll_video_task: record %s already in terminal state: %s', record_id, record.status)
+        cache.delete(lock_key)
         return
 
     # Poll Volcano API
@@ -42,12 +52,14 @@ def poll_video_task(self, record_id):
         new_status = map_status(ark_resp.get('status', ''))
     except Exception:
         logger.exception('poll_video_task: API query failed for %s, will retry', ark_task_id)
+        cache.delete(lock_key)
         raise self.retry(countdown=POLL_INTERVAL)
 
     if new_status in ('queued', 'processing'):
         # Still running — update status, then re-enqueue
         record.status = new_status
         record.save(update_fields=['status', 'updated_at'])
+        cache.delete(lock_key)
         raise self.retry(countdown=POLL_INTERVAL)
 
     # Terminal state reached — process result
diff --git a/backend/config/settings.py b/backend/config/settings.py
index ce7004c..b9a1127 100644
--- a/backend/config/settings.py
+++ b/backend/config/settings.py
@@ -189,7 +189,7 @@ CELERY_BEAT_SCHEDULE = {
 LANGUAGE_CODE = 'zh-hans'
 TIME_ZONE = 'Asia/Shanghai'
 USE_I18N = True
-USE_TZ = True
+USE_TZ = False
 
 STATIC_URL = 'static/'
 STATIC_ROOT = BASE_DIR / 'staticfiles'