Compare commits
2 Commits
a4c36e4fee
...
36ff1b5aca
| Author | SHA1 | Date | |
|---|---|---|---|
| 36ff1b5aca | |||
| 43228d255e |
@ -3,102 +3,130 @@ name: Build and Deploy
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
- dev
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
run: |
|
||||
git clone --depth=1 --branch=${{ github.ref_name }} https://gitea.airlabs.art/${{ github.repository }}.git .
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
with:
|
||||
config-inline: |
|
||||
[registry."docker.io"]
|
||||
mirrors = ["https://docker.m.daocloud.io", "https://docker.1panel.live", "https://hub.rat.dev"]
|
||||
- name: Set environment by branch
|
||||
run: |
|
||||
SHORT_SHA=$(echo "${{ github.sha }}" | cut -c1-7)
|
||||
BUILD_DATE=$(date +%Y%m%d)
|
||||
|
||||
- name: Login to Huawei Cloud SWR
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ${{ secrets.SWR_SERVER }}
|
||||
username: ${{ secrets.SWR_USERNAME }}
|
||||
password: ${{ secrets.SWR_PASSWORD }}
|
||||
if [[ "${{ github.ref_name }}" == "master" ]]; then
|
||||
echo "IMAGE_TAG=prod-${BUILD_DATE}-${SHORT_SHA}" >> $GITHUB_ENV
|
||||
echo "CR_ORG=prod" >> $GITHUB_ENV
|
||||
echo "DEPLOY_ENV=production" >> $GITHUB_ENV
|
||||
echo "DOMAIN_API=airflow-studio-api.airlabs.art" >> $GITHUB_ENV
|
||||
echo "DOMAIN_WEB=airflow-studio.airlabs.art" >> $GITHUB_ENV
|
||||
echo "REDIS_URL=redis://zyc:Zyc188208@redis-shzlf5t46gjvow7ua.redis.ivolces.com:6379/0" >> $GITHUB_ENV
|
||||
elif [[ "${{ github.ref_name }}" == "dev" ]]; then
|
||||
echo "IMAGE_TAG=dev-${BUILD_DATE}-${SHORT_SHA}" >> $GITHUB_ENV
|
||||
echo "CR_ORG=dev" >> $GITHUB_ENV
|
||||
echo "DEPLOY_ENV=development" >> $GITHUB_ENV
|
||||
echo "DOMAIN_API=airflow-studio-api.test.airlabs.art" >> $GITHUB_ENV
|
||||
echo "DOMAIN_WEB=airflow-studio.test.airlabs.art" >> $GITHUB_ENV
|
||||
echo "REDIS_URL=redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Login to Volcano Engine CR
|
||||
run: |
|
||||
echo "${{ secrets.CR_PASSWORD }}" | docker login --username "${{ secrets.CR_USERNAME }}" --password-stdin ${{ secrets.CR_SERVER }}
|
||||
|
||||
- name: Build and Push Backend
|
||||
id: build_backend
|
||||
run: |
|
||||
set -o pipefail
|
||||
docker buildx build \
|
||||
--push \
|
||||
--no-cache \
|
||||
--provenance=false \
|
||||
--tag ${{ secrets.SWR_SERVER }}/${{ secrets.SWR_ORG }}/video-backend:latest \
|
||||
DOCKER_BUILDKIT=0 docker build \
|
||||
--tag ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }} \
|
||||
--tag ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-backend:latest \
|
||||
./backend 2>&1 | tee /tmp/build.log
|
||||
docker push ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-backend:${{ env.IMAGE_TAG }}
|
||||
docker push ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-backend:latest
|
||||
|
||||
- name: Build and Push Web
|
||||
id: build_web
|
||||
run: |
|
||||
set -o pipefail
|
||||
docker buildx build \
|
||||
--push \
|
||||
--provenance=false \
|
||||
--tag ${{ secrets.SWR_SERVER }}/${{ secrets.SWR_ORG }}/video-web:latest \
|
||||
DOCKER_BUILDKIT=0 docker build \
|
||||
--tag ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }} \
|
||||
--tag ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-web:latest \
|
||||
./web 2>&1 | tee -a /tmp/build.log
|
||||
docker push ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-web:${{ env.IMAGE_TAG }}
|
||||
docker push ${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}/video-web:latest
|
||||
|
||||
- name: Setup SSH
|
||||
- name: Setup Kubectl
|
||||
run: kubectl version --client
|
||||
|
||||
- name: Set kubeconfig
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.K3S_SSH_KEY }}" > ~/.ssh/id_rsa
|
||||
chmod 600 ~/.ssh/id_rsa
|
||||
ssh-keyscan -H ${{ secrets.K3S_HOST }} >> ~/.ssh/known_hosts 2>/dev/null
|
||||
mkdir -p $HOME/.kube
|
||||
if [[ "${{ github.ref_name }}" == "master" ]]; then
|
||||
echo "${{ secrets.VOLCANO_PROD_KUBE_CONFIG }}" > $HOME/.kube/config
|
||||
elif [[ "${{ github.ref_name }}" == "dev" ]]; then
|
||||
echo "${{ secrets.VOLCANO_TEST_KUBE_CONFIG }}" > $HOME/.kube/config
|
||||
fi
|
||||
chmod 600 $HOME/.kube/config
|
||||
|
||||
- name: Deploy to K3s via SSH
|
||||
- name: Deploy to K3s
|
||||
id: deploy
|
||||
run: |
|
||||
SWR_IMAGE="${{ secrets.SWR_SERVER }}/${{ secrets.SWR_ORG }}"
|
||||
echo "Environment: ${{ env.DEPLOY_ENV }}"
|
||||
CR_IMAGE="${{ secrets.CR_SERVER }}/${{ env.CR_ORG }}"
|
||||
|
||||
# Replace image placeholders in yaml files
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-backend:latest|${SWR_IMAGE}/video-backend:latest|g" k8s/backend-deployment.yaml
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-backend:latest|${SWR_IMAGE}/video-backend:latest|g" k8s/celery-deployment.yaml
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-web:latest|${SWR_IMAGE}/video-web:latest|g" k8s/web-deployment.yaml
|
||||
# Replace image placeholders
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-backend:latest|${CR_IMAGE}/video-backend:${{ env.IMAGE_TAG }}|g" k8s/backend-deployment.yaml
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-backend:latest|${CR_IMAGE}/video-backend:${{ env.IMAGE_TAG }}|g" k8s/celery-deployment.yaml
|
||||
sed -i "s|\${CI_REGISTRY_IMAGE}/video-web:latest|${CR_IMAGE}/video-web:${{ env.IMAGE_TAG }}|g" k8s/web-deployment.yaml
|
||||
|
||||
# Copy k8s manifests to server
|
||||
scp -o StrictHostKeyChecking=no k8s/backend-deployment.yaml k8s/web-deployment.yaml k8s/ingress.yaml k8s/celery-deployment.yaml root@${{ secrets.K3S_HOST }}:/tmp/
|
||||
# Replace domain placeholders in ingress
|
||||
sed -i "s|airflow-studio-api.airlabs.art|${{ env.DOMAIN_API }}|g" k8s/ingress.yaml
|
||||
sed -i "s|airflow-studio.airlabs.art|${{ env.DOMAIN_WEB }}|g" k8s/ingress.yaml
|
||||
|
||||
# Create/update secrets and apply manifests on server
|
||||
# Replace DB config for production
|
||||
if [[ "${{ env.DEPLOY_ENV }}" == "production" ]]; then
|
||||
sed -i "s|mysql-8351f937d637.rds.ivolces.com|mysqld9bb4e81696d.rds.ivolces.com|g" k8s/backend-deployment.yaml
|
||||
sed -i "s|mysql-8351f937d637.rds.ivolces.com|mysqld9bb4e81696d.rds.ivolces.com|g" k8s/celery-deployment.yaml
|
||||
fi
|
||||
|
||||
# Replace CORS origin
|
||||
sed -i "s|https://airflow-studio.airlabs.art|https://${{ env.DOMAIN_WEB }}|g" k8s/backend-deployment.yaml
|
||||
|
||||
# Replace Redis URL by environment
|
||||
sed -i "s|redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0|${{ env.REDIS_URL }}|g" k8s/backend-deployment.yaml
|
||||
sed -i "s|redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0|${{ env.REDIS_URL }}|g" k8s/celery-deployment.yaml
|
||||
|
||||
# Create/update secrets (业务密钥,DB 已写在 yaml 里)
|
||||
kubectl create secret generic video-backend-secrets \
|
||||
--from-literal=ARK_API_KEY='${{ secrets.ARK_API_KEY }}' \
|
||||
--from-literal=TOS_ACCESS_KEY='${{ secrets.TOS_ACCESS_KEY }}' \
|
||||
--from-literal=TOS_SECRET_KEY='${{ secrets.TOS_SECRET_KEY }}' \
|
||||
--from-literal=DJANGO_SECRET_KEY='${{ secrets.DJANGO_SECRET_KEY }}' \
|
||||
--from-literal=ALIYUN_SMS_ACCESS_KEY='${{ secrets.ALIYUN_SMS_ACCESS_KEY }}' \
|
||||
--from-literal=ALIYUN_SMS_ACCESS_SECRET='${{ secrets.ALIYUN_SMS_ACCESS_SECRET }}' \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
# Apply manifests
|
||||
set -o pipefail
|
||||
ssh -o StrictHostKeyChecking=no root@${{ secrets.K3S_HOST }} << ENDSSH
|
||||
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
{
|
||||
kubectl apply -f k8s/backend-deployment.yaml
|
||||
kubectl apply -f k8s/celery-deployment.yaml
|
||||
kubectl apply -f k8s/web-deployment.yaml
|
||||
kubectl apply -f k8s/ingress.yaml
|
||||
|
||||
kubectl create secret generic video-backend-secrets \
|
||||
--from-literal=ARK_API_KEY='${{ secrets.ARK_API_KEY }}' \
|
||||
--from-literal=TOS_ACCESS_KEY='${{ secrets.TOS_ACCESS_KEY }}' \
|
||||
--from-literal=TOS_SECRET_KEY='${{ secrets.TOS_SECRET_KEY }}' \
|
||||
--from-literal=DJANGO_SECRET_KEY='${{ secrets.DJANGO_SECRET_KEY }}' \
|
||||
--from-literal=DB_HOST='${{ secrets.DB_HOST }}' \
|
||||
--from-literal=DB_USER='${{ secrets.DB_USER }}' \
|
||||
--from-literal=DB_PASSWORD='${{ secrets.DB_PASSWORD }}' \
|
||||
--from-literal=ALIYUN_SMS_ACCESS_KEY='${{ secrets.ALIYUN_SMS_ACCESS_KEY }}' \
|
||||
--from-literal=ALIYUN_SMS_ACCESS_SECRET='${{ secrets.ALIYUN_SMS_ACCESS_SECRET }}' \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
kubectl apply -f /tmp/backend-deployment.yaml
|
||||
kubectl apply -f /tmp/celery-deployment.yaml
|
||||
kubectl apply -f /tmp/web-deployment.yaml
|
||||
kubectl apply -f /tmp/ingress.yaml
|
||||
|
||||
# Preserve real client IP: disable SNAT on Traefik
|
||||
# Preserve real client IP
|
||||
kubectl patch svc traefik -n kube-system -p '{"spec":{"externalTrafficPolicy":"Local"}}' 2>/dev/null || true
|
||||
|
||||
kubectl rollout restart deployment/video-backend
|
||||
kubectl rollout restart deployment/celery-worker
|
||||
kubectl rollout restart deployment/video-web
|
||||
|
||||
rm -f /tmp/backend-deployment.yaml /tmp/web-deployment.yaml /tmp/ingress.yaml /tmp/celery-deployment.yaml
|
||||
ENDSSH
|
||||
} 2>&1 | tee /tmp/deploy.log
|
||||
|
||||
# ===== Log Center: failure reporting =====
|
||||
- name: Report failure to Log Center
|
||||
@ -137,7 +165,7 @@ jobs:
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"project_id\": \"video_backend\",
|
||||
\"environment\": \"${{ github.ref_name }}\",
|
||||
\"environment\": \"${{ env.DEPLOY_ENV }}\",
|
||||
\"level\": \"ERROR\",
|
||||
\"source\": \"${SOURCE}\",
|
||||
\"commit_hash\": \"${{ github.sha }}\",
|
||||
|
||||
@ -1,27 +1,25 @@
|
||||
"""Celery tasks for async video generation polling."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 固定轮询间隔:全程每 5 秒(RPM 12000 足够,400 并发仅用 40%)
|
||||
# 轮询间隔(秒):每次查完后重新入队,不占 worker 进程
|
||||
POLL_INTERVAL = 5
|
||||
|
||||
|
||||
@shared_task(bind=True, max_retries=0, ignore_result=True)
|
||||
@shared_task(bind=True, max_retries=None, ignore_result=True)
|
||||
def poll_video_task(self, record_id):
|
||||
"""Poll Volcano API for a video generation task until it reaches a terminal state.
|
||||
"""Poll Volcano API for a video generation task.
|
||||
|
||||
This is the server-side counterpart to the frontend polling.
|
||||
It runs independently of the browser — even if the user closes the page,
|
||||
this task keeps polling until Volcano returns completed or failed.
|
||||
每次只执行一轮查询,查完通过 self.retry 重新入队。
|
||||
这样 worker 不会被 sleep 占死,重启也不丢任务。
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from apps.generation.models import GenerationRecord
|
||||
from utils.airdrama_client import query_task, map_status, extract_video_url, ERROR_MESSAGES
|
||||
from utils.airdrama_client import query_task, map_status
|
||||
|
||||
try:
|
||||
record = GenerationRecord.objects.get(pk=record_id)
|
||||
@ -38,62 +36,42 @@ def poll_video_task(self, record_id):
|
||||
logger.info('poll_video_task: record %s already in terminal state: %s', record_id, record.status)
|
||||
return
|
||||
|
||||
elapsed = 0
|
||||
logger.info('poll_video_task: start polling record=%s ark=%s', record_id, ark_task_id)
|
||||
# Poll Volcano API
|
||||
try:
|
||||
ark_resp = query_task(ark_task_id)
|
||||
new_status = map_status(ark_resp.get('status', ''))
|
||||
except Exception:
|
||||
logger.exception('poll_video_task: API query failed for %s, will retry', ark_task_id)
|
||||
raise self.retry(countdown=POLL_INTERVAL)
|
||||
|
||||
while True:
|
||||
time.sleep(POLL_INTERVAL)
|
||||
elapsed += POLL_INTERVAL
|
||||
|
||||
# Re-fetch record to check if frontend already updated it
|
||||
try:
|
||||
record.refresh_from_db()
|
||||
except GenerationRecord.DoesNotExist:
|
||||
logger.info('poll_video_task: record %s deleted during polling', record_id)
|
||||
return
|
||||
|
||||
if record.status not in ('queued', 'processing'):
|
||||
logger.info('poll_video_task: record %s resolved by frontend: %s', record_id, record.status)
|
||||
return
|
||||
|
||||
# Poll Volcano API
|
||||
try:
|
||||
ark_resp = query_task(ark_task_id)
|
||||
new_status = map_status(ark_resp.get('status', ''))
|
||||
except Exception:
|
||||
logger.exception('poll_video_task: API query failed for %s, will retry', ark_task_id)
|
||||
continue # retry on next interval
|
||||
|
||||
if new_status in ('queued', 'processing'):
|
||||
# Still running, update status and touch updated_at
|
||||
record.status = new_status
|
||||
record.save(update_fields=['status', 'updated_at'])
|
||||
continue
|
||||
|
||||
# Terminal state reached — process result
|
||||
if new_status in ('queued', 'processing'):
|
||||
# Still running — update status, then re-enqueue
|
||||
record.status = new_status
|
||||
record.save(update_fields=['status', 'updated_at'])
|
||||
raise self.retry(countdown=POLL_INTERVAL)
|
||||
|
||||
# Save seed
|
||||
returned_seed = ark_resp.get('seed')
|
||||
if returned_seed is not None:
|
||||
record.seed = returned_seed
|
||||
# Terminal state reached — process result
|
||||
record.status = new_status
|
||||
|
||||
if new_status == 'completed':
|
||||
_handle_completed(record, ark_resp)
|
||||
elif new_status == 'failed':
|
||||
_handle_failed(record, ark_resp)
|
||||
returned_seed = ark_resp.get('seed')
|
||||
if returned_seed is not None:
|
||||
record.seed = returned_seed
|
||||
|
||||
record.completed_at = timezone.now()
|
||||
record.save(update_fields=[
|
||||
'status', 'result_url', 'error_message', 'raw_error',
|
||||
'seed', 'completed_at',
|
||||
])
|
||||
if new_status == 'completed':
|
||||
_handle_completed(record, ark_resp)
|
||||
elif new_status == 'failed':
|
||||
_handle_failed(record, ark_resp)
|
||||
|
||||
logger.info(
|
||||
'poll_video_task: record=%s ark=%s final_status=%s elapsed=%ds',
|
||||
record_id, ark_task_id, new_status, elapsed,
|
||||
)
|
||||
return
|
||||
record.completed_at = timezone.now()
|
||||
record.save(update_fields=[
|
||||
'status', 'result_url', 'error_message', 'raw_error',
|
||||
'seed', 'completed_at',
|
||||
])
|
||||
|
||||
logger.info(
|
||||
'poll_video_task: record=%s ark=%s final_status=%s',
|
||||
record_id, ark_task_id, new_status,
|
||||
)
|
||||
|
||||
|
||||
def _handle_completed(record, ark_resp):
|
||||
@ -122,16 +100,16 @@ def _handle_completed(record, ark_resp):
|
||||
|
||||
@shared_task(ignore_result=True)
|
||||
def recover_stuck_tasks():
|
||||
"""定时扫描卡在 processing/queued 超过 10 分钟的任务,重新派发轮询。"""
|
||||
"""定时扫描卡在 processing/queued 超过 3 分钟的任务,重新派发轮询。"""
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
from apps.generation.models import GenerationRecord
|
||||
|
||||
cutoff = timezone.now() - timedelta(minutes=10)
|
||||
cutoff = timezone.now() - timedelta(minutes=3)
|
||||
stuck_records = GenerationRecord.objects.filter(
|
||||
status__in=('queued', 'processing'),
|
||||
ark_task_id__isnull=False,
|
||||
updated_at__lt=cutoff, # updated_at 超过 10 分钟没更新,说明没有 worker 在轮询
|
||||
updated_at__lt=cutoff,
|
||||
).exclude(ark_task_id='')
|
||||
|
||||
count = 0
|
||||
|
||||
@ -182,7 +182,7 @@ CELERY_TIMEZONE = 'Asia/Shanghai'
|
||||
CELERY_BEAT_SCHEDULE = {
|
||||
'recover-stuck-tasks': {
|
||||
'task': 'apps.generation.tasks.recover_stuck_tasks',
|
||||
'schedule': 600, # 每 10 分钟
|
||||
'schedule': 180, # 每 3 分钟
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
134
docs/celery-polling-fix-20260404.md
Normal file
134
docs/celery-polling-fix-20260404.md
Normal file
@ -0,0 +1,134 @@
|
||||
# Celery 轮询机制修复报告
|
||||
|
||||
> 日期:2026-04-04
|
||||
> 版本:v0.16.0
|
||||
> 影响范围:backend/apps/generation/tasks.py, backend/config/settings.py
|
||||
|
||||
---
|
||||
|
||||
## 一、问题现象
|
||||
|
||||
2026/4/1 下午,大量用户反馈视频生成任务长时间卡在"生成中",前端显示耗时 60~65 分钟。
|
||||
火山引擎侧确认视频实际生成仅需约 10 分钟,结果已就绪但未被平台及时同步。
|
||||
|
||||
**截图数据**(4/1 下午完成的任务):
|
||||
|
||||
| 提交时间 | 显示耗时 |
|
||||
|---------|---------|
|
||||
| 2026/4/1 16:57:28 | 63 分 33 秒 |
|
||||
| 2026/4/1 16:58:41 | 62 分 37 秒 |
|
||||
| 2026/4/1 16:59:16 | 62 分 7 秒 |
|
||||
| 2026/4/1 17:00:36 | 64 分 24 秒 |
|
||||
| 2026/4/1 17:04:53 | 64 分 2 秒 |
|
||||
|
||||
## 二、根因分析
|
||||
|
||||
### 2.1 状态同步链路
|
||||
|
||||
```
|
||||
用户提交任务
|
||||
→ 后端调 create_task(火山 API)
|
||||
→ 获得 ark_task_id
|
||||
→ 派发 Celery 任务 poll_video_task
|
||||
→ Celery worker 每 5 秒查一次火山 API
|
||||
→ 火山返回完成 → 写 DB + 上传 TOS + 结算
|
||||
→ 前端轮询 DB → 展示结果
|
||||
```
|
||||
|
||||
前端只读 DB 状态,**不直接调火山 API**。整个链路完全依赖 Celery worker 轮询。
|
||||
|
||||
### 2.2 旧实现缺陷
|
||||
|
||||
`poll_video_task` 使用 `while True` + `time.sleep(5)` 长驻循环:
|
||||
|
||||
```python
|
||||
# 旧代码
|
||||
while True:
|
||||
time.sleep(POLL_INTERVAL) # 5 秒
|
||||
ark_resp = query_task(...) # 查一次
|
||||
if terminal:
|
||||
break
|
||||
```
|
||||
|
||||
**三个致命问题:**
|
||||
|
||||
| 问题 | 影响 |
|
||||
|------|------|
|
||||
| 每个任务占死一个 worker 进程 | `concurrency=4` 最多同时轮询 4 个任务,第 5 个排队 |
|
||||
| worker 重启后循环直接丢失 | 内存中的 `while True` 不可持久化,OOM/重启 = 任务丢失 |
|
||||
| `time.sleep` 浪费进程资源 | worker 99% 时间在 sleep,实际有用工作不到 1% |
|
||||
|
||||
### 2.3 OOM 重启链
|
||||
|
||||
```
|
||||
4 个任务同时轮询
|
||||
→ 某些任务完成,触发 TOS 上传(下载视频 + 上传对象存储)
|
||||
→ 内存飙升超过 512Mi 限制
|
||||
→ K8s OOM Kill → worker 重启(共重启 15 次)
|
||||
→ 4 个进程中的 while True 循环全部丢失
|
||||
→ 等 recover_stuck_tasks(每 10 分钟)重新派发
|
||||
→ 重新派发后 worker 又被占满 → 又 OOM → 循环
|
||||
→ 实际恢复耗时 ≈ 50~60 分钟
|
||||
```
|
||||
|
||||
## 三、修复方案
|
||||
|
||||
### 3.1 核心改动:self.retry 替代 while True
|
||||
|
||||
```python
|
||||
# 新代码
|
||||
@shared_task(bind=True, max_retries=None, ignore_result=True)
|
||||
def poll_video_task(self, record_id):
|
||||
record = GenerationRecord.objects.get(pk=record_id)
|
||||
|
||||
ark_resp = query_task(record.ark_task_id)
|
||||
new_status = map_status(ark_resp.get('status', ''))
|
||||
|
||||
if new_status in ('queued', 'processing'):
|
||||
record.save(update_fields=['status', 'updated_at'])
|
||||
raise self.retry(countdown=5) # 5 秒后重新入队
|
||||
|
||||
# 到达终态 → 处理结果
|
||||
...
|
||||
```
|
||||
|
||||
**原理对比:**
|
||||
|
||||
| | 旧方式(while True) | 新方式(self.retry) |
|
||||
|---|---|---|
|
||||
| 任务生命周期 | 在 worker 进程内存中 | 在 Redis 队列中 |
|
||||
| worker 占用 | 持续占用直到完成(分钟级) | 每次查询仅占用毫秒级 |
|
||||
| worker 重启 | 任务丢失 | Redis 中的任务自动恢复 |
|
||||
| 并发能力 | 最多 4 个(= concurrency) | 数百个(受 API RPM 限制) |
|
||||
|
||||
### 3.2 recover_stuck_tasks 间隔缩短
|
||||
|
||||
| | 旧值 | 新值 |
|
||||
|---|---|---|
|
||||
| Beat 调度间隔 | 600 秒(10 分钟) | 180 秒(3 分钟) |
|
||||
| stuck 判定门槛 | 10 分钟 | 3 分钟 |
|
||||
| 最坏恢复时间 | ~20 分钟 | ~6 分钟 |
|
||||
|
||||
### 3.3 变更文件
|
||||
|
||||
| 文件 | 改动 |
|
||||
|------|------|
|
||||
| `backend/apps/generation/tasks.py` | `poll_video_task`: while True → self.retry;`recover_stuck_tasks`: 门槛 10 → 3 分钟 |
|
||||
| `backend/config/settings.py` | Beat schedule: 600 → 180 秒 |
|
||||
|
||||
## 四、效果预估
|
||||
|
||||
| 指标 | 修复前 | 修复后 |
|
||||
|------|--------|--------|
|
||||
| 同时轮询任务数上限 | 4 | 数百 |
|
||||
| worker 重启后任务恢复 | 丢失,等 10 分钟兜底 | 自动恢复,无需兜底 |
|
||||
| 最坏同步延迟 | 60+ 分钟 | ~15 秒(= 查询间隔 + 网络延迟) |
|
||||
| 内存占用 | 持续占满(sleep 期间不释放) | 脉冲式占用(查完释放) |
|
||||
| OOM 风险 | 高(4 进程常驻 + TOS 上传峰值) | 低(进程闲置时内存极小) |
|
||||
|
||||
## 五、部署注意
|
||||
|
||||
1. **无需数据库迁移** — 仅修改 Python 代码
|
||||
2. **部署后旧的 while True 任务会自然消亡** — 不需要手动干预
|
||||
3. **Redis 中可能有旧格式的任务** — 兼容无问题,新旧 `poll_video_task` 签名一致(`record_id` 参数不变)
|
||||
4. **建议同步部署**:先部署代码,再重启 Celery worker(`kubectl rollout restart deployment celery-worker`)
|
||||
@ -14,8 +14,6 @@ spec:
|
||||
labels:
|
||||
app: video-backend
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: swr-secret
|
||||
containers:
|
||||
- name: video-backend
|
||||
image: ${CI_REGISTRY_IMAGE}/video-backend:latest
|
||||
@ -34,29 +32,20 @@ spec:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DJANGO_SECRET_KEY
|
||||
# Database (Aliyun RDS)
|
||||
# Database (Volcano Engine RDS - 默认测试环境,生产环境通过 CI 替换)
|
||||
- name: DB_HOST
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_HOST
|
||||
value: "mysql-8351f937d637.rds.ivolces.com"
|
||||
- name: DB_NAME
|
||||
value: "video_auto"
|
||||
- name: DB_USER
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_USER
|
||||
value: "zyc"
|
||||
- name: DB_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_PASSWORD
|
||||
value: "Zyc188208"
|
||||
- name: DB_PORT
|
||||
value: "3306"
|
||||
# Redis (Celery broker)
|
||||
- name: REDIS_URL
|
||||
value: "redis://:vAhRnAA6VMco@redis-cngzyc2r77ka16g7a.redis.ivolces.com:6379/0"
|
||||
value: "redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0"
|
||||
# CORS
|
||||
- name: CORS_ALLOWED_ORIGINS
|
||||
value: "https://airflow-studio.airlabs.art"
|
||||
|
||||
@ -14,8 +14,6 @@ spec:
|
||||
labels:
|
||||
app: celery-worker
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: swr-secret
|
||||
containers:
|
||||
- name: celery-worker
|
||||
image: ${CI_REGISTRY_IMAGE}/video-backend:latest
|
||||
@ -35,25 +33,16 @@ spec:
|
||||
key: DJANGO_SECRET_KEY
|
||||
# Redis
|
||||
- name: REDIS_URL
|
||||
value: "redis://:vAhRnAA6VMco@redis-cngzyc2r77ka16g7a.redis.ivolces.com:6379/0"
|
||||
# Database (Aliyun RDS)
|
||||
value: "redis://zyc:Zyc188208@redis-shzlsczo52dft8mia.redis.ivolces.com:6379/0"
|
||||
# Database (Volcano Engine RDS - 默认测试环境,生产环境通过 CI 替换)
|
||||
- name: DB_HOST
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_HOST
|
||||
value: "mysql-8351f937d637.rds.ivolces.com"
|
||||
- name: DB_NAME
|
||||
value: "video_auto"
|
||||
- name: DB_USER
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_USER
|
||||
value: "zyc"
|
||||
- name: DB_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: video-backend-secrets
|
||||
key: DB_PASSWORD
|
||||
value: "Zyc188208"
|
||||
- name: DB_PORT
|
||||
value: "3306"
|
||||
# TOS (from Secret)
|
||||
|
||||
@ -12,4 +12,4 @@ spec:
|
||||
solvers:
|
||||
- http01:
|
||||
ingress:
|
||||
class: alb
|
||||
class: traefik
|
||||
|
||||
@ -14,8 +14,6 @@ spec:
|
||||
labels:
|
||||
app: video-web
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: swr-secret
|
||||
containers:
|
||||
- name: video-web
|
||||
image: ${CI_REGISTRY_IMAGE}/video-web:latest
|
||||
|
||||
7888
video_auto copy.sql
Normal file
7888
video_auto copy.sql
Normal file
File diff suppressed because one or more lines are too long
7888
video_auto.sql
Normal file
7888
video_auto.sql
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user