refactor: 切换 Celery broker 为火山引擎 Redis + 僵尸任务自动恢复
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m16s
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m16s
- Redis 从阿里云切换到火山引擎(同区域低延迟) - delay() 失败改为 warning 日志 + 重试一次(不再静默吞异常) - 新增 recover_stuck_tasks 定时任务,每10分钟扫描卡住的任务重新派发 - 轮询时每次 touch updated_at,防止活跃任务被误判为僵尸 - Celery worker 启用内嵌 Beat 调度器(-B 参数) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3cdeb55367
commit
6853b08fc9
@ -79,10 +79,9 @@ def poll_video_task(self, record_id):
|
|||||||
continue # retry on next interval
|
continue # retry on next interval
|
||||||
|
|
||||||
if new_status in ('queued', 'processing'):
|
if new_status in ('queued', 'processing'):
|
||||||
# Still running, update status and continue
|
# Still running, update status and touch updated_at
|
||||||
if record.status != new_status:
|
record.status = new_status
|
||||||
record.status = new_status
|
record.save(update_fields=['status', 'updated_at'])
|
||||||
record.save(update_fields=['status'])
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Terminal state reached — process result
|
# Terminal state reached — process result
|
||||||
@ -135,6 +134,33 @@ def _handle_completed(record, ark_resp):
|
|||||||
_release_freeze(record)
|
_release_freeze(record)
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task(ignore_result=True)
|
||||||
|
def recover_stuck_tasks():
|
||||||
|
"""定时扫描卡在 processing/queued 超过 10 分钟的任务,重新派发轮询。"""
|
||||||
|
from datetime import timedelta
|
||||||
|
from django.utils import timezone
|
||||||
|
from apps.generation.models import GenerationRecord
|
||||||
|
|
||||||
|
cutoff = timezone.now() - timedelta(minutes=10)
|
||||||
|
stuck_records = GenerationRecord.objects.filter(
|
||||||
|
status__in=('queued', 'processing'),
|
||||||
|
ark_task_id__isnull=False,
|
||||||
|
updated_at__lt=cutoff, # updated_at 超过 10 分钟没更新,说明没有 worker 在轮询
|
||||||
|
).exclude(ark_task_id='')
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for record in stuck_records:
|
||||||
|
logger.warning('recover_stuck_tasks: re-dispatching record=%s ark=%s', record.id, record.ark_task_id)
|
||||||
|
try:
|
||||||
|
poll_video_task.delay(record.id)
|
||||||
|
count += 1
|
||||||
|
except Exception:
|
||||||
|
logger.error('recover_stuck_tasks: failed to dispatch record=%s', record.id)
|
||||||
|
|
||||||
|
if count:
|
||||||
|
logger.info('recover_stuck_tasks: re-dispatched %d stuck tasks', count)
|
||||||
|
|
||||||
|
|
||||||
def _handle_failed(record, ark_resp):
|
def _handle_failed(record, ark_resp):
|
||||||
"""Process a failed task: record error and release frozen amount."""
|
"""Process a failed task: record error and release frozen amount."""
|
||||||
from utils.airdrama_client import ERROR_MESSAGES
|
from utils.airdrama_client import ERROR_MESSAGES
|
||||||
|
|||||||
@ -372,12 +372,19 @@ def video_generate_view(request):
|
|||||||
record.ark_task_id = ark_task_id
|
record.ark_task_id = ark_task_id
|
||||||
record.status = 'processing'
|
record.status = 'processing'
|
||||||
record.save(update_fields=['ark_task_id', 'status'])
|
record.save(update_fields=['ark_task_id', 'status'])
|
||||||
# 触发后端异步轮询(连不上 Redis 时静默降级,前端轮询兜底)
|
# 触发后端异步轮询
|
||||||
try:
|
try:
|
||||||
from apps.generation.tasks import poll_video_task
|
from apps.generation.tasks import poll_video_task
|
||||||
poll_video_task.delay(record.id)
|
poll_video_task.delay(record.id)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.debug('Celery not available, falling back to frontend polling')
|
logger.warning('Celery dispatch failed for record %s, retrying once...', record.id)
|
||||||
|
import time
|
||||||
|
time.sleep(1)
|
||||||
|
try:
|
||||||
|
from apps.generation.tasks import poll_video_task as _poll
|
||||||
|
_poll.delay(record.id)
|
||||||
|
except Exception:
|
||||||
|
logger.error('Celery dispatch failed twice for record %s, relying on recovery task', record.id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception('AirDrama API create task failed')
|
logger.exception('AirDrama API create task failed')
|
||||||
record.status = 'failed'
|
record.status = 'failed'
|
||||||
|
|||||||
@ -173,12 +173,18 @@ CSRF_TRUSTED_ORIGINS = [o for o in CORS_ALLOWED_ORIGINS if o.startswith('https:/
|
|||||||
# ──────────────────────────────────────────────
|
# ──────────────────────────────────────────────
|
||||||
# Celery (async task queue)
|
# Celery (async task queue)
|
||||||
# ──────────────────────────────────────────────
|
# ──────────────────────────────────────────────
|
||||||
CELERY_BROKER_URL = os.environ.get('REDIS_URL', 'redis://:vAhRnAA6VMco@r-7xvat0vez5clwbzk5vpd.redis.rds.aliyuncs.com:6379/8')
|
CELERY_BROKER_URL = os.environ.get('REDIS_URL', 'redis://:vAhRnAA6VMco@redis-cngzyc2r77ka16g7a.redis.ivolces.com:6379/0')
|
||||||
CELERY_RESULT_BACKEND = CELERY_BROKER_URL
|
CELERY_RESULT_BACKEND = CELERY_BROKER_URL
|
||||||
CELERY_ACCEPT_CONTENT = ['json']
|
CELERY_ACCEPT_CONTENT = ['json']
|
||||||
CELERY_TASK_SERIALIZER = 'json'
|
CELERY_TASK_SERIALIZER = 'json'
|
||||||
CELERY_RESULT_SERIALIZER = 'json'
|
CELERY_RESULT_SERIALIZER = 'json'
|
||||||
CELERY_TIMEZONE = 'Asia/Shanghai'
|
CELERY_TIMEZONE = 'Asia/Shanghai'
|
||||||
|
CELERY_BEAT_SCHEDULE = {
|
||||||
|
'recover-stuck-tasks': {
|
||||||
|
'task': 'apps.generation.tasks.recover_stuck_tasks',
|
||||||
|
'schedule': 600, # 每 10 分钟
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
LANGUAGE_CODE = 'zh-hans'
|
LANGUAGE_CODE = 'zh-hans'
|
||||||
TIME_ZONE = 'Asia/Shanghai'
|
TIME_ZONE = 'Asia/Shanghai'
|
||||||
|
|||||||
@ -56,7 +56,7 @@ spec:
|
|||||||
value: "3306"
|
value: "3306"
|
||||||
# Redis (Celery broker)
|
# Redis (Celery broker)
|
||||||
- name: REDIS_URL
|
- name: REDIS_URL
|
||||||
value: "redis://:vAhRnAA6VMco@r-7xvat0vez5clwbzk5vpd.redis.rds.aliyuncs.com:6379/8"
|
value: "redis://:vAhRnAA6VMco@redis-cngzyc2r77ka16g7a.redis.ivolces.com:6379/0"
|
||||||
# CORS
|
# CORS
|
||||||
- name: CORS_ALLOWED_ORIGINS
|
- name: CORS_ALLOWED_ORIGINS
|
||||||
value: "https://airflow-studio.airlabs.art"
|
value: "https://airflow-studio.airlabs.art"
|
||||||
|
|||||||
@ -20,7 +20,7 @@ spec:
|
|||||||
- name: celery-worker
|
- name: celery-worker
|
||||||
image: ${CI_REGISTRY_IMAGE}/video-backend:latest
|
image: ${CI_REGISTRY_IMAGE}/video-backend:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
command: ["celery", "-A", "config", "worker", "--loglevel=info", "--concurrency=4"]
|
command: ["celery", "-A", "config", "worker", "--loglevel=info", "--concurrency=4", "-B"]
|
||||||
env:
|
env:
|
||||||
- name: USE_MYSQL
|
- name: USE_MYSQL
|
||||||
value: "true"
|
value: "true"
|
||||||
@ -35,7 +35,7 @@ spec:
|
|||||||
key: DJANGO_SECRET_KEY
|
key: DJANGO_SECRET_KEY
|
||||||
# Redis
|
# Redis
|
||||||
- name: REDIS_URL
|
- name: REDIS_URL
|
||||||
value: "redis://:vAhRnAA6VMco@r-7xvat0vez5clwbzk5vpd.redis.rds.aliyuncs.com:6379/8"
|
value: "redis://:vAhRnAA6VMco@redis-cngzyc2r77ka16g7a.redis.ivolces.com:6379/0"
|
||||||
# Database (Aliyun RDS)
|
# Database (Aliyun RDS)
|
||||||
- name: DB_HOST
|
- name: DB_HOST
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user