Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
- Celery 异步任务:任务提交后后端持续轮询火山 API 直到拿到终态,用户关浏览器也不会丢视频 - 渐进式轮询:前2分钟每5秒、2-10分钟每15秒、10分钟后每30秒 - 优雅降级:无 Redis 时静默跳过,不影响现有前端轮询 - K8s:新增 Redis Deployment + Service、Celery Worker Deployment - CI/CD:deploy.yaml 自动部署 Redis/Celery,每次推代码自动重启 celery worker - 兜底:poll_stuck_tasks management command 清理僵尸任务 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
109 lines
4.4 KiB
Python
109 lines
4.4 KiB
Python
"""Management command to poll stuck tasks and update their status.
|
|
|
|
This is a fallback for when Celery workers miss tasks or aren't running.
|
|
Run via cron or K8s CronJob: python manage.py poll_stuck_tasks
|
|
"""
|
|
|
|
import logging
|
|
|
|
from django.core.management.base import BaseCommand
|
|
from django.utils import timezone
|
|
|
|
from apps.generation.models import GenerationRecord
|
|
from utils.airdrama_client import query_task, map_status, extract_video_url, ERROR_MESSAGES
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Command(BaseCommand):
    """Sweep generation records that are still ``queued`` or ``processing``.

    Every such record is looked up upstream with ``query_task``. Records whose
    mapped status is no longer pending are updated in the database, and either
    ``_settle_payment`` or ``_release_freeze`` (from ``apps.generation.views``)
    is invoked depending on whether the response reports token usage.
    """

    help = 'Poll Volcano API for stuck queued/processing tasks and update their status.'

    # Statuses that mean the upstream task has not reached a final state yet.
    PENDING_STATUSES = ('queued', 'processing')

    def handle(self, *args, **options):
        """Poll every pending record once and print a per-record summary."""
        # Materialise the queryset once so the reported total always matches
        # the rows that are actually iterated (a separate COUNT query could
        # disagree if rows change in between).
        records = list(
            GenerationRecord.objects.filter(status__in=self.PENDING_STATUSES)
        )
        count = len(records)

        if count == 0:
            self.stdout.write('No stuck tasks found.')
            return

        self.stdout.write(f'Found {count} stuck task(s), polling...')

        resolved = sum(1 for record in records if self._process_record(record))

        self.stdout.write(f'Done. Resolved {resolved}/{count} tasks.')

    def _process_record(self, record):
        """Poll a single record; return True if it was moved to a final state."""
        ark_task_id = record.ark_task_id

        # No ark_task_id means API submission failed — mark as failed.
        if not ark_task_id:
            self._fail_unsubmitted(record)
            self.stdout.write(f' [{record.id}] no ark_task_id -> marked failed')
            return True

        # Any failure while querying or mapping is reported and the record is
        # left untouched so that a later run can retry it.
        try:
            ark_resp = query_task(ark_task_id)
            new_status = map_status(ark_resp.get('status', ''))
        except Exception as e:
            self.stdout.write(f' [{record.id}] ark={ark_task_id} API error: {e}')
            return False

        if new_status in self.PENDING_STATUSES:
            self.stdout.write(f' [{record.id}] ark={ark_task_id} still {new_status}')
            return False

        self._finalize(record, ark_resp, new_status)
        self.stdout.write(f' [{record.id}] ark={ark_task_id} -> {new_status}')
        return True

    def _fail_unsubmitted(self, record):
        """Mark a record without an upstream task id as failed and release its freeze."""
        # Imported lazily, as in the original code, rather than at module import time.
        from apps.generation.views import _release_freeze

        record.status = 'failed'
        record.error_message = '任务提交失败(系统清理)'
        record.completed_at = timezone.now()
        record.save(update_fields=['status', 'error_message', 'completed_at'])
        _release_freeze(record)

    def _finalize(self, record, ark_resp, new_status):
        """Copy the final upstream state onto ``record`` and persist it."""
        record.status = new_status

        returned_seed = ark_resp.get('seed')
        if returned_seed is not None:
            record.seed = returned_seed

        if new_status == 'completed':
            self._store_result(record, ark_resp)
            self._settle_or_release(record, ark_resp)
        elif new_status == 'failed':
            self._store_error(record, ark_resp)
            self._settle_or_release(record, ark_resp)
        # NOTE(review): any other non-pending status is saved below without
        # settling or releasing anything — confirm which values map_status
        # can return and whether they need the same treatment.

        record.completed_at = timezone.now()
        record.save(update_fields=[
            'status', 'result_url', 'error_message', 'raw_error',
            'seed', 'completed_at',
        ])

    def _store_result(self, record, ark_resp):
        """Set ``record.result_url`` from the response, preferring our own copy."""
        video_url = extract_video_url(ark_resp)
        if not video_url:
            return

        try:
            from utils.tos_client import upload_from_url
            record.result_url = upload_from_url(video_url, folder='results')
        except Exception:
            # Best effort: if the upload fails, keep the upstream URL instead.
            logger.exception('Failed to persist video to TOS')
            record.result_url = video_url

    def _store_error(self, record, ark_resp):
        """Fill ``error_message`` and ``raw_error`` from the response's error."""
        # ``or {}`` covers an explicit null error, which would otherwise be
        # stringified and stored as the literal text "None".
        error = ark_resp.get('error') or {}
        if isinstance(error, dict):
            code = error.get('code') or ''
            raw_msg = error.get('message') or ''
        else:
            code = ''
            raw_msg = str(error)

        record.error_message = ERROR_MESSAGES.get(code, raw_msg)
        record.raw_error = f'{code}: {raw_msg}' if code else raw_msg

    def _settle_or_release(self, record, ark_resp):
        """Call ``_settle_payment`` if tokens were consumed, else ``_release_freeze``."""
        # Imported lazily, as in the original code, rather than at module import time.
        from apps.generation.views import _release_freeze, _settle_payment

        usage = ark_resp.get('usage')
        # ``or 0`` covers an explicit null token count, which would otherwise
        # raise TypeError in the comparison below and abort the whole sweep.
        total_tokens = (usage.get('total_tokens') or 0) if isinstance(usage, dict) else 0

        if total_tokens > 0:
            _settle_payment(record, total_tokens)
        else:
            _release_freeze(record)
|