"""Management command to poll stuck tasks and update their status. This is a fallback for when Celery workers miss tasks or aren't running. Run via cron or K8s CronJob: python manage.py poll_stuck_tasks """ import logging from django.core.management.base import BaseCommand from django.utils import timezone from apps.generation.models import GenerationRecord from utils.airdrama_client import query_task, map_status, extract_video_url, ERROR_MESSAGES logger = logging.getLogger(__name__) class Command(BaseCommand): help = 'Poll Volcano API for stuck queued/processing tasks and update their status.' def handle(self, *args, **options): stuck = GenerationRecord.objects.filter(status__in=['queued', 'processing']) count = stuck.count() if count == 0: self.stdout.write('No stuck tasks found.') return self.stdout.write(f'Found {count} stuck task(s), polling...') resolved = 0 for record in stuck: ark_task_id = record.ark_task_id # No ark_task_id means API submission failed — mark as failed if not ark_task_id: record.status = 'failed' record.error_message = '任务提交失败(系统清理)' record.completed_at = timezone.now() record.save(update_fields=['status', 'error_message', 'completed_at']) from apps.generation.views import _release_freeze _release_freeze(record) resolved += 1 self.stdout.write(f' [{record.id}] no ark_task_id -> marked failed') continue # Poll Volcano API try: ark_resp = query_task(ark_task_id) new_status = map_status(ark_resp.get('status', '')) except Exception as e: self.stdout.write(f' [{record.id}] ark={ark_task_id} API error: {e}') continue if new_status in ('queued', 'processing'): self.stdout.write(f' [{record.id}] ark={ark_task_id} still {new_status}') continue # Terminal state — process record.status = new_status returned_seed = ark_resp.get('seed') if returned_seed is not None: record.seed = returned_seed if new_status == 'completed': video_url = extract_video_url(ark_resp) if video_url: try: from utils.tos_client import upload_from_url record.result_url = upload_from_url(video_url, folder='results') except Exception: logger.exception('Failed to persist video to TOS') record.result_url = video_url usage = ark_resp.get('usage', {}) total_tokens = usage.get('total_tokens', 0) if isinstance(usage, dict) else 0 if total_tokens > 0: from apps.generation.views import _settle_payment _settle_payment(record, total_tokens) else: from apps.generation.views import _release_freeze _release_freeze(record) elif new_status == 'failed': error = ark_resp.get('error', {}) code = error.get('code', '') if isinstance(error, dict) else '' raw_msg = error.get('message', '') if isinstance(error, dict) else str(error) record.error_message = ERROR_MESSAGES.get(code, raw_msg) record.raw_error = f'{code}: {raw_msg}' if code else raw_msg usage = ark_resp.get('usage', {}) total_tokens = usage.get('total_tokens', 0) if isinstance(usage, dict) else 0 if total_tokens > 0: from apps.generation.views import _settle_payment _settle_payment(record, total_tokens) else: from apps.generation.views import _release_freeze _release_freeze(record) record.completed_at = timezone.now() record.save(update_fields=[ 'status', 'result_url', 'error_message', 'raw_error', 'seed', 'completed_at', ]) resolved += 1 self.stdout.write(f' [{record.id}] ark={ark_task_id} -> {new_status}') self.stdout.write(f'Done. Resolved {resolved}/{count} tasks.')