log-center/app/main.py
zyc 0d4b2d634c
All checks were successful
Build and Deploy Log Center / build-and-deploy (push) Successful in 2m24s
feat: 扩展日志收集,支持 CI/CD 构建错误和 K8s 部署错误
新增两种日志来源(cicd / deployment),使日志中台覆盖"构建→部署→运行"全链路:

后端变更:
- models.py: 新增 LogSource 枚举和 source 字段,file_path/line_number 改为可选
- main.py: 按来源生成不同指纹策略,所有查询端点支持 source 筛选,仪表盘新增来源分布统计
- database.py: 新增 4 条迁移 SQL(source 字段、索引、字段可空)
- task_manager.py: 修复 Agent 仅拉取 runtime 来源的缺陷

新增组件:
- k8s-monitor/: K8s Pod 健康监控脚本(Python),每 5 分钟检测异常 Pod 并上报
- k8s/monitor-cronjob.yaml: CronJob + RBAC 部署清单
- scripts/report-cicd-error.sh: CI/CD 错误上报 Bash 脚本
- scripts/gitea-actions-example.yaml: Gitea Actions 集成示例

前端变更:
- api.ts: 类型定义更新,支持 source 字段
- BugList.tsx: 新增来源筛选标签页和来源列
- BugDetail.tsx: 按来源条件渲染(CI/CD 信息、部署信息),非 runtime 禁用修复按钮
- Dashboard.tsx: 新增来源分布表格
- index.css: 来源标签样式(source-badge)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 10:20:16 +08:00

313 lines
11 KiB
Python

from fastapi import FastAPI, Depends, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from sqlmodel.ext.asyncio.session import AsyncSession
from sqlmodel import select, func
from .database import init_db, get_session
from .models import ErrorLog, ErrorLogCreate, LogStatus, TaskStatusUpdate, RepairTask, RepairTaskCreate
from datetime import datetime, timedelta
from typing import Optional, List
import hashlib
import json
# Application object that every route decorator in this module attaches to.
app = FastAPI(title="Log Center & AIOps Control Plane")
# CORS for frontend
# NOTE(review): all origins, methods and headers are allowed while credentials
# are enabled; confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], # In production, restrict to your domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.on_event("startup")
async def on_startup():
    """Await ``init_db()`` when the application fires its ``startup`` event."""
    await init_db()
def generate_fingerprint(log: ErrorLogCreate) -> str:
    """Return the MD5 hex digest used to deduplicate an incoming report.

    The digest is taken over a ``|``-joined key whose fields depend on
    ``log.source``:

    * ``"cicd"``: project id, the literal ``cicd``, error type, and the
      ``job_name`` / ``step_name`` context entries (each falling back to
      ``unknown``).
    * ``"deployment"``: project id, the literal ``deployment``, error type,
      and the ``namespace`` (fallback ``default``) / ``deployment_name``
      (fallback ``unknown``) context entries.
    * anything else: project id, error type, file path and line number taken
      from ``log.error``.

    A missing or empty ``log.context`` is treated as an empty mapping.
    """
    origin = log.source
    details = log.error

    if origin == "cicd":
        extra = log.context or {}
        fields = (
            log.project_id,
            "cicd",
            details.get("type"),
            extra.get("job_name", "unknown"),
            extra.get("step_name", "unknown"),
        )
    elif origin == "deployment":
        extra = log.context or {}
        fields = (
            log.project_id,
            "deployment",
            details.get("type"),
            extra.get("namespace", "default"),
            extra.get("deployment_name", "unknown"),
        )
    else:
        fields = (
            log.project_id,
            details.get("type"),
            details.get("file_path"),
            details.get("line_number"),
        )

    # f"{field}" renders each piece exactly as an f-string placeholder would,
    # so the key text (and therefore the digest) is unchanged.
    key = "|".join(f"{field}" for field in fields)
    return hashlib.md5(key.encode()).hexdigest()
# ==================== Log Reporting ====================
@app.post("/api/v1/logs/report", tags=["Logs"])
async def report_log(log_data: ErrorLogCreate, session: AsyncSession = Depends(get_session)):
    """Store a reported error, deduplicating on its fingerprint.

    Three outcomes are possible:

    * no row shares the fingerprint: a new ``ErrorLog`` is inserted;
    * a row exists whose status is not DEPLOYED / FIXED / VERIFIED: nothing is
      written and the existing id and status are returned;
    * a row exists in one of those three statuses: it is treated as a
      regression, reset to NEW with a fresh timestamp and ``retry_count`` 0.
    """
    fingerprint = generate_fingerprint(log_data)

    lookup = await session.exec(
        select(ErrorLog).where(ErrorLog.fingerprint == fingerprint)
    )
    known = lookup.first()

    if not known:
        # First sighting of this fingerprint: persist a brand-new record.
        error = log_data.error
        record = ErrorLog(
            project_id=log_data.project_id,
            environment=log_data.environment,
            level=log_data.level,
            source=log_data.source,
            error_type=error.get("type"),
            error_message=error.get("message"),
            file_path=error.get("file_path"),
            line_number=error.get("line_number"),
            stack_trace=error.get("stack_trace"),
            context=log_data.context,
            version=log_data.version,
            commit_hash=log_data.commit_hash,
            fingerprint=fingerprint,
            timestamp=log_data.timestamp or datetime.utcnow(),
        )
        session.add(record)
        await session.commit()
        await session.refresh(record)
        return {"message": "Log reported", "id": record.id}

    was_resolved = known.status in (LogStatus.DEPLOYED, LogStatus.FIXED, LogStatus.VERIFIED)
    if not was_resolved:
        # Still being worked on: report the duplicate without touching the row.
        return {"message": "Log deduplicated", "id": known.id, "status": known.status}

    # Previously resolved but seen again: reopen it as a regression.
    known.status = LogStatus.NEW
    known.timestamp = log_data.timestamp or datetime.utcnow()
    known.retry_count = 0  # new occurrence starts with a clean retry budget
    session.add(known)
    await session.commit()
    await session.refresh(known)
    return {"message": "Regression detected, reopened", "id": known.id}
# ==================== Agent Tasks ====================
@app.get("/api/v1/tasks/pending", tags=["Tasks"])
async def get_pending_tasks(
    project_id: Optional[str] = None,
    source: Optional[str] = None,
    session: AsyncSession = Depends(get_session),
):
    """Return every ``ErrorLog`` whose status is ``LogStatus.NEW``.

    Args:
        project_id: When given (and non-empty), only logs of this project
            are returned.
        source: When given (and non-empty), only logs whose ``source``
            column equals this value are returned.
        session: Database session injected by FastAPI.

    Returns:
        The list of matching ``ErrorLog`` rows (not paginated).
    """
    # ``project_id`` is annotated Optional[str] to match its None default and
    # the sibling endpoints; the query-parameter behavior is unchanged.
    query = select(ErrorLog).where(ErrorLog.status == LogStatus.NEW)
    if project_id:
        query = query.where(ErrorLog.project_id == project_id)
    if source:
        query = query.where(ErrorLog.source == source)

    results = await session.exec(query)
    return results.all()
@app.put("/api/v1/tasks/{task_id}/status", tags=["Tasks"])
async def update_task_status(
    task_id: int,
    status_update: TaskStatusUpdate,
    session: AsyncSession = Depends(get_session)
):
    """Overwrite the status of the ``ErrorLog`` identified by ``task_id``.

    Raises:
        HTTPException: 404 when no ``ErrorLog`` has that id.
    """
    lookup = await session.exec(select(ErrorLog).where(ErrorLog.id == task_id))
    record = lookup.first()
    if not record:
        raise HTTPException(status_code=404, detail="Task not found")

    # Only the status is persisted here; nothing else from the update payload
    # is written to a history table.
    record.status = status_update.status
    session.add(record)
    await session.commit()
    await session.refresh(record)

    return {"message": "Status updated", "id": record.id, "status": record.status}
# ==================== Repair Reports ====================
@app.post("/api/v1/repair/reports", tags=["Repair"])
async def create_repair_report(report: RepairTaskCreate, session: AsyncSession = Depends(get_session)):
    """Persist a repair report and mirror its outcome onto the related log.

    A ``RepairTask`` row is always created. When the reported status is FIXED
    or FIX_FAILED and the referenced ``ErrorLog`` exists, that log's status is
    set to the same value. Both changes are committed together.
    """
    repair_task = RepairTask.from_orm(report)
    session.add(repair_task)

    outcome_is_final = report.status in (LogStatus.FIXED, LogStatus.FIX_FAILED)
    if outcome_is_final:
        lookup = await session.exec(
            select(ErrorLog).where(ErrorLog.id == report.error_log_id)
        )
        linked_log = lookup.first()
        if linked_log:
            # Keep the error log in step with the repair result.
            linked_log.status = report.status
            session.add(linked_log)

    await session.commit()
    await session.refresh(repair_task)
    return {"message": "Report uploaded", "id": repair_task.id}
@app.get("/api/v1/repair/reports", tags=["Repair"])
async def get_repair_reports(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    project_id: Optional[str] = None,
    error_log_id: Optional[int] = None,
    session: AsyncSession = Depends(get_session)
):
    """Return one page of repair reports, newest ``created_at`` first.

    ``project_id`` and ``error_log_id`` narrow the result when they are
    truthy. The response carries the page items plus ``total``, ``page``,
    ``page_size`` and ``total_pages``.
    """
    # Collect the optional filters once so the page query and the count query
    # are always built from the same conditions.
    conditions = []
    if project_id:
        conditions.append(RepairTask.project_id == project_id)
    if error_log_id:
        conditions.append(RepairTask.error_log_id == error_log_id)

    page_query = select(RepairTask).order_by(RepairTask.created_at.desc())
    count_query = select(func.count(RepairTask.id))
    for condition in conditions:
        page_query = page_query.where(condition)
        count_query = count_query.where(condition)

    skip = (page - 1) * page_size
    page_rows = await session.exec(page_query.offset(skip).limit(page_size))
    tasks = page_rows.all()

    total = (await session.exec(count_query)).one()

    return {
        "items": tasks,
        "total": total,
        "page": page,
        "page_size": page_size,
        "total_pages": (total + page_size - 1) // page_size
    }
@app.get("/api/v1/repair/reports/{report_id}", tags=["Repair"])
async def get_repair_report_detail(report_id: int, session: AsyncSession = Depends(get_session)):
    """Return the ``RepairTask`` with the given id.

    Raises:
        HTTPException: 404 when no such report exists.
    """
    lookup = await session.exec(select(RepairTask).where(RepairTask.id == report_id))
    report = lookup.first()
    if report:
        return report
    raise HTTPException(status_code=404, detail="Report not found")
# ==================== Dashboard APIs ====================
@app.get("/api/v1/dashboard/stats", tags=["Dashboard"])
async def get_dashboard_stats(source: Optional[str] = None, session: AsyncSession = Depends(get_session)):
    """Return the aggregate counters shown on the dashboard.

    ``total_bugs``, ``today_bugs``, ``fix_rate`` and ``status_distribution``
    honour the optional ``source`` filter. ``source_distribution`` is always
    counted per ``LogSource`` member without that filter. "Today" starts at
    midnight of ``datetime.utcnow()``.
    """
    from .models import LogSource

    midnight = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)

    async def _count(*conditions, respect_source=True):
        # Count ErrorLog rows matching the conditions, optionally narrowed to
        # the requested source.
        stmt = select(func.count(ErrorLog.id))
        for condition in conditions:
            stmt = stmt.where(condition)
        if respect_source and source:
            stmt = stmt.where(ErrorLog.source == source)
        outcome = await session.exec(stmt)
        return outcome.one()

    total_bugs = await _count()
    today_bugs = await _count(ErrorLog.timestamp >= midnight)

    status_counts = {}
    for state in LogStatus:
        status_counts[state.value] = await _count(ErrorLog.status == state)

    # Fixed rate = (FIXED + VERIFIED + DEPLOYED) / Total
    fixed_count = sum(status_counts.get(name, 0) for name in ("FIXED", "VERIFIED", "DEPLOYED"))
    if total_bugs > 0:
        fix_rate = round((fixed_count / total_bugs * 100), 2)
    else:
        fix_rate = 0

    source_counts = {}
    for src in LogSource:
        source_counts[src.value] = await _count(
            ErrorLog.source == src.value, respect_source=False
        )

    return {
        "total_bugs": total_bugs,
        "today_bugs": today_bugs,
        "fix_rate": fix_rate,
        "status_distribution": status_counts,
        "source_distribution": source_counts,
    }
@app.get("/api/v1/bugs", tags=["Dashboard"])
async def get_bugs_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    status: Optional[LogStatus] = None,
    project_id: Optional[str] = None,
    source: Optional[str] = None,
    session: AsyncSession = Depends(get_session)
):
    """Return one page of ``ErrorLog`` rows, newest ``timestamp`` first.

    ``status``, ``project_id`` and ``source`` each narrow the result when
    truthy. The response carries the page items plus ``total``, ``page``,
    ``page_size`` and ``total_pages``.
    """
    # Gather the active filters once; both queries below reuse them.
    candidate_filters = (
        (status, ErrorLog.status == status),
        (project_id, ErrorLog.project_id == project_id),
        (source, ErrorLog.source == source),
    )
    active = [clause for value, clause in candidate_filters if value]

    listing = select(ErrorLog).order_by(ErrorLog.timestamp.desc())
    counting = select(func.count(ErrorLog.id))
    for clause in active:
        listing = listing.where(clause)
        counting = counting.where(clause)

    # Pagination
    start = (page - 1) * page_size
    rows = await session.exec(listing.offset(start).limit(page_size))
    bugs = rows.all()

    # Total number of matches, used for the pagination metadata
    total = (await session.exec(counting)).one()

    return {
        "items": bugs,
        "total": total,
        "page": page,
        "page_size": page_size,
        "total_pages": (total + page_size - 1) // page_size
    }
@app.get("/api/v1/bugs/{bug_id}", tags=["Dashboard"])
async def get_bug_detail(bug_id: int, session: AsyncSession = Depends(get_session)):
    """Return the ``ErrorLog`` with the given id.

    Raises:
        HTTPException: 404 when no such bug exists.
    """
    lookup = await session.exec(select(ErrorLog).where(ErrorLog.id == bug_id))
    record = lookup.first()
    if record:
        return record
    raise HTTPException(status_code=404, detail="Bug not found")
@app.get("/api/v1/projects", tags=["Dashboard"])
async def get_projects(session: AsyncSession = Depends(get_session)):
    """Return the distinct ``project_id`` values present in ``ErrorLog``."""
    distinct_ids = await session.exec(select(ErrorLog.project_id).distinct())
    return {"projects": distinct_ids.all()}
@app.get("/", tags=["Health"])
async def health_check():
    """Return a constant payload showing the service is responding."""
    payload = {"status": "ok"}
    return payload