feat: Add new API endpoints and HTML pages for ML model management
- Implemented HTML pages for datasets, models, training, testing, and results. - Created API endpoints for managing repositories, results, tests, and training sessions. - Added functionality for streaming training progress via Server-Sent Events (SSE). - Introduced a Dockerfile for the ML runner with necessary dependencies. - Developed an SDK for user code execution within the runner container. - Enhanced CSS styles for improved UI layout and navigation. - Established a layout template for consistent HTML structure across pages. - Added JavaScript for dynamic interactions on the models page. - Implemented WebSocket handling for real-time communication with kiosk devices and controllers. - Implemented model registration and management API at /api/models - Added Gitea proxy API for repository interactions at /api/repos - Created results API for listing and comparing training results at /api/results - Developed training management API for enqueueing and retrieving training jobs at /api/trainings - Introduced SSE endpoint for live training progress updates - Added HTML pages for models, datasets, and training management - Created a Dockerfile for the ML runner with necessary dependencies - Developed SDK for user code execution within the runner container - Enhanced CSS styles for improved UI/UX - Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
This commit is contained in:
54
ml/core/worker.py
Normal file
54
ml/core/worker.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Worker loop: BRPOP da ml:queue:train e dispatch al docker_runner.
|
||||
|
||||
Starts N concurrent asyncio tasks (settings.train_concurrency).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from core import redis_client
|
||||
from core.config import settings
|
||||
from core.docker_runner import run_training_job
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Handles of the worker tasks created by start_workers(); stop_workers()
# cancels them and then empties this list.
_tasks: list[asyncio.Task] = []
|
||||
|
||||
|
||||
async def _worker_loop(idx: int):
    """Consume training ids from the ``ml:queue:train`` list forever.

    Each iteration blocks on BRPOP (10 s timeout). A failing BRPOP is logged
    and retried after a 2 s pause; a ``None`` reply just polls again. Every
    popped id is passed to ``run_training_job``; a job that raises is logged
    with its traceback and the loop moves on to the next item.

    Args:
        idx: Index of this worker, used only in log messages.
    """
    conn = redis_client.client()
    log.info("ml worker[%d] started", idx)
    while True:
        try:
            popped = await conn.brpop("ml:queue:train", timeout=10)
        except Exception as err:
            # Log and back off briefly so a failing BRPOP does not spin the loop.
            log.warning("brpop error: %s", err)
            await asyncio.sleep(2)
            continue

        if popped is not None:
            # The reply is a 2-item sequence; only the second item is used.
            _queue_name, training_id = popped
            log.info("worker[%d] picked training %s", idx, training_id)
            try:
                await run_training_job(training_id)
            except Exception:
                # A crashing job must not take the worker down with it.
                log.exception("worker[%d] training %s crashed", idx, training_id)
|
||||
|
||||
|
||||
def start_workers() -> None:
    """Spawn the worker tasks and remember them in ``_tasks``.

    The number of workers is ``settings.train_concurrency``, with a floor of
    one. Tasks are created with ``asyncio.create_task``, so this must be
    called while an event loop is running.
    """
    worker_count = max(1, settings.train_concurrency)
    # The list is only mutated, never rebound, so no ``global`` is needed.
    _tasks.extend(
        asyncio.create_task(_worker_loop(worker_idx))
        for worker_idx in range(worker_count)
    )
|
||||
|
||||
|
||||
async def stop_workers() -> None:
    """Cancel every worker task, wait for them to finish, and forget them.

    Awaiting a cancelled task raises ``asyncio.CancelledError``, which is a
    ``BaseException`` subclass on Python 3.8+ and therefore is NOT caught by
    ``except Exception``. The tasks are awaited through ``asyncio.gather``
    with ``return_exceptions=True`` so that the cancellations are collected
    as results instead of aborting the shutdown after the first task.

    Errors other than cancellation raised by a worker are logged rather than
    silently discarded. Calling this with no running workers is a no-op.
    """
    if not _tasks:
        return

    for task in _tasks:
        task.cancel()

    # Collect every outcome; a cancelled worker shows up as a CancelledError
    # instance in the result list rather than being raised here.
    outcomes = await asyncio.gather(*_tasks, return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, BaseException) and not isinstance(
            outcome, asyncio.CancelledError
        ):
            log.warning("ml worker exited with error during shutdown: %r", outcome)

    _tasks.clear()
|
||||
Reference in New Issue
Block a user