feat: Add new API endpoints and HTML pages for ML model management

- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models
- Added Gitea proxy API for repository interactions at /api/repos
- Created results API for listing and comparing training results at /api/results
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings
- Introduced SSE endpoint for live training progress updates
- Added HTML pages for models, datasets, and training management
- Created a Dockerfile for the ML runner with necessary dependencies
- Developed SDK for user code execution within the runner container
- Enhanced CSS styles for improved UI/UX
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
2026-04-28 09:24:38 +02:00
parent ee478e52ef
commit 0ce879aa44
81 changed files with 7491 additions and 746 deletions
--- a/ml/core/worker.py
+++ b/ml/core/worker.py
@@ -0,0 +1,54 @@
+"""Worker loop: BRPOP from ml:queue:train and dispatch to docker_runner.
+
+Starts N concurrent asyncio tasks (settings.train_concurrency).
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+
+from core import redis_client
+from core.config import settings
+from core.docker_runner import run_training_job
+
+log = logging.getLogger(__name__)
+
+# Worker tasks created by start_workers(); stop_workers() cancels them and
+# empties this list.
+_tasks: list[asyncio.Task] = []
+
+
+async def _worker_loop(idx: int):
+    """Consume training ids from the ``ml:queue:train`` Redis list forever.
+
+    Each iteration issues a BRPOP with ``timeout=10``. A ``None`` reply just
+    polls again; a failing BRPOP is logged and retried after a 2 second
+    pause. Every id popped is handed to ``run_training_job``, and an
+    exception raised by that job is logged without stopping the loop.
+
+    Args:
+        idx: Worker number, used only to tag log messages.
+    """
+    conn = redis_client.client()
+    log.info("ml worker[%d] started", idx)
+    while True:
+        try:
+            popped = await conn.brpop("ml:queue:train", timeout=10)
+        except Exception as exc:
+            # Pause briefly so a failing BRPOP does not become a busy loop.
+            log.warning("brpop error: %s", exc)
+            await asyncio.sleep(2)
+            continue
+
+        if popped is not None:
+            # The reply unpacks into two items; only the second one is used.
+            _queue, job_id = popped
+            log.info("worker[%d] picked training %s", idx, job_id)
+            try:
+                await run_training_job(job_id)
+            except Exception:
+                # A crashed job must not take the worker down with it.
+                log.exception("worker[%d] training %s crashed", idx, job_id)
+
+
+def start_workers() -> None:
+    """Spawn the worker tasks and remember them in ``_tasks``.
+
+    The number of tasks is ``settings.train_concurrency``, with a floor of
+    one. ``asyncio.create_task`` needs a running event loop, so this has to
+    be called from code that is already running inside one.
+    """
+    worker_count = max(1, settings.train_concurrency)
+    # ``_tasks`` is only mutated, never rebound, so no ``global`` is needed.
+    _tasks.extend(
+        asyncio.create_task(_worker_loop(slot)) for slot in range(worker_count)
+    )
+
+
+async def stop_workers() -> None:
+    """Cancel every worker task, wait for all of them, then forget them.
+
+    Cancellation is requested on all tasks first so they wind down
+    concurrently; the function then waits until each one has finished and
+    finally empties ``_tasks``.
+    """
+    for task in _tasks:
+        task.cancel()
+    # Since Python 3.8 ``asyncio.CancelledError`` derives from BaseException,
+    # so ``except Exception`` around ``await task`` would not absorb it: the
+    # first await would re-raise and the list would never be cleared.
+    # ``gather(..., return_exceptions=True)`` collects each task's outcome
+    # (cancellation included) instead of raising it here.
+    await asyncio.gather(*_tasks, return_exceptions=True)
+    _tasks.clear()