feat: Add new API endpoints and HTML pages for ML model management

- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models
- Added Gitea proxy API for repository interactions at /api/repos
- Created results API for listing and comparing training results at /api/results
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings
- Introduced SSE endpoint for live training progress updates
- Added HTML pages for models, datasets, and training management
- Created a Dockerfile for the ML runner with necessary dependencies
- Developed SDK for user code execution within the runner container
- Enhanced CSS styles for improved UI/UX
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
2026-04-28 09:24:38 +02:00
parent ee478e52ef
commit 0ce879aa44
81 changed files with 7491 additions and 746 deletions
--- a/ml/.env.example
+++ b/ml/.env.example
@@ -0,0 +1,45 @@
+PORT=3007
+
+# Auth condiviso
+JWT_SECRET=change-me
+INTERNAL_API_KEY=change-me
+AUTH_LOGIN_URL=https://auth.mebboat.it/login
+
+# Postgres (db ml)
+PG_HOST=meb-postgres
+PG_PORT=5432
+DB_USER=meb
+DB_PASSWORD=meb
+ML_DB=ml
+
+# Redis
+REDIS_HOST=meb-redis
+REDIS_PORT=6379
+
+# MinIO (bucket unico)
+MINIO_ENDPOINT=minio
+MINIO_PORT=9000
+MINIO_USE_SSL=false
+MINIO_ACCESS_KEY=
+MINIO_SECRET_KEY=
+MINIO_BUCKET=ml
+
+# InfluxDB
+INFLUX_URL=http://meb-influx:8086
+INFLUX_TOKEN=
+INFLUX_ORG=meb
+INFLUX_BUCKET=ml_metrics
+
+# Gitea (self-hosted esterno)
+GITEA_URL=https://git.mebboat.it
+GITEA_TOKEN=
+
+# API service
+API_URL=http://api:3003
+
+# Training runtime
+ML_TRAIN_CONCURRENCY=1
+ML_RUNNER_IMAGE=meb-ml-runner:latest
+ML_RUNNER_TMP=/var/ml/tmp
+ML_GITCACHE_DIR=/var/ml/gitcache
+ML_MAX_UPLOAD_MB=500
--- a/ml/Dockerfile
+++ b/ml/Dockerfile
@@ -3,6 +3,9 @@ FROM python:3.11-slim
 WORKDIR /app
 ENV PYTHONUNBUFFERED=1

+RUN apt-get update && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+
 COPY ./requirements.txt .

 RUN pip install --no-cache-dir -r requirements.txt
--- a/ml/core/api_client.py
+++ b/ml/core/api_client.py
@@ -0,0 +1,72 @@
+"""Client HTTP verso l'api-service (service-to-service via x-api-key).
+
+Espone accesso a:
+  /jobs              ciclo di vita job
+  /queue             stato coda
+  /pageconnections   registro sessioni di pagina (enforcement /test max 2)
+"""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import httpx
+
+from core.config import settings
+
+
+def _headers() -> dict:
+    return {"x-api-key": settings.internal_api_key, "Content-Type": "application/json"}
+
+
+async def _req(method: str, path: str, json: Optional[dict] = None, params: Optional[dict] = None) -> Any:
+    url = f"{settings.api_url}{path}"
+    async with httpx.AsyncClient(timeout=10.0) as c:
+        r = await c.request(method, url, json=json, params=params, headers=_headers())
+        r.raise_for_status()
+        if r.status_code == 204 or not r.content:
+            return None
+        return r.json()
+
+
+# ── jobs ────────────────────────────────────────────────────────────────────
+async def create_job(type_: str, created_by: str, payload: dict) -> dict:
+    return await _req("POST", "/jobs", json={"type": type_, "created_by": created_by, "payload": payload})
+
+
+async def update_job(job_id: str, **fields) -> dict:
+    return await _req("PATCH", f"/jobs/{job_id}", json=fields)
+
+
+async def get_job(job_id: str) -> dict:
+    return await _req("GET", f"/jobs/{job_id}")
+
+
+async def list_jobs(type_: Optional[str] = None, status: Optional[str] = None, limit: int = 50) -> list:
+    params = {"limit": str(limit)}
+    if type_:
+        params["type"] = type_
+    if status:
+        params["status"] = status
+    return await _req("GET", "/jobs", params=params) or []
+
+
+# ── queue ───────────────────────────────────────────────────────────────────
+async def queue_status(type_: str = "train") -> dict:
+    return await _req("GET", "/queue", params={"type": type_})
+
+
+# ── page connections ───────────────────────────────────────────────────────
+async def page_connect(page: str, user_id: str, session_id: str) -> dict:
+    return await _req("POST", "/pageconnections", json={"page": page, "user_id": user_id, "session_id": session_id})
+
+
+async def page_ping(session_id: str) -> dict:
+    return await _req("POST", f"/pageconnections/{session_id}/ping")
+
+
+async def page_disconnect(session_id: str) -> None:
+    await _req("DELETE", f"/pageconnections/{session_id}")
+
+
+async def page_count(page: str) -> dict:
+    return await _req("GET", f"/pageconnections/{page}")
--- a/ml/core/config.py
+++ b/ml/core/config.py
@@ -0,0 +1,64 @@
+"""Configurazione centralizzata del servizio ML, letta da env."""
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+
+
+def _b(name: str, default: bool = False) -> bool:
+    return os.environ.get(name, str(default)).lower() in ("1", "true", "yes", "on")
+
+
+@dataclass(frozen=True)
+class Settings:
+    """Immutable bundle of ML-service settings sourced from environment variables.
+
+    Every default below is an expression evaluated once, when the class body
+    runs; the ``int(...)`` conversions therefore raise at that point if the
+    corresponding variable holds a non-numeric value.
+    """
+
+    # Postgres (db "ml")
+    pg_host: str = os.environ.get("PG_HOST", "meb-postgres")
+    pg_port: int = int(os.environ.get("PG_PORT", "5432"))
+    pg_user: str = os.environ.get("DB_USER", "meb")
+    pg_password: str = os.environ.get("DB_PASSWORD", "meb")
+    pg_db: str = os.environ.get("ML_DB", "ml")
+
+    # Redis
+    redis_host: str = os.environ.get("REDIS_HOST", "meb-redis")
+    redis_port: int = int(os.environ.get("REDIS_PORT", "6379"))
+
+    # MinIO (single bucket)
+    minio_endpoint: str = os.environ.get("MINIO_ENDPOINT", "minio")
+    minio_port: int = int(os.environ.get("MINIO_PORT", "9000"))
+    minio_use_ssl: bool = _b("MINIO_USE_SSL", False)
+    minio_access_key: str = os.environ.get("MINIO_ACCESS_KEY", "")
+    minio_secret_key: str = os.environ.get("MINIO_SECRET_KEY", "")
+    minio_bucket: str = os.environ.get("MINIO_BUCKET", "ml")
+
+    # InfluxDB — accepts both INFLUX_* and INFLX_* to align with the variables
+    # already used by the other services (realtime, api) without duplicating
+    # the config. An unset or empty INFLUX_* value falls through to INFLX_*.
+    influx_url: str = os.environ.get("INFLUX_URL") or os.environ.get("INFLX_URL", "http://meb-influx:8086")
+    influx_token: str = os.environ.get("INFLUX_TOKEN") or os.environ.get("INFLX_TOKEN", "")
+    influx_org: str = os.environ.get("INFLUX_ORG") or os.environ.get("INFLX_ORG", "meb")
+    # Bucket dedicated to ML training/test metrics, separate from logs and
+    # weather data. Overridable via INFLUX_BUCKET or ML_INFLUX_BUCKET
+    # (ML_INFLUX_BUCKET wins when both are set).
+    influx_bucket: str = os.environ.get("ML_INFLUX_BUCKET") or os.environ.get("INFLUX_BUCKET", "ml_metrics")
+
+    # Gitea (installed externally)
+    gitea_url: str = os.environ.get("GITEA_URL", "")
+    gitea_token: str = os.environ.get("GITEA_TOKEN", "")
+
+    # API service (for jobs/queue/pageconnections)
+    api_url: str = os.environ.get("API_URL", "http://api:3003")
+    internal_api_key: str = os.environ.get("INTERNAL_API_KEY", "")
+
+    # Auth (shared)
+    jwt_secret: str = os.environ.get("JWT_SECRET", "")
+    auth_login_url: str = os.environ.get("AUTH_LOGIN_URL", "https://auth.mebboat.it/login")
+
+    # Training execution
+    train_concurrency: int = int(os.environ.get("ML_TRAIN_CONCURRENCY", "1"))
+    runner_image: str = os.environ.get("ML_RUNNER_IMAGE", "meb-ml-runner:latest")
+    runner_tmp_dir: str = os.environ.get("ML_RUNNER_TMP", "/var/ml/tmp")
+    gitcache_dir: str = os.environ.get("ML_GITCACHE_DIR", "/var/ml/gitcache")
+
+    # Runtime limits
+    max_upload_mb: int = int(os.environ.get("ML_MAX_UPLOAD_MB", "500"))
+
+
+settings = Settings()
--- a/ml/core/db.py
+++ b/ml/core/db.py
@@ -0,0 +1,53 @@
+"""Connessione asyncpg al database ml. Pool singleton."""
+from __future__ import annotations
+
+import asyncpg
+from typing import Optional
+
+from core.config import settings
+
+_pool: Optional[asyncpg.Pool] = None
+
+
+async def init_pool() -> asyncpg.Pool:
+    global _pool
+    if _pool is None:
+        _pool = await asyncpg.create_pool(
+            host=settings.pg_host,
+            port=settings.pg_port,
+            user=settings.pg_user,
+            password=settings.pg_password,
+            database=settings.pg_db,
+            min_size=1,
+            max_size=10,
+            command_timeout=30,
+        )
+    return _pool
+
+
+async def close_pool() -> None:
+    global _pool
+    if _pool is not None:
+        await _pool.close()
+        _pool = None
+
+
+def pool() -> asyncpg.Pool:
+    if _pool is None:
+        raise RuntimeError("DB pool not initialized — call init_pool() at startup")
+    return _pool
+
+
+async def fetch(sql: str, *args):
+    async with pool().acquire() as c:
+        return await c.fetch(sql, *args)
+
+
+async def fetchrow(sql: str, *args):
+    async with pool().acquire() as c:
+        return await c.fetchrow(sql, *args)
+
+
+async def execute(sql: str, *args):
+    async with pool().acquire() as c:
+        return await c.execute(sql, *args)
--- a/ml/core/docker_runner.py
+++ b/ml/core/docker_runner.py
@@ -0,0 +1,439 @@
+"""Runner Docker per train e test.
+
+train:
+  - clone repo Gitea @ sha
+  - prepara workdir /var/ml/tmp/{training_id}
+  - scarica dataset da MinIO in workdir/data.<ext>
+  - docker run meb-ml-runner con mount tmp, env, limits da model.yml
+  - legge stdout JSON → Redis stream + Influx; docker stats ogni 5s
+  - a fine: collect outputs, upload su MinIO prefix artifacts_prefix
+  - UPDATE trainings
+
+test:
+  - analogo ma sincrono, stdin JSON → stdout JSON
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import shutil
+import subprocess
+import time
+import uuid
+from pathlib import Path
+from typing import Any, Optional
+
+import docker
+from influxdb_client import Point
+
+from core import db, gitea, influx_client, minio_client, redis_client
+from core.config import settings
+from core.model_spec import fetch_and_parse_spec
+
+log = logging.getLogger(__name__)
+
+_docker = None
+
+
+def _docker_client():
+    global _docker
+    if _docker is None:
+        _docker = docker.from_env()
+    return _docker
+
+
+async def _emit(stream_key: str, payload: dict) -> None:
+    try:
+        await redis_client.client().xadd(stream_key, {"payload": json.dumps(payload)}, maxlen=10_000)
+    except Exception as e:
+        log.warning("xadd failed: %s", e)
+
+
+async def _clone_repo(owner_repo: str, sha: str, dest: Path) -> None:
+    dest.mkdir(parents=True, exist_ok=True)
+    url = gitea.clone_url(owner_repo)
+    # clone shallow del branch/sha specifico
+    # per evitare leak del token nei log, logghiamo solo host
+    proc = await asyncio.create_subprocess_exec(
+        "git", "clone", "--depth", "50", url, str(dest),
+        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+    )
+    _, err = await proc.communicate()
+    if proc.returncode != 0:
+        raise RuntimeError(f"git clone failed: {err.decode(errors='replace')[:400]}")
+    # checkout sha
+    proc = await asyncio.create_subprocess_exec(
+        "git", "-C", str(dest), "checkout", sha,
+        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+    )
+    _, err = await proc.communicate()
+    if proc.returncode != 0:
+        raise RuntimeError(f"git checkout failed: {err.decode(errors='replace')[:400]}")
+
+
+async def _download_dataset(dataset_id: str, dest: Path) -> str:
+    row = await db.fetchrow(
+        "SELECT file_key, format FROM datasets WHERE id = $1", uuid.UUID(dataset_id)
+    )
+    if not row:
+        raise RuntimeError("dataset not found")
+    data = minio_client.get_bytes(row["file_key"], bucket="ml.datasets")
+    ext = {"csv": "csv", "json": "json", "netcdf": "nc"}.get(row["format"], "bin")
+    out = dest / f"data.{ext}"
+    out.write_bytes(data)
+    return str(out)
+
+
+def _stats_loop_sync(container, training_id: str, model_id: str, samples: list, stop_evt: asyncio.Event, loop: asyncio.AbstractEventLoop):
+    """Sincrono, eseguito in thread. Ogni 5s legge docker stats → Influx + samples."""
+    while not stop_evt.is_set():
+        try:
+            stats = container.stats(stream=False)
+            # CPU%
+            cpu_delta = stats["cpu_stats"]["cpu_usage"]["total_usage"] - stats["precpu_stats"]["cpu_usage"]["total_usage"]
+            sys_delta = stats["cpu_stats"].get("system_cpu_usage", 0) - stats["precpu_stats"].get("system_cpu_usage", 0)
+            online = stats["cpu_stats"].get("online_cpus") or len(stats["cpu_stats"]["cpu_usage"].get("percpu_usage") or [1])
+            cpu_pct = (cpu_delta / sys_delta) * online * 100.0 if sys_delta > 0 else 0.0
+            mem_mb = (stats["memory_stats"].get("usage") or 0) / (1024 * 1024)
+
+            samples.append((cpu_pct, mem_mb))
+            point = (
+                Point("ml_training")
+                .tag("training_id", training_id)
+                .tag("model_id", model_id)
+                .field("cpu_pct", float(cpu_pct))
+                .field("mem_mb", float(mem_mb))
+            )
+            asyncio.run_coroutine_threadsafe(influx_client.write_points([point]), loop)
+        except Exception as e:
+            log.warning("stats loop error: %s", e)
+        time.sleep(5)
+
+
+async def _stream_container_logs(container, training_id: str, model_id: str, stream_key: str):
+    """Legge stdout del container, pubblica righe JSON su Redis stream e Influx."""
+    def _iter():
+        return container.logs(stream=True, follow=True, stdout=True, stderr=True)
+
+    loop = asyncio.get_event_loop()
+    it = await loop.run_in_executor(None, _iter)
+
+    while True:
+        line = await loop.run_in_executor(None, next, it, None)
+        if line is None:
+            break
+        try:
+            text = line.decode("utf-8", errors="replace").rstrip("\n")
+        except Exception:
+            continue
+        if not text:
+            continue
+        # righe non-JSON → log
+        payload: dict
+        if text.startswith("{") and text.endswith("}"):
+            try:
+                payload = json.loads(text)
+            except json.JSONDecodeError:
+                payload = {"type": "log", "level": "info", "message": text}
+        else:
+            payload = {"type": "log", "level": "info", "message": text}
+
+        await _emit(stream_key, payload)
+
+        if payload.get("type") == "metric":
+            p = Point("ml_training").tag("training_id", training_id).tag("model_id", model_id)
+            for k, v in payload.items():
+                if k == "type":
+                    continue
+                if isinstance(v, (int, float)):
+                    p = p.field(k, float(v))
+            try:
+                await influx_client.write_points([p])
+            except Exception as e:
+                log.warning("influx write metric failed: %s", e)
+
+
+async def run_training_job(training_id: str) -> None:
+    """Esegue un job di training end-to-end. Aggiorna Postgres e Redis state."""
+    r = redis_client.client()
+    state_key = f"ml:train:{training_id}"
+    stream_key = f"ml:train:{training_id}:events"
+
+    tr = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", uuid.UUID(training_id))
+    if not tr:
+        log.error("training %s not found", training_id)
+        return
+    model = await db.fetchrow("SELECT * FROM models WHERE id = $1", tr["model_id"])
+    if not model:
+        await db.execute(
+            "UPDATE trainings SET status='failed', error=$2 WHERE id=$1",
+            uuid.UUID(training_id), "model not found",
+        )
+        return
+
+    await db.execute(
+        "UPDATE trainings SET status='running', started_at=NOW() WHERE id=$1",
+        uuid.UUID(training_id),
+    )
+    await r.hset(state_key, mapping={"status": "running", "progress": "0", "message": "starting"})
+
+    workdir = Path(settings.runner_tmp_dir) / training_id
+    artifacts_prefix = f"models/{tr['model_id']}/{tr['version']}/{tr['patch']}"
+    error: Optional[str] = None
+    samples: list[tuple[float, float]] = []
+    try:
+        workdir.mkdir(parents=True, exist_ok=True)
+        await _emit(stream_key, {"type": "log", "level": "info", "message": "cloning repo"})
+        await _clone_repo(model["gitea_repo"], tr["patch"], workdir / "repo")
+
+        await _emit(stream_key, {"type": "log", "level": "info", "message": "parsing model.yml"})
+        spec = await fetch_and_parse_spec(model["gitea_repo"], tr["patch"]) or {}
+        train_spec = spec.get("train", {})
+        entrypoint = train_spec.get("entrypoint") or "python -m src.train"
+        resources = spec.get("resources", {}) or {}
+
+        await _emit(stream_key, {"type": "log", "level": "info", "message": "downloading dataset"})
+        dataset_path = await _download_dataset(str(tr["dataset_id"]), workdir)
+
+        out_dir = workdir / "out"
+        out_dir.mkdir(exist_ok=True)
+
+        # docker run
+        dc = _docker_client()
+        await _emit(stream_key, {"type": "log", "level": "info", "message": "starting container"})
+        container = dc.containers.run(
+            settings.runner_image,
+            command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt 2>&1 || true && {entrypoint}"],
+            detach=True,
+            working_dir="/workdir/repo",
+            environment={
+                "MEB_DATASET_PATH": f"/workdir/{Path(dataset_path).name}",
+                "MEB_ARTIFACTS_DIR": "/workdir/out",
+                "MEB_TRAINING_ID": training_id,
+            },
+            volumes={str(workdir): {"bind": "/workdir", "mode": "rw"}},
+            network_mode="none",
+            mem_limit=f"{int(resources.get('mem_mb', 2048))}m",
+            nano_cpus=int(float(resources.get("cpu", 1)) * 1e9),
+            read_only=False,
+            tty=False,
+            detach_mode=None,
+        )
+
+        loop = asyncio.get_event_loop()
+        stop_evt = asyncio.Event()
+        stats_task = loop.run_in_executor(
+            None, _stats_loop_sync, container, training_id, str(tr["model_id"]), samples, stop_evt, loop
+        )
+        log_task = asyncio.create_task(
+            _stream_container_logs(container, training_id, str(tr["model_id"]), stream_key)
+        )
+
+        # attendi exit
+        exit_code = await loop.run_in_executor(None, lambda: container.wait()["StatusCode"])
+        stop_evt.set()
+        await log_task
+        try:
+            stats_task.cancel()
+        except Exception:
+            pass
+
+        if exit_code != 0:
+            error = f"container exited with code {exit_code}"
+
+        # raccogli outputs
+        results: dict = {}
+        final_metrics_path = out_dir / "metrics.json"
+        if final_metrics_path.exists():
+            try:
+                results = json.loads(final_metrics_path.read_text())
+            except Exception:
+                results = {"raw": final_metrics_path.read_text()[:10000]}
+
+        # upload artefatti (tutta la cartella out/)
+        for p in out_dir.rglob("*"):
+            if p.is_file():
+                rel = p.relative_to(out_dir).as_posix()
+                key = f"{artifacts_prefix}/{rel}"
+                minio_client.put_bytes(key, p.read_bytes())
+
+        # upload logs jsonl dallo stream redis (copia su minio per persistenza)
+        try:
+            entries = await r.xrange(stream_key, min="-", max="+")
+            lines = "\n".join(json.dumps({"id": i, **({"payload": json.loads(f.get("payload", "{}"))} if "payload" in f else f)}) for i, f in entries)
+            minio_client.put_bytes(f"trainings/{training_id}/logs.jsonl", lines.encode("utf-8"), "application/x-ndjson")
+        except Exception as e:
+            log.warning("log archive failed: %s", e)
+
+        cpu_avg = sum(s[0] for s in samples) / len(samples) if samples else 0.0
+        cpu_peak = max((s[0] for s in samples), default=0.0)
+        mem_avg = sum(s[1] for s in samples) / len(samples) if samples else 0.0
+        mem_peak = max((s[1] for s in samples), default=0.0)
+        resource_summary = {
+            "cpu_avg": round(cpu_avg, 2),
+            "cpu_peak": round(cpu_peak, 2),
+            "mem_avg_mb": round(mem_avg, 2),
+            "mem_peak_mb": round(mem_peak, 2),
+            "samples": len(samples),
+        }
+
+        status = "failed" if error else "succeeded"
+        await db.execute(
+            """
+            UPDATE trainings SET
+              status=$2,
+              finished_at=NOW(),
+              duration_ms=EXTRACT(EPOCH FROM (NOW() - started_at))*1000,
+              artifacts_prefix=$3,
+              results=$4::jsonb,
+              resource_summary=$5::jsonb,
+              error=$6
+            WHERE id=$1
+            """,
+            uuid.UUID(training_id),
+            status,
+            artifacts_prefix,
+            json.dumps(results),
+            json.dumps(resource_summary),
+            error,
+        )
+        await r.hset(state_key, mapping={"status": status, "progress": "100", "message": error or "done"})
+        await _emit(stream_key, {"type": "end", "status": status, "error": error})
+
+        # Flush dei punti Influx accumulati durante il training (batched).
+        await influx_client.flush()
+
+        try:
+            container.remove(force=True)
+        except Exception:
+            pass
+
+    except Exception as e:
+        log.exception("training %s failed: %s", training_id, e)
+        await db.execute(
+            "UPDATE trainings SET status='failed', finished_at=NOW(), error=$2 WHERE id=$1",
+            uuid.UUID(training_id), str(e)[:1000],
+        )
+        await r.hset(state_key, mapping={"status": "failed", "message": str(e)[:200]})
+        await _emit(stream_key, {"type": "end", "status": "failed", "error": str(e)[:400]})
+    finally:
+        # cleanup workdir
+        try:
+            shutil.rmtree(workdir, ignore_errors=True)
+        except Exception:
+            pass
+
+
+async def run_test_once(training_id: str, inputs: dict) -> dict:
+    """Esegue una singola predizione via container spawn."""
+    tr = await db.fetchrow(
+        "SELECT t.*, m.gitea_repo FROM trainings t JOIN models m ON t.model_id = m.id WHERE t.id=$1",
+        uuid.UUID(training_id),
+    )
+    if not tr:
+        raise RuntimeError("training not found")
+
+    spec = await fetch_and_parse_spec(tr["gitea_repo"], tr["patch"]) or {}
+    test_spec = spec.get("test") or {}
+    entrypoint = test_spec.get("entrypoint") or "python -m src.predict"
+
+    workdir = Path(settings.runner_tmp_dir) / f"test-{uuid.uuid4()}"
+    workdir.mkdir(parents=True, exist_ok=True)
+    try:
+        await _clone_repo(tr["gitea_repo"], tr["patch"], workdir / "repo")
+
+        # scarica artefatti
+        if tr["artifacts_prefix"]:
+            art_dir = workdir / "artifacts"
+            art_dir.mkdir(exist_ok=True)
+            for obj in minio_client.list_prefix(tr["artifacts_prefix"] + "/"):
+                rel = obj["name"][len(tr["artifacts_prefix"]) + 1:]
+                out_path = art_dir / rel
+                out_path.parent.mkdir(parents=True, exist_ok=True)
+                out_path.write_bytes(minio_client.get_bytes(obj["name"]))
+
+        dc = _docker_client()
+        payload = json.dumps({"inputs": inputs}).encode()
+        container = dc.containers.run(
+            settings.runner_image,
+            command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt 2>&1 >/dev/null || true && {entrypoint}"],
+            detach=True,
+            working_dir="/workdir/repo",
+            environment={
+                "MEB_ARTIFACTS_DIR": "/workdir/artifacts",
+                "MEB_TRAINING_ID": training_id,
+            },
+            volumes={str(workdir): {"bind": "/workdir", "mode": "ro"}},
+            network_mode="none",
+            mem_limit="2048m",
+            nano_cpus=int(1e9),
+            stdin_open=True,
+            tty=False,
+        )
+
+        # scrivi input su stdin via attach socket
+        sock = container.attach_socket(params={"stdin": 1, "stream": 1})
+        try:
+            sock._sock.sendall(payload + b"\n")
+        except Exception:
+            pass
+        try:
+            sock.close()
+        except Exception:
+            pass
+
+        loop = asyncio.get_event_loop()
+        # stats peak
+        peak_cpu = 0.0
+        peak_mem = 0.0
+        stop = False
+
+        def _stats():
+            nonlocal peak_cpu, peak_mem, stop
+            for st in container.stats(stream=True, decode=True):
+                if stop:
+                    return
+                try:
+                    cpu_delta = st["cpu_stats"]["cpu_usage"]["total_usage"] - st["precpu_stats"]["cpu_usage"]["total_usage"]
+                    sys_delta = st["cpu_stats"].get("system_cpu_usage", 0) - st["precpu_stats"].get("system_cpu_usage", 0)
+                    online = st["cpu_stats"].get("online_cpus") or 1
+                    cpu_pct = (cpu_delta / sys_delta) * online * 100 if sys_delta > 0 else 0
+                    mem_mb = (st["memory_stats"].get("usage") or 0) / (1024 * 1024)
+                    peak_cpu = max(peak_cpu, cpu_pct)
+                    peak_mem = max(peak_mem, mem_mb)
+                except Exception:
+                    pass
+
+        stats_fut = loop.run_in_executor(None, _stats)
+
+        exit_info = await loop.run_in_executor(None, container.wait)
+        stop = True
+        logs = container.logs(stdout=True, stderr=False).decode("utf-8", errors="replace")
+        try:
+            container.remove(force=True)
+        except Exception:
+            pass
+
+        outputs: dict = {}
+        for line in logs.strip().splitlines():
+            line = line.strip()
+            if line.startswith("{") and line.endswith("}"):
+                try:
+                    obj = json.loads(line)
+                    if "outputs" in obj:
+                        outputs = obj["outputs"]
+                        break
+                except json.JSONDecodeError:
+                    continue
+
+        return {
+            "outputs": outputs,
+            "exit_code": exit_info.get("StatusCode"),
+            "cpu_peak": round(peak_cpu, 2),
+            "mem_peak_mb": round(peak_mem, 2),
+            "raw_log": logs[-2000:],
+        }
+    finally:
+        shutil.rmtree(workdir, ignore_errors=True)
--- a/ml/core/gitea.py
+++ b/ml/core/gitea.py
@@ -0,0 +1,57 @@
+"""Client Gitea: browse repo, branches, commits, file raw, clone URL autenticato."""
+from __future__ import annotations
+
+from typing import Optional
+
+import httpx
+
+from core.config import settings
+
+
+def _headers() -> dict:
+    h = {"Accept": "application/json"}
+    if settings.gitea_token:
+        h["Authorization"] = f"token {settings.gitea_token}"
+    return h
+
+
+def clone_url(owner_repo: str) -> str:
+    """URL https://oauth2:TOKEN@<host>/owner/repo.git — usato SOLO lato server."""
+    if not settings.gitea_url:
+        raise RuntimeError("GITEA_URL not configured")
+    base = settings.gitea_url.rstrip("/")
+    if settings.gitea_token:
+        base = base.replace("https://", f"https://oauth2:{settings.gitea_token}@").replace(
+            "http://", f"http://oauth2:{settings.gitea_token}@"
+        )
+    return f"{base}/{owner_repo}.git"
+
+
+async def _get(path: str, params: Optional[dict] = None) -> list | dict:
+    url = f"{settings.gitea_url.rstrip('/')}/api/v1{path}"
+    async with httpx.AsyncClient(timeout=15.0) as c:
+        r = await c.get(url, params=params, headers=_headers())
+        r.raise_for_status()
+        return r.json()
+
+
+async def list_repos(limit: int = 50) -> list[dict]:
+    data = await _get("/repos/search", params={"limit": str(limit)})
+    return data.get("data", []) if isinstance(data, dict) else []
+
+
+async def list_branches(owner_repo: str) -> list[dict]:
+    return await _get(f"/repos/{owner_repo}/branches")
+
+
+async def list_commits(owner_repo: str, branch: str = "main", limit: int = 50) -> list[dict]:
+    return await _get(f"/repos/{owner_repo}/commits", params={"sha": branch, "limit": str(limit)})
+
+
+async def get_file_raw(owner_repo: str, ref: str, path: str) -> bytes:
+    """Scarica il file raw alla revisione indicata."""
+    url = f"{settings.gitea_url.rstrip('/')}/api/v1/repos/{owner_repo}/raw/{path}"
+    async with httpx.AsyncClient(timeout=15.0) as c:
+        r = await c.get(url, params={"ref": ref}, headers=_headers())
+        r.raise_for_status()
+        return r.content
--- a/ml/core/influx_client.py
+++ b/ml/core/influx_client.py
@@ -0,0 +1,75 @@
+"""Client InfluxDB (influxdb-client sync wrapper in thread-pool per async).
+
+Le scritture usano il batching async dell'SDK invece di SYNCHRONOUS.
+Le metriche di training arrivano in burst (logs container, stats loop ogni 5s):
+con SYNCHRONOUS ogni write era una HTTP request bloccante. Con WriteOptions
+batched, l'SDK accumula i Point e fa flush periodico in background, senza
+perdere durabilità (flush forzato a fine training).
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import Iterable, Optional
+
+from influxdb_client import InfluxDBClient, Point, WriteOptions
+
+from core.config import settings
+
+_client: Optional[InfluxDBClient] = None
+_write_api = None
+
+
+def client() -> InfluxDBClient:
+    global _client, _write_api
+    if _client is None:
+        _client = InfluxDBClient(
+            url=settings.influx_url, token=settings.influx_token, org=settings.influx_org
+        )
+        _write_api = _client.write_api(write_options=WriteOptions(
+            batch_size=200,
+            flush_interval=2_000,
+            jitter_interval=200,
+            retry_interval=2_000,
+            max_retries=3,
+        ))
+    return _client
+
+
+def _wa():
+    client()
+    return _write_api
+
+
+async def write_points(points: Iterable[Point]) -> None:
+    wa = _wa()
+    pts = list(points)
+    await asyncio.to_thread(wa.write, settings.influx_bucket, settings.influx_org, pts)
+
+
+async def flush() -> None:
+    """Forza il flush del buffer batched. Da chiamare a fine training per
+    garantire che tutte le metriche raccolte siano persistite."""
+    if _write_api is None:
+        return
+    try:
+        await asyncio.to_thread(_write_api.flush)
+    except Exception:
+        pass
+
+
+async def query_flux(flux: str) -> list[dict]:
+    c = client()
+    def _q():
+        tables = c.query_api().query(flux, org=settings.influx_org)
+        out = []
+        for table in tables:
+            for r in table.records:
+                out.append({
+                    "time": r.get_time().isoformat() if r.get_time() else None,
+                    "measurement": r.get_measurement(),
+                    "field": r.get_field(),
+                    "value": r.get_value(),
+                    "tags": {k: v for k, v in r.values.items() if k.startswith("_") is False and k not in ("result", "table")},
+                })
+        return out
+    return await asyncio.to_thread(_q)
--- a/ml/core/minio_client.py
+++ b/ml/core/minio_client.py
@@ -0,0 +1,118 @@
+"""Wrapper MinIO: bucket unico (settings.minio_bucket) con prefissi logici.
+
+Prefissi usati:
+  datasets/<uuid>.<ext>
+  models/<model_id>/spec.yml
+  models/<model_id>/<version>/<patch>/...   (artefatti training)
+  trainings/<training_id>/logs.jsonl
+"""
+from __future__ import annotations
+
+import io
+from datetime import timedelta
+from typing import Iterable, Optional
+
+from minio import Minio
+from minio.error import S3Error
+
+from core.config import settings
+
+
+_client: Optional[Minio] = None
+
+
+def client() -> Minio:
+    """Return the shared MinIO client, building it from settings on first call."""
+    global _client
+    if _client is not None:
+        return _client
+    endpoint = f"{settings.minio_endpoint}:{settings.minio_port}"
+    _client = Minio(
+        endpoint,
+        access_key=settings.minio_access_key,
+        secret_key=settings.minio_secret_key,
+        secure=settings.minio_use_ssl,
+    )
+    return _client
+
+
+def _bucket(b: Optional[str] = None) -> str:
+    """Resolve a bucket name, falling back to ``settings.minio_bucket`` when *b* is empty."""
+    if b:
+        return b
+    return settings.minio_bucket
+
+
+def ensure_bucket(bucket: Optional[str] = None) -> None:
+    """Create the bucket (default: the configured one) if it does not exist yet."""
+    target = _bucket(bucket)
+    mc = client()
+    if mc.bucket_exists(target):
+        return
+    mc.make_bucket(target)
+
+
+def put_bytes(key: str, data: bytes, content_type: str = "application/octet-stream",
+              bucket: Optional[str] = None) -> None:
+    """Upload an in-memory payload under *key*, creating the bucket if needed."""
+    ensure_bucket(bucket)
+    target = _bucket(bucket)
+    payload = io.BytesIO(data)
+    client().put_object(
+        target, key, payload, length=len(data), content_type=content_type
+    )
+
+
+def put_stream(key: str, stream, length: int, content_type: str = "application/octet-stream",
+               bucket: Optional[str] = None) -> None:
+    """Upload *length* bytes read from *stream* under *key*, creating the bucket if needed."""
+    ensure_bucket(bucket)
+    target = _bucket(bucket)
+    client().put_object(
+        target,
+        key,
+        stream,
+        length=length,
+        content_type=content_type,
+    )
+
+
+def get_bytes(key: str, bucket: Optional[str] = None) -> bytes:
+    """Download the whole object stored under *key* and return its bytes.
+
+    The response is closed and its connection released even when reading
+    fails.
+    """
+    response = client().get_object(_bucket(bucket), key)
+    try:
+        payload = response.read()
+    finally:
+        response.close()
+        response.release_conn()
+    return payload
+
+
+def remove(key: str, bucket: Optional[str] = None) -> None:
+    """Best-effort delete of one object: any ``S3Error`` is ignored."""
+    target = _bucket(bucket)
+    try:
+        client().remove_object(target, key)
+    except S3Error:
+        return
+
+
+def remove_prefix(prefix: str, bucket: Optional[str] = None) -> int:
+    """Delete every object under *prefix* (recursively) and return how many were removed.
+
+    Objects whose deletion raises ``S3Error`` are skipped and not counted.
+    """
+    target = _bucket(bucket)
+    removed = 0
+    for entry in client().list_objects(target, prefix=prefix, recursive=True):
+        try:
+            client().remove_object(target, entry.object_name)
+        except S3Error:
+            continue
+        removed += 1
+    return removed
+
+
+def presigned_get(key: str, expires_seconds: int = 3600, bucket: Optional[str] = None) -> str:
+    """Return a presigned GET URL for *key* that expires after *expires_seconds*."""
+    lifetime = timedelta(seconds=expires_seconds)
+    target = _bucket(bucket)
+    return client().presigned_get_object(target, key, expires=lifetime)
+
+
+def list_prefix(prefix: str, bucket: Optional[str] = None) -> list[dict]:
+    """List objects under *prefix* (recursively) as plain dicts.
+
+    Each dict carries ``name``, ``size``, ``last_modified`` (ISO-8601 string
+    or None) and ``etag``.
+    """
+    listing = client().list_objects(_bucket(bucket), prefix=prefix, recursive=True)
+    return [
+        {
+            "name": entry.object_name,
+            "size": entry.size,
+            "last_modified": entry.last_modified.isoformat() if entry.last_modified else None,
+            "etag": entry.etag,
+        }
+        for entry in listing
+    ]
+
+
+def check() -> bool:
+    """Health probe: True when listing buckets succeeds, False on any error."""
+    try:
+        client().list_buckets()
+    except Exception:
+        return False
+    return True
--- a/ml/core/model_spec.py
+++ b/ml/core/model_spec.py
@@ -0,0 +1,90 @@
+"""Parse e validazione del contratto `model.yml` nelle repo utente.
+
+Schema sintetico (vedi piano):
+  name, type, version, python
+  train: {entrypoint, inputs, outputs, metrics}
+  test:  {entrypoint, io, input_schema[], output_schema[]}
+  resources: {cpu, mem_mb, gpu}
+"""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import yaml
+from pydantic import BaseModel, ValidationError
+
+from core import gitea, redis_client
+
+
+# One entry of the `input_schema` / `output_schema` lists of the `test` section.
+class _FieldSpec(BaseModel):
+    # Required: field name and its declared data type (free-form string).
+    name: str
+    dtype: str
+    # Optional bounds and unit; they stay None when the spec omits them.
+    min: Optional[float] = None
+    max: Optional[float] = None
+    unit: Optional[str] = None
+
+
+# The `train` section of model.yml.
+class _Train(BaseModel):
+    # Required: entrypoint of the training code.
+    entrypoint: str
+    # Free-form mappings; each defaults to an empty dict when omitted.
+    inputs: dict = {}
+    outputs: dict = {}
+    metrics: dict = {}
+
+
+# The optional `test` section of model.yml.
+class _Test(BaseModel):
+    # Required: entrypoint of the test code.
+    entrypoint: str
+    # I/O mode identifier; "stdio_json" is used when the spec omits it.
+    io: str = "stdio_json"
+    # Declared input/output fields; empty lists when omitted.
+    input_schema: list[_FieldSpec] = []
+    output_schema: list[_FieldSpec] = []
+
+
+# Top-level schema of a repository's model.yml contract.
+class ModelSpec(BaseModel):
+    # Required identity fields.
+    name: str
+    type: str
+    # Defaults applied when the spec omits them.
+    version: str = "0.1.0"
+    python: str = "3.11"
+    # `train` is mandatory, `test` is optional.
+    train: _Train
+    test: Optional[_Test] = None
+    # Free-form resource mapping (the module docstring lists cpu, mem_mb, gpu).
+    resources: dict = {}
+
+
+def parse_yaml(content: bytes | str) -> dict:
+    """Parse a model.yml document and return it as a validated dict.
+
+    Args:
+        content: YAML text, or UTF-8 encoded bytes.
+
+    Returns:
+        The ``ModelSpec`` dumped to a plain dict (defaults filled in).
+
+    Raises:
+        ValueError: on malformed YAML, on a top level that is not a mapping,
+            or when the mapping fails ``ModelSpec`` validation.
+    """
+    if isinstance(content, bytes):
+        content = content.decode("utf-8")
+    try:
+        raw = yaml.safe_load(content) or {}
+        if not isinstance(raw, dict):
+            # e.g. a YAML list or scalar: ModelSpec(**raw) would raise TypeError.
+            raise ValueError(
+                f"invalid model.yml: top level must be a mapping, got {type(raw).__name__}"
+            )
+        spec = ModelSpec(**raw)
+        return spec.model_dump()
+    except (yaml.YAMLError, ValidationError, TypeError) as e:
+        # TypeError covers mappings with non-string keys passed as keywords.
+        raise ValueError(f"invalid model.yml: {e}") from e
+
+
+async def fetch_and_parse_spec(owner_repo: str, ref: str) -> Optional[dict]:
+    """Fetch model.yml (or model.yaml) from the repo at *ref* and parse it.
+
+    Results are cached in Redis under ``ml:modelspec:{repo}:{ref}`` for one
+    hour. Cache read and write failures are ignored. Returns None when
+    neither file can be fetched; a parse failure propagates as the
+    ``ValueError`` raised by ``parse_yaml``.
+    """
+    import json
+
+    cache_key = f"ml:modelspec:{owner_repo}:{ref}"
+    try:
+        hit = await redis_client.client().get(cache_key)
+        if hit:
+            return json.loads(hit)
+    except Exception:
+        pass
+
+    # Try the two accepted file names in order; the first successful fetch wins.
+    for filename in ("model.yml", "model.yaml"):
+        try:
+            raw = await gitea.get_file_raw(owner_repo, ref, filename)
+        except Exception:
+            continue
+        break
+    else:
+        return None
+    spec = parse_yaml(raw)
+
+    try:
+        await redis_client.client().set(cache_key, json.dumps(spec), ex=3600)
+    except Exception:
+        pass
+    return spec
--- a/ml/core/redis_client.py
+++ b/ml/core/redis_client.py
@@ -0,0 +1,29 @@
+"""Client Redis asincrono (redis-py asyncio). Singleton semplice."""
+from __future__ import annotations
+
+from typing import Optional
+
+import redis.asyncio as redis
+
+from core.config import settings
+
+_client: Optional[redis.Redis] = None
+
+
+def client() -> redis.Redis:
+    """Return the shared async Redis client, creating it on first call.
+
+    Responses are decoded to ``str`` and a health check interval of 30 is set.
+    """
+    global _client
+    if _client is not None:
+        return _client
+    _client = redis.Redis(
+        host=settings.redis_host,
+        port=settings.redis_port,
+        decode_responses=True,
+        health_check_interval=30,
+    )
+    return _client
+
+
+async def close() -> None:
+    """Close the shared Redis client, if any, and forget it."""
+    global _client
+    if _client is None:
+        return
+    await _client.aclose()
+    _client = None
--- a/ml/core/worker.py
+++ b/ml/core/worker.py
@@ -0,0 +1,54 @@
+"""Worker loop: BRPOP da ml:queue:train e dispatch al docker_runner.
+
+Parte N task asincroni concorrenti (settings.train_concurrency).
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+
+from core import redis_client
+from core.config import settings
+from core.docker_runner import run_training_job
+
+log = logging.getLogger(__name__)
+
+_tasks: list[asyncio.Task] = []
+
+
+async def _worker_loop(idx: int):
+    """Consume training ids from ``ml:queue:train`` forever and run each job.
+
+    A failing BRPOP is logged and retried after 2 seconds. A job that raises
+    is logged and the loop moves on to the next id.
+    """
+    conn = redis_client.client()
+    log.info("ml worker[%d] started", idx)
+    while True:
+        try:
+            popped = await conn.brpop("ml:queue:train", timeout=10)
+        except Exception as e:
+            log.warning("brpop error: %s", e)
+            await asyncio.sleep(2)
+            continue
+        if popped is not None:
+            # BRPOP yields (queue name, value); only the value is needed.
+            _queue, training_id = popped
+            log.info("worker[%d] picked training %s", idx, training_id)
+            try:
+                await run_training_job(training_id)
+            except Exception:
+                log.exception("worker[%d] training %s crashed", idx, training_id)
+
+
+def start_workers() -> None:
+    """Spawn the worker tasks: ``settings.train_concurrency`` of them, at least one.
+
+    Must be called while an event loop is running, because it creates tasks.
+    """
+    global _tasks
+    count = max(1, settings.train_concurrency)
+    _tasks.extend(asyncio.create_task(_worker_loop(i)) for i in range(count))
+
+
+async def stop_workers() -> None:
+    """Cancel every worker task and wait until all of them have finished.
+
+    Awaiting a cancelled task raises ``asyncio.CancelledError``, which is a
+    ``BaseException`` since Python 3.8 and is therefore not caught by
+    ``except Exception``. ``gather(..., return_exceptions=True)`` collects the
+    cancellation (and any other error) of each task as a result, so shutdown
+    always reaches ``_tasks.clear()``.
+    """
+    for t in _tasks:
+        t.cancel()
+    await asyncio.gather(*_tasks, return_exceptions=True)
+    _tasks.clear()
--- a/ml/main.py
+++ b/ml/main.py
@@ -1,19 +1,90 @@
-from fastapi import FastAPI, Request, Response, Header
-from fastapi.responses import HTMLResponse, JSONResponse
-import time
+"""ml-service — FastAPI entrypoint.
+
+Monta:
+  /                 → RedirectResponse
+  /datasets /models /train /test /results    → pagine Jinja
+  /api/datasets /api/models /api/repos /api/trainings /api/tests /api/results → JSON
+  /api/trainings/{id}/events → SSE
+  /health           → check
+  /static/*         → file statici
+"""
+from __future__ import annotations
+
+import logging
+from contextlib import asynccontextmanager
+from pathlib import Path
+
+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+
+from core import db, minio_client, redis_client, worker
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+log = logging.getLogger(__name__)
+
+STATIC_DIR = Path(__file__).resolve().parent / "static"
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application lifespan: start dependencies, run, then tear them down.
+
+    Startup opens the DB pool, makes a best-effort attempt to create the MinIO
+    bucket (a failure is only logged) and starts the training workers.
+    Shutdown stops the workers, closes the DB pool and closes Redis; each step
+    sits in its own ``finally`` so a failure in one does not skip the others.
+    """
+    log.info("ml-service starting")
+    await db.init_pool()
+    try:
+        minio_client.ensure_bucket()
+    except Exception as e:
+        log.warning("minio bucket ensure failed: %s", e)
+    worker.start_workers()
+    try:
+        yield
+    finally:
+        log.info("ml-service stopping")
+        try:
+            await worker.stop_workers()
+        finally:
+            try:
+                await db.close_pool()
+            finally:
+                await redis_client.close()
+
+
+app = FastAPI(title="MEB ML Service", lifespan=lifespan)
+
+# static
+app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")

-app = FastAPI()

@app.get("/health")
-def health():
+async def health():
+    pg_ok = True
+    try:
+        await db.fetchrow("SELECT 1")
+    except Exception:
+        pg_ok = False
+    redis_ok = True
+    try:
+        await redis_client.client().ping()
+    except Exception:
+        redis_ok = False
    return {
-        "status": "ok",
+        "status": "ok" if (pg_ok and redis_ok) else "degraded",
        "service": "ml",
-        "version": "1.0.0",
-        "build_number": "1",
-        "version_state": "dev"
+        "postgres": "connected" if pg_ok else "disconnected",
+        "redis": "connected" if redis_ok else "disconnected",
+        "minio": "connected" if minio_client.check() else "disconnected",
+        "version": "2.0.0",
    }

-@app.get("/")
-def root():
-    return {"message": "ML Service"}
+
+from routers import (  # noqa: E402
+    datasets,
+    models,
+    pages,
+    repos,
+    results,
+    tests,
+    trainings,
+    trainings_stream,
+)
+
+app.include_router(pages.router)
+app.include_router(datasets.router)
+app.include_router(models.router)
+app.include_router(repos.router)
+app.include_router(trainings.router)
+app.include_router(trainings_stream.router)
+app.include_router(tests.router)
+app.include_router(results.router)
--- a/ml/requirements.txt
+++ b/ml/requirements.txt
@@ -1,3 +1,15 @@
 fastapi
-uvicorn
+uvicorn[standard]
 PyJWT
+asyncpg
+redis>=5
+minio
+influxdb-client
+docker
+PyYAML
+pydantic>=2
+python-multipart
+jinja2
+aiofiles
+httpx
+sse-starlette
--- a/ml/routers/datasets.py
+++ b/ml/routers/datasets.py
@@ -0,0 +1,160 @@
+"""Datasets API (ml.mebboat.it/api/datasets).
+
+Upload/list/get/download/delete. Storage:
+  MinIO bucket "ml.datasets" with key "<uuid>.<ext>" (no key prefix)
+  Postgres db "ml", table "datasets"
+"""
+from __future__ import annotations
+
+import json
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
+
+from core import db, minio_client
+from core.auth import require_auth
+
+router = APIRouter(prefix="/api/datasets", tags=["datasets"])
+
+# Bucket MinIO fisso per tutti i dataset (no prefix nelle key).
+BUCKET = "ml.datasets"
+_EXT = {"csv": "csv", "json": "json", "netcdf": "nc"}
+
+
+def _row(r) -> Optional[dict]:
+    """Convert a DB row into a JSON-friendly dict.
+
+    Returns None for a missing row. The timestamp/date columns are turned
+    into ISO-8601 strings when they expose ``isoformat``; every other value
+    is passed through unchanged.
+    """
+    if r is None:
+        return None
+    d = dict(r)
+    for k in ("created_at", "updated_at", "start_date", "end_date"):
+        if d.get(k) is not None and hasattr(d[k], "isoformat"):
+            d[k] = d[k].isoformat()
+    return d
+
+
+@router.get("")
+async def list_datasets(
+    type: Optional[str] = Query(None),
+    tags: Optional[str] = Query(None),
+    mine: Optional[int] = Query(None),
+    search: Optional[str] = Query(None),
+    user=Depends(require_auth),
+):
+    """List up to 500 datasets, newest first, with optional filters.
+
+    ``type`` matches exactly, ``tags`` is a comma-separated list matched with
+    the array-overlap operator, ``mine`` restricts to the caller's username,
+    and ``search`` is a case-insensitive substring match on name or
+    description.
+    """
+    clauses: list = []
+    params: list = []
+
+    def _bind(value) -> str:
+        # Register a query argument and return its positional placeholder.
+        params.append(value)
+        return f"${len(params)}"
+
+    if type:
+        clauses.append(f"type = {_bind(type)}")
+    if tags:
+        wanted = [t.strip() for t in tags.split(",") if t.strip()]
+        if wanted:
+            clauses.append(f"tags && {_bind(wanted)}")
+    if mine and user.get("username"):
+        clauses.append(f"created_by = {_bind(user['username'])}")
+    if search:
+        placeholder = _bind(f"%{search}%")
+        clauses.append(f"(nome ILIKE {placeholder} OR description ILIKE {placeholder})")
+
+    sql = "SELECT * FROM datasets"
+    if clauses:
+        sql += " WHERE " + " AND ".join(clauses)
+    sql += " ORDER BY created_at DESC LIMIT 500"
+    found = await db.fetch(sql, *params)
+    return {"count": len(found), "datasets": [_row(rec) for rec in found]}
+
+
+@router.post("", status_code=201)
+async def upload_dataset(
+    file: UploadFile = File(...),
+    metadata: str = Form("{}"),
+    user=Depends(require_auth),
+):
+    """Store an uploaded file in MinIO and register it in the datasets table.
+
+    ``metadata`` is a JSON object sent as a form field. An unknown or missing
+    format falls back to ``csv``. Responds 400 when the metadata is not valid
+    JSON or is not a JSON object. If the database insert fails, the object
+    just uploaded is removed again so no orphan is left in the bucket.
+    """
+    try:
+        meta = json.loads(metadata or "{}")
+    except json.JSONDecodeError:
+        raise HTTPException(400, "metadata must be valid JSON")
+    if not isinstance(meta, dict):
+        # Valid JSON such as a list or number would crash on meta.get below.
+        raise HTTPException(400, "metadata must be a JSON object")
+
+    fmt = meta.get("format") or meta.get("type") or "csv"
+    if fmt not in ("csv", "json", "netcdf"):
+        fmt = "csv"
+    ext = _EXT[fmt]
+    ds_id = str(uuid.uuid4())
+    file_key = f"{ds_id}.{ext}"
+
+    data = await file.read()
+    minio_client.put_bytes(file_key, data, content_type=file.content_type or "application/octet-stream", bucket=BUCKET)
+
+    created_by = user.get("username") or meta.get("created_by") or "unknown"
+    try:
+        row = await db.fetchrow(
+            """
+            INSERT INTO datasets (
+              id, file_key, nome, description, tags, type, format, notes,
+              created_by, size_bytes, copernicus_id
+            ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
+            RETURNING *
+            """,
+            uuid.UUID(ds_id),
+            file_key,
+            meta.get("nome") or file.filename or file_key,
+            meta.get("description"),
+            meta.get("tags") or [],
+            meta.get("dataset_type") or "custom",
+            fmt,
+            meta.get("notes"),
+            created_by,
+            len(data),
+            meta.get("copernicus_id") or meta.get("copernicus_dataset_id"),
+        )
+    except Exception:
+        # Roll back the upload: without a row nothing references the object.
+        minio_client.remove(file_key, bucket=BUCKET)
+        raise
+    return _row(row)
+
+
+@router.get("/{dataset_id}")
+async def get_dataset(dataset_id: str, user=Depends(require_auth)):
+    """Return one dataset row; 404 when the id is unknown or not a valid UUID."""
+    try:
+        ds_uuid = uuid.UUID(dataset_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    row = await db.fetchrow("SELECT * FROM datasets WHERE id = $1", ds_uuid)
+    if not row:
+        raise HTTPException(404, "not found")
+    return _row(row)
+
+
+@router.get("/{dataset_id}/download")
+async def download_dataset(dataset_id: str, user=Depends(require_auth)):
+    """Return a presigned download URL valid for one hour.
+
+    Responds 404 when the id is unknown or not a valid UUID.
+    """
+    try:
+        ds_uuid = uuid.UUID(dataset_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    row = await db.fetchrow("SELECT file_key FROM datasets WHERE id = $1", ds_uuid)
+    if not row:
+        raise HTTPException(404, "not found")
+    url = minio_client.presigned_get(row["file_key"], 3600, bucket=BUCKET)
+    return {"url": url, "expires_in": 3600}
+
+
+@router.patch("/{dataset_id}")
+async def patch_dataset(dataset_id: str, body: dict, user=Depends(require_auth)):
+    """Update the editable fields of a dataset and return the new row.
+
+    Only ``nome``, ``description``, ``tags`` and ``notes`` are honoured; other
+    keys are ignored. Responds 400 when no editable field is given and 404
+    when the id is unknown or not a valid UUID.
+    """
+    try:
+        ds_uuid = uuid.UUID(dataset_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    allowed = {"nome", "description", "tags", "notes"}
+    sets = []
+    args: list = []
+    for k, v in body.items():
+        if k in allowed:
+            # Column names come only from the whitelist; values are bound.
+            args.append(v)
+            sets.append(f"{k} = ${len(args)}")
+    if not sets:
+        raise HTTPException(400, "no fields to update")
+    # No updated_at trigger exists in the DB, so it is bumped manually.
+    sets.append("updated_at = NOW()")
+    args.append(ds_uuid)
+    row = await db.fetchrow(
+        f"UPDATE datasets SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
+        *args,
+    )
+    if not row:
+        raise HTTPException(404, "not found")
+    return _row(row)
+
+
+@router.delete("/{dataset_id}", status_code=204)
+async def delete_dataset(dataset_id: str, user=Depends(require_auth)):
+    """Delete a dataset row and then its stored object.
+
+    The row is deleted first: if the database refuses the delete, the file is
+    left untouched instead of leaving a row that points at a missing object.
+    Responds 404 when the id is unknown or not a valid UUID.
+    """
+    try:
+        ds_uuid = uuid.UUID(dataset_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    row = await db.fetchrow(
+        "DELETE FROM datasets WHERE id = $1 RETURNING file_key", ds_uuid
+    )
+    if not row:
+        raise HTTPException(404, "not found")
+    minio_client.remove(row["file_key"], bucket=BUCKET)
+    return None
--- a/ml/routers/models.py
+++ b/ml/routers/models.py
@@ -0,0 +1,131 @@
+"""API /api/models — registro modelli (repo Gitea + metadata)."""
+from __future__ import annotations
+
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from core import db
+from core.auth import require_auth
+from core.model_spec import fetch_and_parse_spec
+
+router = APIRouter(prefix="/api/models", tags=["models"])
+
+
+def _row(r) -> Optional[dict]:
+    """Convert a DB row into a dict with ISO-8601 ``created_at`` / ``updated_at``.
+
+    Returns None for a missing row.
+    """
+    if r is None:
+        return None
+    record = dict(r)
+    for column in ("created_at", "updated_at"):
+        value = record.get(column)
+        if value is not None and hasattr(value, "isoformat"):
+            record[column] = value.isoformat()
+    return record
+
+
+@router.get("")
+async def list_models(user=Depends(require_auth)):
+    """List up to 500 registered models, newest first."""
+    found = await db.fetch("SELECT * FROM models ORDER BY created_at DESC LIMIT 500")
+    models = [_row(rec) for rec in found]
+    return {"count": len(found), "models": models}
+
+
+@router.post("", status_code=201)
+async def create_model(body: dict, user=Depends(require_auth)):
+    """Register a model backed by a Gitea repo and return the new row.
+
+    ``name``, ``type`` and ``gitea_repo`` are required (400 otherwise); the
+    branch defaults to ``main``. The spec is pre-loaded from the branch on a
+    best-effort basis: any failure stores no spec instead of failing the
+    request.
+    """
+    missing = next(
+        (field for field in ("name", "type", "gitea_repo") if not body.get(field)),
+        None,
+    )
+    if missing is not None:
+        raise HTTPException(400, f"missing field: {missing}")
+
+    branch = body.get("default_branch") or "main"
+    try:
+        spec = await fetch_and_parse_spec(body["gitea_repo"], branch)
+    except Exception:
+        spec = None
+
+    row = await db.fetchrow(
+        """
+        INSERT INTO models (name, type, gitea_repo, default_branch, spec, created_by)
+        VALUES ($1,$2,$3,$4,$5,$6)
+        RETURNING *
+        """,
+        body["name"],
+        body["type"],
+        body["gitea_repo"],
+        branch,
+        spec,
+        user.get("username") or "unknown",
+    )
+    return _row(row)
+
+
+@router.get("/{model_id}")
+async def get_model(model_id: str, user=Depends(require_auth)):
+    """Return one model row; 404 when the id is unknown or not a valid UUID."""
+    try:
+        model_uuid = uuid.UUID(model_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_uuid)
+    if not row:
+        raise HTTPException(404, "not found")
+    return _row(row)
+
+
+@router.patch("/{model_id}")
+async def patch_model(model_id: str, body: dict, user=Depends(require_auth)):
+    """Update the editable fields of a model and return the new row.
+
+    Only ``name``, ``type`` and ``default_branch`` are honoured; other keys
+    are ignored. Responds 400 when no editable field is given and 404 when the
+    id is unknown or not a valid UUID.
+    """
+    try:
+        model_uuid = uuid.UUID(model_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    allowed = {"name", "type", "default_branch"}
+    sets = []
+    args: list = []
+    for k, v in body.items():
+        if k in allowed:
+            # Column names come only from the whitelist; values are bound.
+            args.append(v)
+            sets.append(f"{k} = ${len(args)}")
+    if not sets:
+        raise HTTPException(400, "no fields to update")
+    args.append(model_uuid)
+    row = await db.fetchrow(
+        f"UPDATE models SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
+        *args,
+    )
+    if not row:
+        raise HTTPException(404, "not found")
+    return _row(row)
+
+
+@router.delete("/{model_id}", status_code=204)
+async def delete_model(model_id: str, user=Depends(require_auth)):
+    """Delete a model row.
+
+    Deleting an id that does not exist still answers 204; an id that is not a
+    valid UUID answers 404 instead of crashing with a 500.
+    """
+    try:
+        model_uuid = uuid.UUID(model_id)
+    except ValueError:
+        raise HTTPException(404, "not found")
+    await db.execute("DELETE FROM models WHERE id = $1", model_uuid)
+    return None
+
+
+# ── Notes ──────────────────────────────────────────────────────────────────
+@router.get("/{model_id}/notes")
+async def list_notes(model_id: str, user=Depends(require_auth)):
+    """Return the notes attached to a model, newest first.
+
+    Responds 404 when the id is not a valid UUID.
+    """
+    try:
+        model_uuid = uuid.UUID(model_id)
+    except ValueError:
+        # A malformed id cannot identify a model; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    rows = await db.fetch(
+        "SELECT id, author, text, created_at FROM model_notes WHERE model_id = $1 ORDER BY created_at DESC",
+        model_uuid,
+    )
+    return [
+        {
+            "id": str(r["id"]),
+            "author": r["author"],
+            "text": r["text"],
+            "created_at": r["created_at"].isoformat(),
+        }
+        for r in rows
+    ]
+
+
+@router.post("/{model_id}/notes", status_code=201)
+async def add_note(model_id: str, body: dict, user=Depends(require_auth)):
+    """Attach a note, authored by the caller, to a model and return it.
+
+    Responds 400 when ``text`` is missing or blank and 404 when the id is not
+    a valid UUID.
+    """
+    try:
+        model_uuid = uuid.UUID(model_id)
+    except ValueError:
+        # A malformed id cannot identify a model; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    text = (body.get("text") or "").strip()
+    if not text:
+        raise HTTPException(400, "text required")
+    row = await db.fetchrow(
+        "INSERT INTO model_notes (model_id, author, text) VALUES ($1, $2, $3) RETURNING *",
+        model_uuid,
+        user.get("username") or "unknown",
+        text,
+    )
+    return {
+        "id": str(row["id"]),
+        "author": row["author"],
+        "text": row["text"],
+        "created_at": row["created_at"].isoformat(),
+    }
--- a/ml/routers/pages.py
+++ b/ml/routers/pages.py
@@ -0,0 +1,75 @@
+"""Pagine HTML servite direttamente da ml.mebboat.it.
+
+Layout:
+  /           redirect a /datasets (o landing console)
+  /datasets   lista/upload dataset
+  /models     registro modelli
+  /train      avvia training
+  /test       esegue test su modello trainato
+  /results    storico e confronto risultati
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi import APIRouter, Depends, Request
+from fastapi.responses import HTMLResponse, RedirectResponse
+from fastapi.templating import Jinja2Templates
+
+from core.auth import _verify
+from core.config import settings
+
+router = APIRouter(tags=["pages"])
+
+TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
+templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
+
+
+def _user_or_redirect(request: Request):
+    """Authenticate a page request, or build a redirect to the login page.
+
+    The token is read from the ``auth_token`` cookie, falling back to an
+    ``Authorization: Bearer`` header. Returns the user dict on success,
+    otherwise a 302 ``RedirectResponse`` to ``settings.auth_login_url`` with
+    the current URL in the ``redirect`` query parameter.
+    """
+    from urllib.parse import quote
+
+    token = request.cookies.get("auth_token")
+    auth = request.headers.get("authorization")
+    if not token and auth and auth.startswith("Bearer "):
+        token = auth[7:]
+    user = _verify(token)
+    if not user:
+        # Percent-encode the whole URL: its own '?', '&' and '#' would
+        # otherwise be read as part of the login URL's query string.
+        target = quote(str(request.url), safe="")
+        return RedirectResponse(url=f"{settings.auth_login_url}?redirect={target}", status_code=302)
+    return user
+
+
+def _render(request: Request, template: str, **ctx):
+    """Render *template* for an authenticated user, or return the login redirect.
+
+    The template context holds ``request``, ``user`` and any extra keyword
+    arguments.
+    """
+    outcome = _user_or_redirect(request)
+    if not isinstance(outcome, RedirectResponse):
+        context = {"request": request, "user": outcome, **ctx}
+        return templates.TemplateResponse(template, context)
+    return outcome
+
+
+@router.get("/", response_class=HTMLResponse)
+async def home(request: Request):
+    """Redirect the site root to the datasets page."""
+    landing = "/datasets"
+    return RedirectResponse(url=landing)
+
+
+@router.get("/datasets", response_class=HTMLResponse)
+async def page_datasets(request: Request):
+    """Serve the datasets page (or the login redirect when unauthenticated)."""
+    response = _render(request, "datasets.html", page="datasets")
+    return response
+
+
+@router.get("/models", response_class=HTMLResponse)
+async def page_models(request: Request):
+    """Serve the models page (or the login redirect when unauthenticated)."""
+    response = _render(request, "models.html", page="models")
+    return response
+
+
+@router.get("/train", response_class=HTMLResponse)
+async def page_train(request: Request):
+    """Serve the training page (or the login redirect when unauthenticated)."""
+    response = _render(request, "train.html", page="train")
+    return response
+
+
+@router.get("/test", response_class=HTMLResponse)
+async def page_test(request: Request):
+    """Serve the test page (or the login redirect when unauthenticated)."""
+    response = _render(request, "test.html", page="test")
+    return response
+
+
+@router.get("/results", response_class=HTMLResponse)
+async def page_results(request: Request):
+    """Serve the results page (or the login redirect when unauthenticated)."""
+    response = _render(request, "results.html", page="results")
+    return response
--- a/ml/routers/repos.py
+++ b/ml/routers/repos.py
@@ -0,0 +1,51 @@
+"""API /api/repos — proxy autenticato verso Gitea."""
+from __future__ import annotations
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from core import gitea
+from core.auth import require_auth
+from core.model_spec import fetch_and_parse_spec
+
+router = APIRouter(prefix="/api/repos", tags=["repos"])
+
+
+@router.get("")
+async def list_repos(user=Depends(require_auth)):
+    """Proxy the Gitea repository list; any Gitea failure becomes a 502."""
+    try:
+        repos = await gitea.list_repos()
+    except Exception as e:
+        raise HTTPException(502, f"gitea: {e}")
+    return repos
+
+
+@router.get("/{owner}/{repo}/branches")
+async def branches(owner: str, repo: str, user=Depends(require_auth)):
+    """Proxy the branch list of ``owner/repo``; any Gitea failure becomes a 502."""
+    full_name = f"{owner}/{repo}"
+    try:
+        listing = await gitea.list_branches(full_name)
+    except Exception as e:
+        raise HTTPException(502, f"gitea: {e}")
+    return listing
+
+
+@router.get("/{owner}/{repo}/commits")
+async def commits(owner: str, repo: str, branch: str = Query("main"), user=Depends(require_auth)):
+    """Proxy the commit list of a branch (default ``main``); Gitea failures become a 502."""
+    full_name = f"{owner}/{repo}"
+    try:
+        listing = await gitea.list_commits(full_name, branch)
+    except Exception as e:
+        raise HTTPException(502, f"gitea: {e}")
+    return listing
+
+
+@router.get("/{owner}/{repo}/file")
+async def file_raw(owner: str, repo: str, ref: str, path: str, user=Depends(require_auth)):
+    """Return a repository file as text plus its size in bytes.
+
+    The content is decoded as UTF-8 with undecodable bytes replaced. Any
+    failure is reported as a 404.
+    """
+    try:
+        blob = await gitea.get_file_raw(f"{owner}/{repo}", ref, path)
+        text = blob.decode("utf-8", errors="replace")
+        return {"content": text, "size": len(blob)}
+    except Exception as e:
+        raise HTTPException(404, f"file not found: {e}")
+
+
+@router.get("/{owner}/{repo}/spec")
+async def spec(owner: str, repo: str, ref: str = Query("main"), user=Depends(require_auth)):
+    """Return the parsed model spec of ``owner/repo`` at *ref* (default ``main``).
+
+    Responds 404 when no spec file is found at the ref and 422 when the file
+    exists but is not a valid spec.
+    """
+    try:
+        s = await fetch_and_parse_spec(f"{owner}/{repo}", ref)
+    except ValueError as e:
+        # fetch_and_parse_spec signals an invalid spec file with ValueError;
+        # that is a problem with the repo content, not a server error.
+        raise HTTPException(422, str(e))
+    if s is None:
+        raise HTTPException(404, "model.yml not found at ref")
+    return s
--- a/ml/routers/results.py
+++ b/ml/routers/results.py
@@ -0,0 +1,89 @@
+"""API /api/results — lista trainings/tests + compare multi-training."""
+from __future__ import annotations
+
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from core import db, influx_client
+from core.auth import require_auth
+from core.config import settings
+
+router = APIRouter(prefix="/api/results", tags=["results"])
+
+
+def _row(r):
+    """Convert a DB row into a dict with ISO-8601 timestamps.
+
+    Returns None for a missing row. The columns ``queued_at``, ``started_at``,
+    ``finished_at`` and ``ended_at`` are converted when they expose
+    ``isoformat``; every other value is passed through unchanged.
+    """
+    if r is None:
+        return None
+    d = dict(r)
+    # "started_at" was listed twice in the original tuple; once is enough.
+    for k in ("queued_at", "started_at", "finished_at", "ended_at"):
+        if d.get(k) is not None and hasattr(d[k], "isoformat"):
+            d[k] = d[k].isoformat()
+    return d
+
+
+@router.get("")
+async def list_results(
+    model_id: Optional[str] = Query(None),
+    user=Depends(require_auth),
+):
+    """List up to 200 trainings, optionally restricted to one model.
+
+    Rows are ordered by ``finished_at`` descending (unfinished last), then by
+    ``queued_at`` descending. Responds 400 when ``model_id`` is not a valid
+    UUID.
+    """
+    where = []
+    args: list = []
+    if model_id:
+        try:
+            args.append(uuid.UUID(model_id))
+        except ValueError:
+            # Reject a malformed filter instead of failing with a 500.
+            raise HTTPException(400, "model_id must be a valid UUID")
+        where.append(f"model_id = ${len(args)}")
+    sql = "SELECT * FROM trainings"
+    if where:
+        sql += " WHERE " + " AND ".join(where)
+    sql += " ORDER BY finished_at DESC NULLS LAST, queued_at DESC LIMIT 200"
+    rows = await db.fetch(sql, *args)
+    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
+
+
+async def _training_timeseries(training_id: uuid.UUID) -> list:
+    """Best-effort fetch of the ``ml_training`` points tagged with *training_id*.
+
+    Looks at the last 90 days. The id is interpolated in its canonical string
+    form, so only hex digits and hyphens reach the Flux query. Any query
+    failure yields an empty list.
+    """
+    flux = (
+        f'from(bucket:"{settings.influx_bucket}") '
+        f'|> range(start:-90d) '
+        f'|> filter(fn: (r) => r._measurement == "ml_training" and r.training_id == "{training_id}")'
+    )
+    try:
+        return await influx_client.query_flux(flux)
+    except Exception:
+        return []
+
+
+# NOTE: this route must be registered before "/{training_id}". Routes are
+# matched in declaration order, so otherwise "/compare" would be captured as a
+# training id and never reach this handler.
+@router.get("/compare")
+async def compare(
+    trainings: str = Query(..., description="comma-separated training IDs"),
+    user=Depends(require_auth),
+):
+    """Return row and timeseries for each of several trainings.
+
+    At least two ids are required (400 otherwise). Ids that are not valid
+    UUIDs or do not exist are skipped silently.
+    """
+    ids = [s.strip() for s in trainings.split(",") if s.strip()]
+    if len(ids) < 2:
+        raise HTTPException(400, "at least 2 training IDs required")
+    out = []
+    for tid in ids:
+        try:
+            tid_uuid = uuid.UUID(tid)
+        except ValueError:
+            continue
+        row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", tid_uuid)
+        if not row:
+            continue
+        ts = await _training_timeseries(tid_uuid)
+        out.append({"training": _row(row), "timeseries": ts})
+    return {"results": out}
+
+
+@router.get("/{training_id}")
+async def get_result(training_id: str, user=Depends(require_auth)):
+    """Return one training row together with its InfluxDB timeseries.
+
+    Responds 404 when the id is unknown or not a valid UUID. The timeseries is
+    empty when the Influx query fails.
+    """
+    try:
+        tid_uuid = uuid.UUID(training_id)
+    except ValueError:
+        # A malformed id cannot match any row; answer 404 rather than a 500.
+        raise HTTPException(404, "not found")
+    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", tid_uuid)
+    if not row:
+        raise HTTPException(404, "not found")
+    ts = await _training_timeseries(tid_uuid)
+    return {"training": _row(row), "timeseries": ts}
--- a/ml/routers/tests.py
+++ b/ml/routers/tests.py
@@ -0,0 +1,109 @@
+"""API /api/tests — sessioni di test su training esistente (max 2 utenti simultanei)."""
+from __future__ import annotations
+
+import json
+import time
+import uuid
+from typing import Optional
+
+import httpx
+from fastapi import APIRouter, Depends, HTTPException
+
+from core import api_client, db, minio_client
+from core.auth import require_auth
+from core.docker_runner import run_test_once
+
+router = APIRouter(prefix="/api/tests", tags=["tests"])
+
+
+def _row(r):
+    """Convert a DB row into a dict with ISO-8601 ``started_at`` / ``ended_at``.
+
+    Returns None for a missing row.
+    """
+    if r is None:
+        return None
+    record = dict(r)
+    for column in ("started_at", "ended_at"):
+        value = record.get(column)
+        if value is not None and hasattr(value, "isoformat"):
+            record[column] = value.isoformat()
+    return record
+
+
+@router.post("/sessions", status_code=201)
+async def start_session(body: dict, user=Depends(require_auth)):
+    """Open a test session on a successfully completed training.
+
+    A slot is reserved through ``api_client.page_connect`` before the session
+    row is inserted. Responds 400 for a missing or malformed ``training_id``,
+    404 for an unknown training, 409 when its status is not ``succeeded``,
+    429 when the API reports that all slots are taken, and 502 for any other
+    API error. If the insert fails, the reserved slot is released again.
+    """
+    training_id = body.get("training_id")
+    if not training_id:
+        raise HTTPException(400, "training_id required")
+    try:
+        # str() so that a non-string JSON value is rejected as malformed too.
+        training_uuid = uuid.UUID(str(training_id))
+    except ValueError:
+        raise HTTPException(400, "training_id must be a valid UUID")
+
+    tr = await db.fetchrow(
+        "SELECT id, status FROM trainings WHERE id = $1", training_uuid
+    )
+    if not tr:
+        raise HTTPException(404, "training not found")
+    if tr["status"] != "succeeded":
+        raise HTTPException(409, "training not completed")
+
+    username = user.get("username") or "unknown"
+    sid = str(uuid.uuid4())
+    try:
+        await api_client.page_connect("test", username, sid)
+    except httpx.HTTPStatusError as e:
+        if e.response.status_code == 429:
+            raise HTTPException(429, "test slots full (max 2 users)")
+        raise HTTPException(502, f"api: {e}")
+    except httpx.HTTPError as e:
+        # Transport-level failures (connect error, timeout, ...).
+        raise HTTPException(502, f"api: {e}")
+
+    try:
+        row = await db.fetchrow(
+            "INSERT INTO tests (id, training_id, user_id) VALUES ($1,$2,$3) RETURNING *",
+            uuid.UUID(sid),
+            training_uuid,
+            username,
+        )
+    except Exception:
+        # Without a session row the slot would stay reserved; release it
+        # best-effort and re-raise the original error.
+        try:
+            await api_client.page_disconnect(sid)
+        except Exception:
+            pass
+        raise
+    return _row(row)
+
+
+@router.post("/sessions/{session_id}/ping")
+async def ping_session(session_id: str, user=Depends(require_auth)):
+    """Forward a session keep-alive through ``api_client.page_ping``.
+
+    An HTTP error status from the API is relayed with its status code and
+    body; a transport-level failure is reported as 502.
+    """
+    try:
+        await api_client.page_ping(session_id)
+    except httpx.HTTPStatusError as e:
+        raise HTTPException(e.response.status_code, e.response.text)
+    except httpx.HTTPError as e:
+        # Connect errors and timeouts carry no response to relay.
+        raise HTTPException(502, f"api: {e}")
+    return {"ok": True}
+
+
+@router.post("/sessions/{session_id}/runs", status_code=201)
+async def run_test(session_id: str, body: dict, user=Depends(require_auth)):
+    """Run one test with the given inputs and append it to the session's runs.
+
+    Responds 404 when the session is unknown or the id is not a valid UUID,
+    and 500 when the test run itself raises. The returned record holds the
+    inputs, the outputs, the wall-clock duration in milliseconds, the peak
+    CPU/memory figures reported by the runner and a Unix timestamp.
+    """
+    try:
+        session_uuid = uuid.UUID(session_id)
+    except ValueError:
+        # A malformed id cannot match any session; answer 404 rather than a 500.
+        raise HTTPException(404, "session not found")
+    row = await db.fetchrow("SELECT * FROM tests WHERE id = $1", session_uuid)
+    if not row:
+        raise HTTPException(404, "session not found")
+
+    inputs = body.get("inputs") or {}
+    t0 = time.monotonic()
+    try:
+        result = await run_test_once(str(row["training_id"]), inputs)
+    except Exception as e:
+        raise HTTPException(500, f"test run failed: {e}")
+    dt_ms = int((time.monotonic() - t0) * 1000)
+
+    run = {
+        "inputs": inputs,
+        "outputs": result.get("outputs", {}),
+        "duration_ms": dt_ms,
+        "cpu_peak": result.get("cpu_peak"),
+        "mem_peak_mb": result.get("mem_peak_mb"),
+        "ts": time.time(),
+    }
+    # The run is wrapped in a one-element JSON array so that `||` appends it
+    # to the existing `runs` array.
+    await db.execute(
+        "UPDATE tests SET runs = runs || $1::jsonb WHERE id = $2",
+        json.dumps([run]),
+        session_uuid,
+    )
+    return run
+
+
+@router.delete("/sessions/{session_id}", status_code=204)
+async def end_session(session_id: str, user=Depends(require_auth)):
+    """Mark a session as ended and release its slot.
+
+    ``ended_at`` is set only if it is still NULL. Releasing the slot through
+    ``api_client.page_disconnect`` is best-effort: a failure there does not
+    fail the request. Responds 404 when the id is not a valid UUID.
+    """
+    try:
+        session_uuid = uuid.UUID(session_id)
+    except ValueError:
+        # A malformed id cannot match any session; answer 404 rather than a 500.
+        raise HTTPException(404, "session not found")
+    await db.execute(
+        "UPDATE tests SET ended_at = NOW() WHERE id = $1 AND ended_at IS NULL",
+        session_uuid,
+    )
+    try:
+        await api_client.page_disconnect(session_id)
+    except Exception:
+        pass
+    return None
--- a/ml/routers/trainings.py
+++ b/ml/routers/trainings.py
@@ -0,0 +1,129 @@
+"""API /api/trainings — enqueue, list, get, artifacts."""
+from __future__ import annotations
+
+import json
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from core import db, minio_client, redis_client, api_client
+from core.auth import require_auth
+
+router = APIRouter(prefix="/api/trainings", tags=["trainings"])
+
+
+def _row(r) -> Optional[dict]:
+    if r is None:
+        return None
+    record = dict(r)
+    # timestamp columns become ISO-8601 strings; values without isoformat() are left untouched
+    stamps = ((k, record.get(k)) for k in ("queued_at", "started_at", "finished_at"))
+    record.update({k: v.isoformat() for k, v in stamps if hasattr(v, "isoformat")})
+    return record
+
+
+@router.get("")
+async def list_trainings(
+    model_id: Optional[str] = Query(None),
+    status: Optional[str] = Query(None),
+    limit: int = Query(100, ge=0, le=500),  # a negative LIMIT is rejected by Postgres
+    user=Depends(require_auth),
+):
+    where, args = [], []  # SQL conditions and their positional ($n) arguments, kept in step
+    if model_id:
+        try:
+            args.append(uuid.UUID(model_id))
+        except ValueError:  # malformed filter is a client error, not an unhandled 500
+            raise HTTPException(400, "invalid model_id") from None
+        where.append(f"model_id = ${len(args)}")
+    if status:
+        args.append(status)
+        where.append(f"status = ${len(args)}")
+    sql = "SELECT * FROM trainings" + (" WHERE " + " AND ".join(where) if where else "")
+    args.append(limit)
+    sql += f" ORDER BY queued_at DESC LIMIT ${len(args)}"
+    rows = await db.fetch(sql, *args)
+    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
+
+
+@router.post("", status_code=202)
+async def enqueue_training(body: dict, user=Depends(require_auth)):
+    for k in ("model_id", "version", "patch", "dataset_id"):
+        if not body.get(k):
+            raise HTTPException(400, f"missing field: {k}")
+    try:
+        model_id, dataset_id = (uuid.UUID(str(body[k])) for k in ("model_id", "dataset_id"))
+    except ValueError:  # malformed ids are a client error, not an unhandled 500
+        raise HTTPException(400, "model_id and dataset_id must be valid UUIDs") from None
+    model_row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_id)
+    if not model_row:
+        raise HTTPException(404, "model not found")
+
+    ds_row = await db.fetchrow("SELECT id FROM datasets WHERE id = $1", dataset_id)
+    if not ds_row:
+        raise HTTPException(404, "dataset not found")
+
+    try:
+        training_row = await db.fetchrow(
+            """
+            INSERT INTO trainings (model_id, version, patch, dataset_id, started_by, status)
+            VALUES ($1,$2,$3,$4,$5,'queued')
+            RETURNING *
+            """,
+            model_id,
+            body["version"],
+            body["patch"],
+            dataset_id,
+            user.get("username") or "unknown",
+        )
+    except Exception as e:
+        raise HTTPException(409, f"training already exists or invalid: {e}") from e
+
+    training_id = str(training_row["id"])
+    # create the job on the api-service side (cross-service registry)
+    try:
+        await api_client.create_job(
+            "train",
+            created_by=user.get("username") or "unknown",
+            payload={
+                "training_id": training_id,
+                "model_id": body["model_id"],
+                "version": body["version"],
+                "patch": body["patch"],
+                "dataset_id": body["dataset_id"],
+            },
+        )
+    except Exception as e:
+        # non-fatal: the local worker can still proceed; log and continue
+        import logging
+        logging.warning("create_job failed: %s", e)
+
+    # Write the status hash BEFORE queueing, so this "queued" write cannot overwrite a later status.
+    r, status_key = redis_client.client(), f"ml:train:{training_id}"
+    await r.hset(status_key, mapping={"status": "queued", "progress": "0", "message": "queued"})
+    await r.expire(status_key, 48 * 3600)
+    await r.lpush("ml:queue:train", training_id)  # the local worker picks the job up from here
+
+    return _row(training_row)
+
+
+@router.get("/{training_id}")
+async def get_training(training_id: uuid.UUID, user=Depends(require_auth)):  # non-UUID id -> 422
+    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", training_id)
+    if not row:
+        raise HTTPException(404, "not found")
+    return _row(row)
+
+
+@router.get("/{training_id}/artifacts")
+async def list_artifacts(training_id: uuid.UUID, user=Depends(require_auth)):  # non-UUID id -> 422
+    row = await db.fetchrow(
+        "SELECT artifacts_prefix FROM trainings WHERE id = $1", training_id
+    )
+    if not row or not row["artifacts_prefix"]:
+        raise HTTPException(404, "no artifacts")
+    objs = minio_client.list_prefix(row["artifacts_prefix"] + "/")
+    for o in objs:  # attach a presigned GET url; 3600 is presumably its lifetime in seconds
+        o["url"] = minio_client.presigned_get(o["name"], 3600)
+    return objs
--- a/ml/routers/trainings_stream.py
+++ b/ml/routers/trainings_stream.py
@@ -0,0 +1,64 @@
+"""SSE endpoint per live progress del training.
+
+GET /api/trainings/{id}/events
+  Streamma eventi dal Redis stream `ml:train:{id}:events` via Server-Sent Events.
+  Termina quando lo stato del training è terminale (succeeded/failed/cancelled).
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import uuid
+
+from fastapi import APIRouter, Depends, HTTPException
+from sse_starlette.sse import EventSourceResponse
+
+from core import db, redis_client
+from core.auth import require_auth
+
+router = APIRouter(prefix="/api/trainings", tags=["trainings-sse"])
+
+_TERMINAL = {"succeeded", "failed", "cancelled"}
+
+
+@router.get("/{training_id}/events")
+async def training_events(training_id: uuid.UUID, user=Depends(require_auth)):
+    # verify the training exists (a non-UUID id is rejected with 422 before reaching here)
+    row = await db.fetchrow("SELECT status FROM trainings WHERE id = $1", training_id)
+    if not row:
+        raise HTTPException(404, "not found")
+
+    stream_key = f"ml:train:{training_id}:events"
+    status_key = f"ml:train:{training_id}"
+
+    async def gen():
+        last_id = "0-0"
+        r = redis_client.client()
+        while True:
+            try:
+                # Read the status BEFORE the stream: once it is terminal, keep draining without
+                # blocking and close only on an empty read, so no trailing event is dropped.
+                state = await r.hget(status_key, "status")
+                if not state:
+                    # fall back to the db when the redis hash has expired
+                    db_row = await db.fetchrow("SELECT status FROM trainings WHERE id = $1", training_id)
+                    state = db_row["status"] if db_row else "unknown"
+                done = state in _TERMINAL
+                # XREAD blocks for up to 5s so the connection is not held idle for too long
+                resp = await r.xread({stream_key: last_id}, count=50, block=None if done else 5000)
+            except Exception as e:
+                yield {"event": "error", "data": json.dumps({"error": str(e)})}
+                await asyncio.sleep(1)
+                continue
+
+            for _stream, entries in resp or ():
+                for entry_id, fields in entries:
+                    last_id = entry_id
+                    yield {"event": "message", "id": entry_id, "data": json.dumps(fields)}
+
+            # terminal status and nothing left in the stream: tell the client we are finished
+            if done and not resp:
+                yield {"event": "end", "data": json.dumps({"status": state})}
+                return
+
+    return EventSourceResponse(gen())
--- a/ml/runner/Dockerfile
+++ b/ml/runner/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        git \
+        build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir \
+        numpy pandas scikit-learn \
+        xgboost \
+        matplotlib \
+        pyyaml
+
+COPY sdk.py /opt/meb/meb_ml.py
+ENV PYTHONPATH=/opt/meb
+
+WORKDIR /workdir
+CMD ["bash"]
--- a/ml/runner/sdk.py
+++ b/ml/runner/sdk.py
@@ -0,0 +1,80 @@
+"""meb_ml — SDK importabile dal codice utente dentro il container runner.
+
+API:
+    from meb_ml import emit_metric, emit_series, emit_matrix, emit_log, save_artifact
+
+    emit_metric(iter=10, loss=0.23)
+    emit_series("roc_curve", x=fpr, y=tpr, kind="line")
+    emit_matrix("confusion", labels=[...], values=[[...],[...]])
+    emit_log("info", "epoch done")
+
+Scrive righe JSON su stdout; il parent (ml-service) le inoltra su Redis/Influx.
+Per risultati finali scrivere `out/metrics.json` con:
+    {"metrics": {...}, "plots": {"loss_curve": {"x": [...], "y": [...]}, ...}}
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Any, Iterable, Sequence
+
+
+def _print(obj: dict) -> None:
+    line = json.dumps(obj, default=float)  # values json cannot encode are passed through float()
+    print(line, file=sys.stdout, flush=True)  # one JSON record per line, flushed immediately
+
+
+def emit_metric(**fields: Any) -> None:
+    _print(dict({"type": "metric"}, **fields))  # NOTE: a field named "type" overrides the tag
+
+
+def emit_series(name: str, x: Sequence, y: Sequence, kind: str = "line") -> None:
+    # Emit one "series" record on stdout; x and y are copied into plain lists first.
+    record = dict(
+        type="series",
+        name=name, kind=kind,
+        x=list(x), y=list(y),
+    )
+    _print(record)
+
+
+def emit_matrix(name: str, labels: Sequence, values: Sequence[Sequence]) -> None:
+    # Emit one "matrix" record; labels and every row of values are copied into plain lists.
+    label_list = list(labels)
+    rows = list(map(list, values))
+    record = dict(type="matrix", name=name)
+    record.update(labels=label_list, values=rows)
+    _print(record)
+
+
+def emit_log(level: str, message: str) -> None:
+    _print(dict(type="log", level=level, message=message))  # one "log" record per call
+
+
+def save_artifact(path: str) -> str:
+    """Copy `path` into the artifacts dir (MEB_ARTIFACTS_DIR) and return the destination."""
+    import shutil  # local import; copyfile streams instead of loading the whole file in memory
+    dest = Path(os.environ.get("MEB_ARTIFACTS_DIR", "/workdir/out")) / Path(path).name
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    if not (dest.exists() and dest.samefile(path)):  # already in place: nothing to copy
+        shutil.copyfile(path, dest)
+    return str(dest)
+
+
+def dataset_path() -> str:
+    return os.environ["MEB_DATASET_PATH"]  # no default: raises KeyError when the variable is unset
+
+
+def artifacts_dir() -> str:
+    return os.environ.get("MEB_ARTIFACTS_DIR", "/workdir/out")  # same default save_artifact uses
+
+
+def read_test_input() -> dict:
+    """Read a single JSON document from one line of stdin (for test scripts)."""
+    return json.loads(sys.stdin.readline())
+
+
+def write_test_output(outputs: dict) -> None:
+    _print(dict(type="result", outputs=outputs))  # emits the single "result" record
--- a/ml/static/styles/ml.css
+++ b/ml/static/styles/ml.css
@@ -0,0 +1,146 @@
+.ml-nav {
+    display: flex;
+    gap: 16px;
+    align-items: center;
+}
+.ml-nav a {
+    text-decoration: none;
+    color: var(--text-secondary);
+    font-weight: 600;
+    padding: 8px 12px;
+    border-radius: var(--radius-md);
+    transition: all 0.2s ease;
+}
+.ml-nav a:hover { background: var(--accent-light); color: var(--accent-color); }
+.ml-nav a.active { background: var(--accent-light); color: var(--accent-color); }
+
+.container {
+    max-width: 1200px;
+    margin: 24px auto;
+    padding: 0 24px;
+}
+
+.page-head {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    margin-bottom: 20px;
+}
+.page-head h2 { font-size: 1.5rem; }
+
+.list {
+    display: flex;
+    flex-direction: column;
+    gap: 8px;
+}
+.list .item {
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+    padding: 12px 16px;
+    border: 1px solid var(--header-border);
+    border-radius: var(--radius-lg);
+    background: #fff;
+    transition: box-shadow 0.12s ease;
+}
+.list .item:hover { box-shadow: var(--shadow-md); }
+.list .meta { color: var(--text-secondary); font-size: 0.85rem; }
+
+.form-row {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 12px;
+    align-items: flex-end;
+    margin-bottom: 20px;
+}
+.form-row label {
+    display: flex;
+    flex-direction: column;
+    gap: 4px;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+}
+.form-row input, .form-row select, .form-row textarea {
+    padding: 8px 12px;
+    border: 1px solid var(--header-border);
+    border-radius: var(--radius-md);
+    font-family: inherit;
+}
+
+.hidden { display: none !important; }
+
+.queue-info {
+    font-size: 0.9rem;
+    color: var(--text-secondary);
+    padding: 6px 12px;
+    background: var(--accent-light);
+    border-radius: var(--radius-md);
+}
+
+.charts {
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 16px;
+    margin: 16px 0;
+}
+
+.logs {
+    background: #0f172a;
+    color: #cbd5e1;
+    padding: 12px;
+    border-radius: var(--radius-md);
+    font-family: ui-monospace, monospace;
+    font-size: 0.8rem;
+    max-height: 320px;
+    overflow: auto;
+    white-space: pre-wrap;
+}
+
+.detail {
+    border: 1px solid var(--header-border);
+    border-radius: var(--radius-lg);
+    padding: 16px;
+    margin-top: 16px;
+    background: #fff;
+    position: relative;
+}
+.detail #btn-close-detail {
+    position: absolute;
+    top: 8px;
+    right: 8px;
+    padding: 4px 10px;
+}
+
+dialog {
+    border: 1px solid var(--header-border);
+    border-radius: var(--radius-lg);
+    padding: 24px;
+    width: min(500px, 90vw);
+}
+dialog form { display: flex; flex-direction: column; gap: 12px; }
+dialog label { display: flex; flex-direction: column; gap: 4px; font-size: 0.85rem; }
+dialog menu { display: flex; justify-content: flex-end; gap: 8px; margin-top: 16px; padding: 0; }
+
+table {
+    width: 100%;
+    border-collapse: collapse;
+    margin-top: 12px;
+}
+th, td { padding: 8px 12px; border-bottom: 1px solid var(--header-border); text-align: left; font-size: 0.9rem; }
+
+code {
+    font-family: ui-monospace, monospace;
+    background: #f1f5f9;
+    padding: 2px 6px;
+    border-radius: 4px;
+    font-size: 0.85em;
+}
+
+pre {
+    background: #f8fafc;
+    padding: 12px;
+    border-radius: var(--radius-md);
+    overflow: auto;
+    font-family: ui-monospace, monospace;
+    font-size: 0.8rem;
+}
--- a/ml/templates/_layout.html
+++ b/ml/templates/_layout.html
@@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<html lang="it">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>ML — {% block title %}{{ page|capitalize }}{% endblock %}</title>
+    <link href="/static/styles/style.css" rel="stylesheet">
+    <link href="/static/styles/ml.css" rel="stylesheet">
+</head>
+<body>
+    <div class="header">
+        <h1>Modelli ML</h1>
+        <nav class="ml-nav">
+            <a href="/datasets" class="{% if page=='datasets' %}active{% endif %}">Datasets</a>
+            <a href="/models" class="{% if page=='models' %}active{% endif %}">Modelli</a>
+            <a href="/train" class="{% if page=='train' %}active{% endif %}">Train</a>
+            <a href="/test" class="{% if page=='test' %}active{% endif %}">Test</a>
+            <a href="/results" class="{% if page=='results' %}active{% endif %}">Results</a>
+        </nav>
+        <div class="profile">
+            <p id="username">{{ user.username }}</p>
+            <button id="logout-btn">Logout</button>
+        </div>
+    </div>
+
+    <div class="container">
+        {% block content %}{% endblock %}
+    </div>
+
+    <script src="/static/js/common.js"></script>
+    {% block scripts %}{% endblock %}
+</body>
+</html>
--- a/ml/templates/datasets.html
+++ b/ml/templates/datasets.html
@@ -0,0 +1,39 @@
+{% extends "_layout.html" %}
+{% block title %}Datasets{% endblock %}
+{% block content %}
+<div class="page-head">
+    <h2>Datasets</h2>
+    <button class="prominent" id="btn-upload">+ Carica CSV</button>
+</div>
+
+<div id="datasets-list" class="list"></div>
+
+<dialog id="upload-dlg">
+    <form id="upload-form" method="dialog">
+        <h3>Carica dataset</h3>
+        <label>Nome<input type="text" name="nome" required></label>
+        <label>Tipo
+            <select name="dataset_type">
+                <option value="custom">custom</option>
+                <option value="imported">imported</option>
+            </select>
+        </label>
+        <label>Formato
+            <select name="format">
+                <option value="csv">csv</option>
+                <option value="json">json</option>
+            </select>
+        </label>
+        <label>Tags (virgola)<input type="text" name="tags"></label>
+        <label>Descrizione<textarea name="description"></textarea></label>
+        <label>File<input type="file" name="file" required></label>
+        <menu>
+            <button type="button" id="upload-cancel">Annulla</button>
+            <button type="submit" class="prominent">Carica</button>
+        </menu>
+    </form>
+</dialog>
+{% endblock %}
+{% block scripts %}
+<script src="/static/js/datasets.js"></script>
+{% endblock %}
--- a/ml/templates/models.html
+++ b/ml/templates/models.html
@@ -0,0 +1,57 @@
+{% extends "_layout.html" %}
+{% block title %}Modelli{% endblock %}
+{% block content %}
+<div class="page-head">
+    <h2>Modelli</h2>
+    <button class="prominent" id="btn-add-model">+ Aggiungi modello</button>
+</div>
+
+<div id="models-list" class="list"></div>
+
+<div id="model-detail" class="detail hidden">
+    <button id="btn-close-detail">×</button>
+    <h3 id="md-name"></h3>
+    <p id="md-meta"></p>
+    <section>
+        <h4>Branch / Commits</h4>
+        <select id="md-branch"></select>
+        <ul id="md-commits"></ul>
+    </section>
+    <section>
+        <h4>model.yml</h4>
+        <pre id="md-spec"></pre>
+    </section>
+    <section>
+        <h4>Note</h4>
+        <ul id="md-notes"></ul>
+        <form id="md-note-form">
+            <textarea name="text" placeholder="Nuova nota"></textarea>
+            <button type="submit" class="prominent">Aggiungi</button>
+        </form>
+    </section>
+</div>
+
+<dialog id="add-model-dlg">
+    <form id="add-model-form" method="dialog">
+        <h3>Nuovo modello</h3>
+        <label>Nome<input type="text" name="name" required></label>
+        <label>Tipo
+            <select name="type">
+                <option>xgboost</option>
+                <option>lstm</option>
+                <option>sklearn</option>
+                <option>other</option>
+            </select>
+        </label>
+        <label>Repo Gitea (owner/repo)<input type="text" name="gitea_repo" required></label>
+        <label>Branch<input type="text" name="default_branch" value="main"></label>
+        <menu>
+            <button type="button" id="add-model-cancel">Annulla</button>
+            <button type="submit" class="prominent">Crea</button>
+        </menu>
+    </form>
+</dialog>
+{% endblock %}
+{% block scripts %}
+<script src="/static/js/models.js"></script>
+{% endblock %}
--- a/ml/templates/results.html
+++ b/ml/templates/results.html
@@ -1,89 +1,33 @@
-<!DOCTYPE html>
+{% extends "_layout.html" %}
+{% block title %}Risultati{% endblock %}
+{% block content %}
+<div class="page-head">
+    <h2>Risultati training</h2>
+    <button id="btn-compare" class="prominent">Confronta selezionati</button>
+</div>

-<html>
-    <head>
-        <title>Risultati</title>
-        <link href="../static/styles/style.css" rel="stylesheet">
+<div id="results-list" class="list"></div>

-        <style>
-            .container {
-                display: flex;
-                flex-direction: column;
-                align-items: center;
-                justify-content: center;
-                height: 100%;
-            }
+<section id="compare-panel" class="hidden">
+    <h3>Confronto</h3>
+    <div class="charts">
+        <canvas id="cmp-loss"></canvas>
+    </div>
+    <table id="cmp-table"></table>
+    <div id="cmp-plots"></div>
+</section>

-            .picker {
-                display: flex;
-                flex-direction: column;
-                align-items: center;
-                justify-content: center;
-                height: 100%;
-            }
-
-            .picker .header {
-                display: flex;
-                flex-direction: column;
-                align-items: center;
-                justify-content: center;
-                height: 100%;
-            }
-            
-        </style>
-
-    </head>
-
-    <body>
-        <div class="header">
-            <h1>Risultati</h1>
-            <div class="profile">
-                <p>Utente</p>
-                <button>Logout</button>
-            </div>
-        </div>     
-        
-        <div class="container">
-
-            <div class="picker">
-                
-                <div class="header">
-                    <h2>
-                    Seleziona
-                </h2>
-
-                <p>
-                    una sessione di training eseguita per visualizzarne i risultati
-                </p>
-                </div>
-
-                <div class="grid">
-
-                    <div class="card">
-                        <h3>sessione 1</h3>
-                        <div class="train-info">
-                            <p>24/03/2026</p>
-                            <p>12:00</p>
-                            <p>dataset: d-1</p>
-                        </div>
-                        
-                    </div>
-
-                    <div class="card">
-                        <h3>sessione 2</h3>
-                        <p>24/03/2026</p>
-                        
-                    </div>
-
-                </div>
-
-            </div>
-
-        </div>
-        
-    </body>
-
-    <script>
-
-    </script>
-</html>
+<section id="detail-panel" class="hidden">
+    <h3>Dettaglio training <code id="dt-id"></code></h3>
+    <div id="dt-meta"></div>
+    <div class="charts">
+        <canvas id="dt-loss"></canvas>
+        <canvas id="dt-res"></canvas>
+    </div>
+    <div id="dt-plots"></div>
+</section>
+{% endblock %}
+{% block scripts %}
+<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+<script src="/static/js/results.js"></script>
+{% endblock %}
--- a/ml/templates/test.html
+++ b/ml/templates/test.html
@@ -0,0 +1,33 @@
+{% extends "_layout.html" %}
+{% block title %}Test{% endblock %}
+{% block content %}
+<div class="page-head">
+    <h2>Test modello</h2>
+    <div id="slot-info" class="queue-info">Slot: <span id="slot-count">–</span>/2</div>
+</div>
+
+<div id="slot-full" class="info-panel hidden">
+    <div class="icon">🚧</div>
+    <h3>Slot test pieni</h3>
+    <p>Massimo 2 utenti possono eseguire test contemporaneamente. Riprova tra qualche minuto.</p>
+</div>
+
+<form id="test-start" class="form-row">
+    <label>Modello<select id="t-model"></select></label>
+    <label>Training<select id="t-training"></select></label>
+    <button type="submit" class="prominent">Avvia sessione</button>
+</form>
+
+<section id="test-session" class="hidden">
+    <h3>Sessione <code id="ts-id"></code></h3>
+    <form id="inputs-form"></form>
+    <button id="btn-run" class="prominent">Esegui test</button>
+    <button id="btn-end">Chiudi sessione</button>
+
+    <h4>Risultati</h4>
+    <div id="runs-list"></div>
+</section>
+{% endblock %}
+{% block scripts %}
+<script src="/static/js/test.js"></script>
+{% endblock %}
--- a/ml/templates/train.html
+++ b/ml/templates/train.html
@@ -0,0 +1,35 @@
+{% extends "_layout.html" %}
+{% block title %}Train{% endblock %}
+{% block content %}
+<div class="page-head">
+    <h2>Avvia training</h2>
+    <div class="queue-info">Coda: <span id="queue-count">–</span></div>
+</div>
+
+<form id="train-form" class="form-row">
+    <label>Modello<select name="model_id" id="f-model"></select></label>
+    <label>Branch<select name="branch" id="f-branch"></select></label>
+    <label>Commit<select name="patch" id="f-patch"></select></label>
+    <label>Versione<input type="text" name="version" placeholder="1.0.0" required></label>
+    <label>Dataset<select name="dataset_id" id="f-dataset"></select></label>
+    <button type="submit" class="prominent">Avvia</button>
+</form>
+
+<section id="live-panel" class="hidden">
+    <h3>Training <code id="live-id"></code> — <span id="live-status">queued</span></h3>
+    <div class="charts">
+        <canvas id="chart-loss"></canvas>
+        <canvas id="chart-cpu"></canvas>
+    </div>
+    <pre id="live-logs" class="logs"></pre>
+</section>
+
+<section>
+    <h3>Recenti</h3>
+    <div id="recent-trainings" class="list"></div>
+</section>
+{% endblock %}
+{% block scripts %}
+<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+<script src="/static/js/train.js"></script>
+{% endblock %}