feat: Add new API endpoints and HTML pages for ML model management

- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models
- Added Gitea proxy API for repository interactions at /api/repos
- Created results API for listing and comparing training results at /api/results
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings
- Introduced SSE endpoint for live training progress updates
- Added HTML pages for models, datasets, and training management
- Created a Dockerfile for the ML runner with necessary dependencies
- Developed SDK for user code execution within the runner container
- Enhanced CSS styles for improved UI/UX
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
This commit is contained in:
Giuseppe Raffa
2026-04-28 09:24:38 +02:00
parent ee478e52ef
commit 0ce879aa44
81 changed files with 7491 additions and 746 deletions

View File

@@ -0,0 +1,45 @@
PORT=3007
# Shared auth
JWT_SECRET=change-me
INTERNAL_API_KEY=change-me
AUTH_LOGIN_URL=https://auth.mebboat.it/login
# Postgres ("ml" database)
PG_HOST=meb-postgres
PG_PORT=5432
DB_USER=meb
DB_PASSWORD=meb
ML_DB=ml
# Redis
REDIS_HOST=meb-redis
REDIS_PORT=6379
# MinIO (single bucket)
MINIO_ENDPOINT=minio
MINIO_PORT=9000
MINIO_USE_SSL=false
MINIO_ACCESS_KEY=
MINIO_SECRET_KEY=
MINIO_BUCKET=ml
# InfluxDB
INFLUX_URL=http://meb-influx:8086
INFLUX_TOKEN=
INFLUX_ORG=meb
INFLUX_BUCKET=ml_metrics
# Gitea (external, self-hosted)
GITEA_URL=https://git.mebboat.it
GITEA_TOKEN=
# API service
API_URL=http://api:3003
# Training runtime
ML_TRAIN_CONCURRENCY=1
ML_RUNNER_IMAGE=meb-ml-runner:latest
ML_RUNNER_TMP=/var/ml/tmp
ML_GITCACHE_DIR=/var/ml/gitcache
ML_MAX_UPLOAD_MB=500

View File

@@ -3,6 +3,9 @@ FROM python:3.11-slim
# Every following instruction runs relative to /app inside the image.
WORKDIR /app
# Turn off Python's stdout/stderr buffering.
ENV PYTHONUNBUFFERED=1
# Install the git CLI without recommended extras, then drop the apt package
# lists in the same layer.
# NOTE(review): presumably git is needed by the service's `git clone` subprocess calls — confirm.
RUN apt-get update && apt-get install -y --no-install-recommends git \
&& rm -rf /var/lib/apt/lists/*
# Copy only the dependency manifest and install from it, with pip's cache disabled.
COPY ./requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

72
ml/core/api_client.py Normal file
View File

@@ -0,0 +1,72 @@
"""Client HTTP verso l'api-service (service-to-service via x-api-key).
Espone accesso a:
/jobs ciclo di vita job
/queue stato coda
/pageconnections registro sessioni di pagina (enforcement /test max 2)
"""
from __future__ import annotations
from typing import Any, Optional
import httpx
from core.config import settings
def _headers() -> dict:
    """Build the headers attached to every api-service request.

    The internal API key travels in ``x-api-key`` and the body is declared
    as JSON.
    """
    headers = {}
    headers["x-api-key"] = settings.internal_api_key
    headers["Content-Type"] = "application/json"
    return headers
async def _req(method: str, path: str, json: Optional[dict] = None, params: Optional[dict] = None) -> Any:
    """Send one request to the api-service and decode its JSON reply.

    Args:
        method: HTTP verb.
        path: Path appended verbatim to ``settings.api_url``.
        json: Optional JSON body.
        params: Optional query-string parameters.

    Returns:
        The decoded JSON body, or ``None`` for a 204 or an empty body.

    Raises:
        httpx.HTTPStatusError: If the response status is an error status.
    """
    target = f"{settings.api_url}{path}"
    # A short-lived client is opened per call, with a 10 second timeout.
    async with httpx.AsyncClient(timeout=10.0) as http:
        response = await http.request(method, target, json=json, params=params, headers=_headers())
        response.raise_for_status()
        has_body = response.status_code != 204 and bool(response.content)
        return response.json() if has_body else None
# ── jobs ────────────────────────────────────────────────────────────────────
async def create_job(type_: str, created_by: str, payload: dict) -> dict:
    """Create a job via ``POST /jobs`` and return the decoded response."""
    body = {"type": type_, "created_by": created_by, "payload": payload}
    return await _req("POST", "/jobs", json=body)
async def update_job(job_id: str, **fields) -> dict:
    """Patch a job via ``PATCH /jobs/{job_id}``; keyword arguments form the JSON body."""
    endpoint = f"/jobs/{job_id}"
    return await _req("PATCH", endpoint, json=fields)
async def get_job(job_id: str) -> dict:
    """Fetch one job via ``GET /jobs/{job_id}``."""
    endpoint = f"/jobs/{job_id}"
    return await _req("GET", endpoint)
async def list_jobs(type_: Optional[str] = None, status: Optional[str] = None, limit: int = 50) -> list:
    """List jobs via ``GET /jobs``.

    Args:
        type_: Optional job-type filter; omitted from the query when falsy.
        status: Optional status filter; omitted from the query when falsy.
        limit: Sent as the ``limit`` query parameter (as a string).

    Returns:
        The decoded response, or an empty list when the response is falsy.
    """
    query = {"limit": str(limit)}
    optional_filters = {"type": type_, "status": status}
    query.update({name: value for name, value in optional_filters.items() if value})
    found = await _req("GET", "/jobs", params=query)
    return found or []
# ── queue ───────────────────────────────────────────────────────────────────
async def queue_status(type_: str = "train") -> dict:
    """Query ``GET /queue`` for the given queue type."""
    query = {"type": type_}
    return await _req("GET", "/queue", params=query)
# ── page connections ───────────────────────────────────────────────────────
async def page_connect(page: str, user_id: str, session_id: str) -> dict:
    """Register a page session via ``POST /pageconnections``."""
    body = {"page": page, "user_id": user_id, "session_id": session_id}
    return await _req("POST", "/pageconnections", json=body)
async def page_ping(session_id: str) -> dict:
    """Ping a page session via ``POST /pageconnections/{session_id}/ping``."""
    endpoint = f"/pageconnections/{session_id}/ping"
    return await _req("POST", endpoint)
async def page_disconnect(session_id: str) -> None:
    """Delete a page session via ``DELETE /pageconnections/{session_id}``; the reply is discarded."""
    endpoint = f"/pageconnections/{session_id}"
    await _req("DELETE", endpoint)
async def page_count(page: str) -> dict:
    """Query ``GET /pageconnections/{page}`` and return the decoded response."""
    endpoint = f"/pageconnections/{page}"
    return await _req("GET", endpoint)

64
ml/core/config.py Normal file
View File

@@ -0,0 +1,64 @@
"""Configurazione centralizzata del servizio ML, letta da env."""
from __future__ import annotations
import os
from dataclasses import dataclass
def _b(name: str, default: bool = False) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable name.
        default: Value used when the variable is not set.

    Returns:
        ``True`` when the value, ignoring case and surrounding whitespace, is
        one of ``1``, ``true``, ``yes`` or ``on``; ``False`` otherwise.
    """
    raw = os.environ.get(name, str(default))
    # Env files frequently carry trailing spaces or a stray CR/LF; without the
    # strip a value such as "true\r" would silently be read as False.
    return raw.strip().lower() in ("1", "true", "yes", "on")
@dataclass(frozen=True)
class Settings:
    """Immutable service configuration read from environment variables.

    Every default below is an expression evaluated once, when the class body
    executes at import time; a non-numeric value for any ``int(...)`` field
    raises at that moment.
    """

    # Postgres ("ml" database)
    pg_host: str = os.environ.get("PG_HOST", "meb-postgres")
    pg_port: int = int(os.environ.get("PG_PORT", "5432"))
    pg_user: str = os.environ.get("DB_USER", "meb")
    pg_password: str = os.environ.get("DB_PASSWORD", "meb")
    pg_db: str = os.environ.get("ML_DB", "ml")
    # Redis
    redis_host: str = os.environ.get("REDIS_HOST", "meb-redis")
    redis_port: int = int(os.environ.get("REDIS_PORT", "6379"))
    # MinIO (single bucket)
    minio_endpoint: str = os.environ.get("MINIO_ENDPOINT", "minio")
    minio_port: int = int(os.environ.get("MINIO_PORT", "9000"))
    minio_use_ssl: bool = _b("MINIO_USE_SSL", False)
    minio_access_key: str = os.environ.get("MINIO_ACCESS_KEY", "")
    minio_secret_key: str = os.environ.get("MINIO_SECRET_KEY", "")
    minio_bucket: str = os.environ.get("MINIO_BUCKET", "ml")
    # InfluxDB: accepts both INFLUX_* and INFLX_* so it lines up with the
    # variables already used by the other services (realtime, api) without
    # duplicating the configuration. An empty INFLUX_* value falls through to
    # the INFLX_* variable and then to the default.
    influx_url: str = os.environ.get("INFLUX_URL") or os.environ.get("INFLX_URL", "http://meb-influx:8086")
    influx_token: str = os.environ.get("INFLUX_TOKEN") or os.environ.get("INFLX_TOKEN", "")
    influx_org: str = os.environ.get("INFLUX_ORG") or os.environ.get("INFLX_ORG", "meb")
    # Bucket dedicated to ML training/test metrics, kept separate from logs and
    # weather data. Overridable via INFLUX_BUCKET or ML_INFLUX_BUCKET; the
    # latter wins when both are set.
    influx_bucket: str = os.environ.get("ML_INFLUX_BUCKET") or os.environ.get("INFLUX_BUCKET", "ml_metrics")
    # Gitea (installed externally)
    gitea_url: str = os.environ.get("GITEA_URL", "")
    gitea_token: str = os.environ.get("GITEA_TOKEN", "")
    # API service (for jobs/queue/pageconnections)
    api_url: str = os.environ.get("API_URL", "http://api:3003")
    internal_api_key: str = os.environ.get("INTERNAL_API_KEY", "")
    # Auth (shared)
    jwt_secret: str = os.environ.get("JWT_SECRET", "")
    auth_login_url: str = os.environ.get("AUTH_LOGIN_URL", "https://auth.mebboat.it/login")
    # Training execution
    train_concurrency: int = int(os.environ.get("ML_TRAIN_CONCURRENCY", "1"))
    runner_image: str = os.environ.get("ML_RUNNER_IMAGE", "meb-ml-runner:latest")
    runner_tmp_dir: str = os.environ.get("ML_RUNNER_TMP", "/var/ml/tmp")
    gitcache_dir: str = os.environ.get("ML_GITCACHE_DIR", "/var/ml/gitcache")
    # Runtime limits
    max_upload_mb: int = int(os.environ.get("ML_MAX_UPLOAD_MB", "500"))


# Module-level instance imported by the rest of the service.
settings = Settings()

53
ml/core/db.py Normal file
View File

@@ -0,0 +1,53 @@
"""Connessione asyncpg al database ml. Pool singleton."""
from __future__ import annotations
import asyncpg
from typing import Optional
from core.config import settings
# Process-wide asyncpg pool; stays None until init_pool() has run.
_pool: Optional[asyncpg.Pool] = None


async def init_pool() -> asyncpg.Pool:
    """Create the shared asyncpg pool on first call and return it.

    Later calls return the pool that already exists. Connection parameters
    come from ``settings``; the pool holds between 1 and 10 connections and
    applies a 30 second command timeout.
    """
    global _pool
    if _pool is not None:
        return _pool
    connection = dict(
        host=settings.pg_host,
        port=settings.pg_port,
        user=settings.pg_user,
        password=settings.pg_password,
        database=settings.pg_db,
    )
    _pool = await asyncpg.create_pool(min_size=1, max_size=10, command_timeout=30, **connection)
    return _pool
async def close_pool() -> None:
    """Close the shared pool, if one exists, and reset the module state."""
    global _pool
    if _pool is None:
        return
    await _pool.close()
    _pool = None
def pool() -> asyncpg.Pool:
    """Return the shared pool.

    Raises:
        RuntimeError: If ``init_pool()`` has not created the pool yet.
    """
    if _pool is not None:
        return _pool
    raise RuntimeError("DB pool not initialized — call init_pool() at startup")
async def fetch(sql: str, *args):
    """Run ``sql`` with positional ``args`` on a pooled connection and return the connection's ``fetch`` result."""
    async with pool().acquire() as conn:
        rows = await conn.fetch(sql, *args)
    return rows
async def fetchrow(sql: str, *args):
    """Run ``sql`` with positional ``args`` on a pooled connection and return the connection's ``fetchrow`` result."""
    async with pool().acquire() as conn:
        row = await conn.fetchrow(sql, *args)
    return row
async def execute(sql: str, *args):
    """Run ``sql`` with positional ``args`` on a pooled connection and return the connection's ``execute`` result."""
    async with pool().acquire() as conn:
        outcome = await conn.execute(sql, *args)
    return outcome

439
ml/core/docker_runner.py Normal file
View File

@@ -0,0 +1,439 @@
"""Runner Docker per train e test.
train:
- clone repo Gitea @ sha
- prepara workdir /var/ml/tmp/{training_id}
- scarica dataset da MinIO in workdir/data.<ext>
- docker run meb-ml-runner con mount tmp, env, limits da model.yml
- legge stdout JSON → Redis stream + Influx; docker stats ogni 5s
- a fine: collect outputs, upload su MinIO prefix artifacts_prefix
- UPDATE trainings
test:
- analogo ma sincrono, stdin JSON → stdout JSON
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import shutil
import subprocess
import time
import uuid
from pathlib import Path
from typing import Any, Optional
import docker
from influxdb_client import Point
from core import db, gitea, influx_client, minio_client, redis_client
from core.config import settings
from core.model_spec import fetch_and_parse_spec
log = logging.getLogger(__name__)

# Docker SDK client shared by the module; created on first use.
_docker = None


def _docker_client():
    """Return the shared Docker client, creating it from the environment on first call."""
    global _docker
    if _docker is not None:
        return _docker
    _docker = docker.from_env()
    return _docker
async def _emit(stream_key: str, payload: dict) -> None:
    """Append ``payload`` to a Redis stream, JSON-encoded under the ``payload`` field.

    The stream is capped with ``maxlen=10_000``. Publishing is best effort:
    any failure, including a serialisation error, is logged as a warning and
    not raised.
    """
    try:
        entry = {"payload": json.dumps(payload)}
        redis = redis_client.client()
        await redis.xadd(stream_key, entry, maxlen=10_000)
    except Exception as exc:
        log.warning("xadd failed: %s", exc)
async def _run_git(*args: str) -> tuple[int, str]:
    """Run ``git`` with ``args`` and return ``(returncode, stderr_text)`` with the Gitea token masked."""
    proc = await asyncio.create_subprocess_exec(
        "git", *args,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.PIPE,
    )
    _, err = await proc.communicate()
    text = err.decode(errors="replace")
    # The clone URL embeds the token and git may echo that URL in its error
    # output; mask it before the text can reach logs, the DB or the UI.
    if settings.gitea_token:
        text = text.replace(settings.gitea_token, "***")
    return proc.returncode, text


async def _clone_repo(owner_repo: str, sha: str, dest: Path) -> None:
    """Clone ``owner_repo`` into ``dest`` and check out ``sha``.

    A shallow clone (depth 50) is tried first. If the commit cannot be
    checked out from it, all branch heads are fetched and the checkout is
    retried once; the fetch uses ``--unshallow`` while the clone is still
    shallow.

    Args:
        owner_repo: ``owner/repo`` path on the Gitea server.
        sha: Commit (or any revision git can check out) to check out.
        dest: Target directory; created if missing.

    Raises:
        RuntimeError: If the clone or the checkout fails. The message carries
            at most 400 characters of git's stderr with the token masked.
    """
    dest.mkdir(parents=True, exist_ok=True)
    url = gitea.clone_url(owner_repo)
    code, err = await _run_git("clone", "--depth", "50", url, str(dest))
    if code != 0:
        raise RuntimeError(f"git clone failed: {err[:400]}")

    code, err = await _run_git("-C", str(dest), "checkout", sha)
    if code != 0:
        fetch_args = ["-C", str(dest), "fetch"]
        # --unshallow is rejected by git on a repository that is already complete.
        if (dest / ".git" / "shallow").exists():
            fetch_args.append("--unshallow")
        fetch_args += ["origin", "+refs/heads/*:refs/remotes/origin/*"]
        fetch_code, _ = await _run_git(*fetch_args)
        if fetch_code == 0:
            code, err = await _run_git("-C", str(dest), "checkout", sha)
    if code != 0:
        raise RuntimeError(f"git checkout failed: {err[:400]}")
async def _download_dataset(dataset_id: str, dest: Path) -> str:
    """Download a dataset from MinIO into ``dest`` and return the local file path.

    The object key and format come from the ``datasets`` table. The object is
    read from the service's configured bucket (the ``minio_client`` default),
    like every other object this module reads or writes, instead of a
    hard-coded bucket name.

    Args:
        dataset_id: Dataset UUID as a string.
        dest: Existing directory that receives ``data.<ext>``.

    Returns:
        Path of the written file as a string. The extension is ``csv``,
        ``json`` or ``nc`` for the matching formats and ``bin`` otherwise.

    Raises:
        RuntimeError: If no dataset row exists for ``dataset_id``.
    """
    row = await db.fetchrow(
        "SELECT file_key, format FROM datasets WHERE id = $1", uuid.UUID(dataset_id)
    )
    if not row:
        raise RuntimeError("dataset not found")
    # The MinIO client and the file write are synchronous; run them in a
    # worker thread so the event loop is not blocked.
    data = await asyncio.to_thread(minio_client.get_bytes, row["file_key"])
    ext = {"csv": "csv", "json": "json", "netcdf": "nc"}.get(row["format"], "bin")
    out = dest / f"data.{ext}"
    await asyncio.to_thread(out.write_bytes, data)
    return str(out)
def _stats_loop_sync(container, training_id: str, model_id: str, samples: list, stop_evt: asyncio.Event, loop: asyncio.AbstractEventLoop, interval: float = 5.0):
    """Sample container CPU and memory until ``stop_evt`` is set.

    Synchronous; meant to run in a worker thread. Each sample is appended to
    ``samples`` as ``(cpu_pct, mem_mb)`` and scheduled on ``loop`` as an
    ``ml_training`` Influx point tagged with the training and model ids.

    Args:
        container: Docker SDK container whose ``stats(stream=False)`` is read.
        training_id: Value of the ``training_id`` tag.
        model_id: Value of the ``model_id`` tag.
        samples: List mutated in place with one tuple per sample.
        stop_evt: Only ``is_set()`` is called on it.
        loop: Event loop on which the Influx write coroutine is scheduled.
        interval: Seconds between samples.
    """
    while not stop_evt.is_set():
        try:
            stats = container.stats(stream=False)
            cpu_now = stats["cpu_stats"]
            cpu_prev = stats["precpu_stats"]
            cpu_delta = cpu_now["cpu_usage"]["total_usage"] - cpu_prev["cpu_usage"]["total_usage"]
            sys_delta = cpu_now.get("system_cpu_usage", 0) - cpu_prev.get("system_cpu_usage", 0)
            # Fall back to the per-CPU list length when online_cpus is missing.
            online = cpu_now.get("online_cpus") or len(cpu_now["cpu_usage"].get("percpu_usage") or [1])
            cpu_pct = (cpu_delta / sys_delta) * online * 100.0 if sys_delta > 0 else 0.0
            mem_mb = (stats["memory_stats"].get("usage") or 0) / (1024 * 1024)
            samples.append((cpu_pct, mem_mb))
            point = (
                Point("ml_training")
                .tag("training_id", training_id)
                .tag("model_id", model_id)
                .field("cpu_pct", float(cpu_pct))
                .field("mem_mb", float(mem_mb))
            )
            asyncio.run_coroutine_threadsafe(influx_client.write_points([point]), loop)
        except Exception as e:
            log.warning("stats loop error: %s", e)
        # Sleep in short slices so the thread exits soon after stop_evt is set
        # instead of lingering for a full interval.
        wake_at = time.monotonic() + interval
        while not stop_evt.is_set():
            remaining = wake_at - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(0.25, remaining))
def _log_payload(text: str) -> dict:
    """Turn one output line into an event: the decoded JSON object, or a plain ``log`` event."""
    if text.startswith("{") and text.endswith("}"):
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            return decoded
    return {"type": "log", "level": "info", "message": text}


async def _stream_container_logs(container, training_id: str, model_id: str, stream_key: str):
    """Follow the container output and publish it line by line.

    Every non-empty line becomes one event on the Redis stream ``stream_key``.
    Lines that are JSON objects are published as-is; anything else is wrapped
    as an info-level ``log`` event. Events whose ``type`` is ``metric`` are
    also written to Influx as an ``ml_training`` point carrying each numeric
    field of the event.

    The Docker log stream is consumed as raw chunks, which are not guaranteed
    to line up with line boundaries, so bytes are buffered and split on
    newlines before decoding.
    """
    loop = asyncio.get_running_loop()
    chunks = await loop.run_in_executor(
        None, lambda: container.logs(stream=True, follow=True, stdout=True, stderr=True)
    )
    pending = b""
    while True:
        # next() blocks on the Docker socket, so it runs in the executor;
        # None marks the end of the stream.
        chunk = await loop.run_in_executor(None, next, chunks, None)
        if chunk is None:
            # Flush whatever is left without a trailing newline.
            ready = [pending] if pending else []
            pending = b""
        else:
            pending += chunk
            *ready, pending = pending.split(b"\n")
        for raw in ready:
            text = raw.decode("utf-8", errors="replace").rstrip("\r")
            if not text:
                continue
            payload = _log_payload(text)
            await _emit(stream_key, payload)
            if payload.get("type") != "metric":
                continue
            point = Point("ml_training").tag("training_id", training_id).tag("model_id", model_id)
            has_field = False
            for key, value in payload.items():
                if key != "type" and isinstance(value, (int, float)):
                    point = point.field(key, float(value))
                    has_field = True
            if not has_field:
                # A point without fields carries no data worth writing.
                continue
            try:
                await influx_client.write_points([point])
            except Exception as e:
                log.warning("influx write metric failed: %s", e)
        if chunk is None:
            break
async def run_training_job(training_id: str) -> None:
    """Run one training job end to end and record the outcome.

    Steps: mark the training as running, clone the model repository at the
    training's ``patch`` revision, read ``model.yml``, download the dataset,
    start the runner container with the work directory mounted at
    ``/workdir``, stream its output and resource usage, then upload the
    contents of ``out/`` and an archive of the event stream to MinIO and
    store the final status in Postgres and Redis.

    Errors raised while the job runs are caught: the training is marked
    ``failed`` and an ``end`` event is emitted. On every path the stats
    thread is told to stop, the container (if one was started) is removed,
    and the work directory is deleted.

    Args:
        training_id: Training UUID as a string.
    """
    r = redis_client.client()
    state_key = f"ml:train:{training_id}"
    stream_key = f"ml:train:{training_id}:events"
    tid = uuid.UUID(training_id)

    tr = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", tid)
    if not tr:
        log.error("training %s not found", training_id)
        return
    model = await db.fetchrow("SELECT * FROM models WHERE id = $1", tr["model_id"])
    if not model:
        await db.execute(
            "UPDATE trainings SET status='failed', error=$2 WHERE id=$1",
            tid, "model not found",
        )
        return

    await db.execute(
        "UPDATE trainings SET status='running', started_at=NOW() WHERE id=$1",
        tid,
    )
    await r.hset(state_key, mapping={"status": "running", "progress": "0", "message": "starting"})

    workdir = Path(settings.runner_tmp_dir) / training_id
    artifacts_prefix = f"models/{tr['model_id']}/{tr['version']}/{tr['patch']}"
    model_id = str(tr["model_id"])
    error: Optional[str] = None
    samples: list[tuple[float, float]] = []
    loop = asyncio.get_running_loop()
    stop_evt = asyncio.Event()
    container = None
    log_task = None

    async def _info(message: str) -> None:
        await _emit(stream_key, {"type": "log", "level": "info", "message": message})

    try:
        workdir.mkdir(parents=True, exist_ok=True)
        await _info("cloning repo")
        await _clone_repo(model["gitea_repo"], tr["patch"], workdir / "repo")

        await _info("parsing model.yml")
        spec = await fetch_and_parse_spec(model["gitea_repo"], tr["patch"]) or {}
        train_spec = spec.get("train") or {}
        entrypoint = train_spec.get("entrypoint") or "python -m src.train"
        resources = spec.get("resources") or {}
        # An explicit null in model.yml falls back to the default instead of
        # crashing int()/float().
        mem_mb = resources.get("mem_mb")
        cpus = resources.get("cpu")

        await _info("downloading dataset")
        dataset_path = await _download_dataset(str(tr["dataset_id"]), workdir)
        out_dir = workdir / "out"
        out_dir.mkdir(exist_ok=True)

        dc = _docker_client()
        await _info("starting container")

        def _start_container():
            # Only keyword arguments accepted by containers.run() are passed;
            # the SDK raises TypeError on unknown ones.
            return dc.containers.run(
                settings.runner_image,
                command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt 2>&1 || true && {entrypoint}"],
                detach=True,
                working_dir="/workdir/repo",
                environment={
                    "MEB_DATASET_PATH": f"/workdir/{Path(dataset_path).name}",
                    "MEB_ARTIFACTS_DIR": "/workdir/out",
                    "MEB_TRAINING_ID": training_id,
                },
                volumes={str(workdir): {"bind": "/workdir", "mode": "rw"}},
                network_mode="none",
                mem_limit=f"{int(2048 if mem_mb is None else mem_mb)}m",
                nano_cpus=int(float(1 if cpus is None else cpus) * 1e9),
                read_only=False,
                tty=False,
            )

        # Docker SDK calls are blocking; keep them off the event loop.
        container = await loop.run_in_executor(None, _start_container)

        # The stats thread is not awaited: it exits on its own once stop_evt
        # is set (see the finally block).
        loop.run_in_executor(
            None, _stats_loop_sync, container, training_id, model_id, samples, stop_evt, loop
        )
        log_task = asyncio.create_task(
            _stream_container_logs(container, training_id, model_id, stream_key)
        )

        # Wait for the container to exit, then let the log stream drain.
        exit_code = await loop.run_in_executor(None, lambda: container.wait()["StatusCode"])
        stop_evt.set()
        await log_task

        if exit_code != 0:
            error = f"container exited with code {exit_code}"

        # Collect the final metrics written by the user code, if any.
        results: dict = {}
        final_metrics_path = out_dir / "metrics.json"
        if final_metrics_path.exists():
            try:
                results = json.loads(final_metrics_path.read_text())
            except Exception:
                results = {"raw": final_metrics_path.read_text()[:10000]}

        def _upload_outputs() -> None:
            # Upload every file under out/ beneath the artifacts prefix.
            for p in out_dir.rglob("*"):
                if p.is_file():
                    rel = p.relative_to(out_dir).as_posix()
                    minio_client.put_bytes(f"{artifacts_prefix}/{rel}", p.read_bytes())

        await loop.run_in_executor(None, _upload_outputs)

        # Copy the Redis event stream to MinIO as JSON lines for persistence.
        try:
            entries = await r.xrange(stream_key, min="-", max="+")
            records = []
            for entry_id, fields in entries:
                if "payload" in fields:
                    body = {"payload": json.loads(fields["payload"])}
                else:
                    body = fields
                records.append(json.dumps({"id": entry_id, **body}))
            archive = "\n".join(records).encode("utf-8")
            await loop.run_in_executor(
                None, minio_client.put_bytes,
                f"trainings/{training_id}/logs.jsonl", archive, "application/x-ndjson",
            )
        except Exception as e:
            log.warning("log archive failed: %s", e)

        cpu_avg = sum(s[0] for s in samples) / len(samples) if samples else 0.0
        cpu_peak = max((s[0] for s in samples), default=0.0)
        mem_avg = sum(s[1] for s in samples) / len(samples) if samples else 0.0
        mem_peak = max((s[1] for s in samples), default=0.0)
        resource_summary = {
            "cpu_avg": round(cpu_avg, 2),
            "cpu_peak": round(cpu_peak, 2),
            "mem_avg_mb": round(mem_avg, 2),
            "mem_peak_mb": round(mem_peak, 2),
            "samples": len(samples),
        }

        status = "failed" if error else "succeeded"
        await db.execute(
            """
            UPDATE trainings SET
            status=$2,
            finished_at=NOW(),
            duration_ms=EXTRACT(EPOCH FROM (NOW() - started_at))*1000,
            artifacts_prefix=$3,
            results=$4::jsonb,
            resource_summary=$5::jsonb,
            error=$6
            WHERE id=$1
            """,
            tid,
            status,
            artifacts_prefix,
            json.dumps(results),
            json.dumps(resource_summary),
            error,
        )
        await r.hset(state_key, mapping={"status": status, "progress": "100", "message": error or "done"})
        await _emit(stream_key, {"type": "end", "status": status, "error": error})
        # Flush the Influx points buffered during the training (batched writes).
        await influx_client.flush()
    except Exception as e:
        log.exception("training %s failed: %s", training_id, e)
        await db.execute(
            "UPDATE trainings SET status='failed', finished_at=NOW(), error=$2 WHERE id=$1",
            tid, str(e)[:1000],
        )
        await r.hset(state_key, mapping={"status": "failed", "message": str(e)[:200]})
        await _emit(stream_key, {"type": "end", "status": "failed", "error": str(e)[:400]})
    finally:
        # Runs on success and on failure, so that neither the stats thread,
        # the log task, the container nor the work directory is leaked.
        stop_evt.set()
        if log_task is not None and not log_task.done():
            log_task.cancel()
        if container is not None:
            try:
                await loop.run_in_executor(None, lambda: container.remove(force=True))
            except Exception as e:
                log.warning("container remove failed for training %s: %s", training_id, e)
        shutil.rmtree(workdir, ignore_errors=True)
async def run_test_once(training_id: str, inputs: dict) -> dict:
    """Run a single prediction in a freshly spawned runner container.

    The repository is cloned at the training's ``patch`` revision, the
    training artifacts (if any) are downloaded next to it, and the test
    entrypoint is started with the work directory mounted read-only.
    ``{"inputs": ...}`` is written as one JSON line to the container's stdin,
    and the first JSON-object line on stdout that contains an ``outputs`` key
    provides the result.

    Args:
        training_id: Training UUID as a string.
        inputs: JSON-serialisable inputs for the prediction.

    Returns:
        A dict with ``outputs``, ``exit_code``, ``cpu_peak``, ``mem_peak_mb``
        and ``raw_log`` (the last 2000 characters of stdout).

    Raises:
        RuntimeError: If the training does not exist.
    """
    tr = await db.fetchrow(
        "SELECT t.*, m.gitea_repo FROM trainings t JOIN models m ON t.model_id = m.id WHERE t.id=$1",
        uuid.UUID(training_id),
    )
    if not tr:
        raise RuntimeError("training not found")

    spec = await fetch_and_parse_spec(tr["gitea_repo"], tr["patch"]) or {}
    test_spec = spec.get("test") or {}
    entrypoint = test_spec.get("entrypoint") or "python -m src.predict"

    workdir = Path(settings.runner_tmp_dir) / f"test-{uuid.uuid4()}"
    workdir.mkdir(parents=True, exist_ok=True)
    loop = asyncio.get_running_loop()
    container = None
    peak_cpu = 0.0
    peak_mem = 0.0
    stop = False
    try:
        await _clone_repo(tr["gitea_repo"], tr["patch"], workdir / "repo")

        prefix = tr["artifacts_prefix"]
        if prefix:
            def _fetch_artifacts() -> None:
                # Mirror every object under the prefix into workdir/artifacts.
                art_dir = workdir / "artifacts"
                art_dir.mkdir(exist_ok=True)
                for obj in minio_client.list_prefix(prefix + "/"):
                    rel = obj["name"][len(prefix) + 1:]
                    out_path = art_dir / rel
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                    out_path.write_bytes(minio_client.get_bytes(obj["name"]))

            # MinIO calls are blocking; keep them off the event loop.
            await loop.run_in_executor(None, _fetch_artifacts)

        dc = _docker_client()
        payload = json.dumps({"inputs": inputs}).encode()

        def _start_container():
            return dc.containers.run(
                settings.runner_image,
                # ">/dev/null 2>&1" silences both pip streams. The reverse
                # order would send pip's stderr to the stdout parsed below.
                command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt >/dev/null 2>&1 || true && {entrypoint}"],
                detach=True,
                working_dir="/workdir/repo",
                environment={
                    "MEB_ARTIFACTS_DIR": "/workdir/artifacts",
                    "MEB_TRAINING_ID": training_id,
                },
                volumes={str(workdir): {"bind": "/workdir", "mode": "ro"}},
                network_mode="none",
                mem_limit="2048m",
                nano_cpus=int(1e9),
                stdin_open=True,
                tty=False,
            )

        container = await loop.run_in_executor(None, _start_container)

        # Write the inputs to the container's stdin through the attach socket.
        sock = container.attach_socket(params={"stdin": 1, "stream": 1})
        try:
            sock._sock.sendall(payload + b"\n")
        except Exception as e:
            # Still best effort, but no longer silent.
            log.warning("test for training %s: writing inputs to stdin failed: %s", training_id, e)
        finally:
            try:
                sock.close()
            except Exception as e:
                log.debug("closing attach socket failed: %s", e)

        def _stats() -> None:
            # Track peak CPU% and memory from the streaming stats endpoint.
            nonlocal peak_cpu, peak_mem
            for st in container.stats(stream=True, decode=True):
                if stop:
                    return
                try:
                    cpu_delta = st["cpu_stats"]["cpu_usage"]["total_usage"] - st["precpu_stats"]["cpu_usage"]["total_usage"]
                    sys_delta = st["cpu_stats"].get("system_cpu_usage", 0) - st["precpu_stats"].get("system_cpu_usage", 0)
                    online = st["cpu_stats"].get("online_cpus") or 1
                    cpu_pct = (cpu_delta / sys_delta) * online * 100 if sys_delta > 0 else 0
                    mem_mb = (st["memory_stats"].get("usage") or 0) / (1024 * 1024)
                    peak_cpu = max(peak_cpu, cpu_pct)
                    peak_mem = max(peak_mem, mem_mb)
                except Exception:
                    # Incomplete stats frames are skipped.
                    pass

        loop.run_in_executor(None, _stats)
        exit_info = await loop.run_in_executor(None, container.wait)
        stop = True
        raw_logs = await loop.run_in_executor(
            None, lambda: container.logs(stdout=True, stderr=False)
        )
        logs = raw_logs.decode("utf-8", errors="replace")

        outputs: dict = {}
        for line in logs.strip().splitlines():
            line = line.strip()
            if line.startswith("{") and line.endswith("}"):
                try:
                    obj = json.loads(line)
                    if "outputs" in obj:
                        outputs = obj["outputs"]
                        break
                except json.JSONDecodeError:
                    continue

        return {
            "outputs": outputs,
            "exit_code": exit_info.get("StatusCode"),
            "cpu_peak": round(peak_cpu, 2),
            "mem_peak_mb": round(peak_mem, 2),
            "raw_log": logs[-2000:],
        }
    finally:
        # Runs on every path so a failure after the container started does
        # not leave the container or the stats thread behind.
        stop = True
        if container is not None:
            try:
                await loop.run_in_executor(None, lambda: container.remove(force=True))
            except Exception as e:
                log.warning("container remove failed for test of training %s: %s", training_id, e)
        shutil.rmtree(workdir, ignore_errors=True)

57
ml/core/gitea.py Normal file
View File

@@ -0,0 +1,57 @@
"""Client Gitea: browse repo, branches, commits, file raw, clone URL autenticato."""
from __future__ import annotations
from typing import Optional
import httpx
from core.config import settings
def _headers() -> dict:
    """Build the headers for Gitea API calls: JSON ``Accept``, plus a token ``Authorization`` header when a token is configured."""
    if not settings.gitea_token:
        return {"Accept": "application/json"}
    return {
        "Accept": "application/json",
        "Authorization": f"token {settings.gitea_token}",
    }
def clone_url(owner_repo: str) -> str:
    """Return the git clone URL for ``owner_repo`` (server-side use only).

    When a Gitea token is configured and the base URL is http(s), the URL has
    the form ``<scheme>://oauth2:TOKEN@<host>/owner/repo.git``. The token is
    percent-encoded so characters such as ``@`` or ``/`` cannot corrupt the
    URL. Without a token the plain base URL is used.

    Args:
        owner_repo: ``owner/repo`` path on the Gitea server.

    Raises:
        RuntimeError: If ``GITEA_URL`` is not configured.
    """
    from urllib.parse import quote, urlsplit

    if not settings.gitea_url:
        raise RuntimeError("GITEA_URL not configured")
    base = settings.gitea_url.rstrip("/")
    if settings.gitea_token:
        parts = urlsplit(base)
        # Credentials are injected for http(s) URLs only; anything else is
        # left untouched.
        if parts.scheme in ("http", "https") and parts.netloc:
            credentials = f"oauth2:{quote(settings.gitea_token, safe='')}"
            base = f"{parts.scheme}://{credentials}@{parts.netloc}{parts.path}"
    return f"{base}/{owner_repo}.git"
async def _get(path: str, params: Optional[dict] = None) -> list | dict:
    """GET ``/api/v1{path}`` on the configured Gitea server and return the decoded JSON.

    Raises:
        httpx.HTTPStatusError: If the response status is an error status.
    """
    api_root = settings.gitea_url.rstrip("/")
    target = f"{api_root}/api/v1{path}"
    # A short-lived client is opened per call, with a 15 second timeout.
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.get(target, params=params, headers=_headers())
        response.raise_for_status()
        return response.json()
async def list_repos(limit: int = 50) -> list[dict]:
    """Return the ``data`` list from ``/repos/search``; an empty list when the reply is not a JSON object."""
    found = await _get("/repos/search", params={"limit": str(limit)})
    if not isinstance(found, dict):
        return []
    return found.get("data", [])
async def list_branches(owner_repo: str) -> list[dict]:
    """Return the decoded response of ``/repos/{owner_repo}/branches``."""
    endpoint = f"/repos/{owner_repo}/branches"
    return await _get(endpoint)
async def list_commits(owner_repo: str, branch: str = "main", limit: int = 50) -> list[dict]:
    """Return the decoded response of ``/repos/{owner_repo}/commits``, passing ``branch`` as ``sha`` and ``limit`` as a string."""
    endpoint = f"/repos/{owner_repo}/commits"
    query = {"sha": branch, "limit": str(limit)}
    return await _get(endpoint, params=query)
async def get_file_raw(owner_repo: str, ref: str, path: str) -> bytes:
    """Download the raw bytes of ``path`` in ``owner_repo`` at revision ``ref``.

    Raises:
        httpx.HTTPStatusError: If the response status is an error status.
    """
    api_root = settings.gitea_url.rstrip("/")
    target = f"{api_root}/api/v1/repos/{owner_repo}/raw/{path}"
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.get(target, params={"ref": ref}, headers=_headers())
        response.raise_for_status()
        return response.content

75
ml/core/influx_client.py Normal file
View File

@@ -0,0 +1,75 @@
"""Client InfluxDB (influxdb-client sync wrapper in thread-pool per async).
Le scritture usano il batching async dell'SDK invece di SYNCHRONOUS.
Le metriche di training arrivano in burst (logs container, stats loop ogni 5s):
con SYNCHRONOUS ogni write era una HTTP request bloccante. Con WriteOptions
batched, l'SDK accumula i Point e fa flush periodico in background, senza
perdere durabilità (flush forzato a fine training).
"""
from __future__ import annotations
import asyncio
from typing import Iterable, Optional
from influxdb_client import InfluxDBClient, Point, WriteOptions
from core.config import settings
# InfluxDB client and its batching write API; both are created together on
# the first client() call.
_client: Optional[InfluxDBClient] = None
_write_api = None


def client() -> InfluxDBClient:
    """Return the shared InfluxDB client, creating it and its batching write API on first call."""
    global _client, _write_api
    if _client is not None:
        return _client
    _client = InfluxDBClient(
        url=settings.influx_url,
        token=settings.influx_token,
        org=settings.influx_org,
    )
    batching = WriteOptions(
        batch_size=200,
        flush_interval=2_000,
        jitter_interval=200,
        retry_interval=2_000,
        max_retries=3,
    )
    _write_api = _client.write_api(write_options=batching)
    return _client
def _wa():
    """Return the shared write API, making sure the client has been created first."""
    # Called for its side effect: the first call also builds _write_api.
    client()
    write_api = _write_api
    return write_api
async def write_points(points: Iterable[Point]) -> None:
    """Hand ``points`` to the batching write API from a worker thread.

    The points go to ``settings.influx_bucket`` under ``settings.influx_org``.
    """
    write_api = _wa()
    batch = list(points)
    await asyncio.to_thread(write_api.write, settings.influx_bucket, settings.influx_org, batch)
async def flush() -> None:
    """Force a flush of the batched write buffer.

    Meant to be called at the end of a training so the collected metrics are
    persisted. Does nothing when no write API has been created yet. A failed
    flush is logged as a warning and not raised, so it never fails the
    caller.
    """
    import logging

    if _write_api is None:
        return
    try:
        await asyncio.to_thread(_write_api.flush)
    except Exception as exc:
        # Still best effort, but a lost final flush is now visible in the logs.
        logging.getLogger(__name__).warning("influx flush failed: %s", exc)
async def query_flux(flux: str) -> list[dict]:
    """Run a Flux query in a worker thread and flatten the records into dicts.

    Each record becomes a dict with ``time`` (ISO string or ``None``),
    ``measurement``, ``field``, ``value`` and ``tags``. ``tags`` holds every
    record value whose key does not start with ``_`` and is neither
    ``result`` nor ``table``.
    """
    influx = client()

    def _run_query() -> list[dict]:
        rows = []
        for table in influx.query_api().query(flux, org=settings.influx_org):
            for record in table.records:
                stamp = record.get_time()
                tags = {
                    key: val
                    for key, val in record.values.items()
                    if not key.startswith("_") and key not in ("result", "table")
                }
                rows.append({
                    "time": stamp.isoformat() if stamp else None,
                    "measurement": record.get_measurement(),
                    "field": record.get_field(),
                    "value": record.get_value(),
                    "tags": tags,
                })
        return rows

    return await asyncio.to_thread(_run_query)

118
ml/core/minio_client.py Normal file
View File

@@ -0,0 +1,118 @@
"""Wrapper MinIO: bucket unico (settings.minio_bucket) con prefissi logici.
Prefissi usati:
datasets/<uuid>.<ext>
models/<model_id>/spec.yml
models/<model_id>/<version>/<patch>/... (artefatti training)
trainings/<training_id>/logs.jsonl
"""
from __future__ import annotations
import io
from datetime import timedelta
from typing import Iterable, Optional
from minio import Minio
from minio.error import S3Error
from core.config import settings
# MinIO client shared by the module; created on first use.
_client: Optional[Minio] = None


def client() -> Minio:
    """Return the shared MinIO client, creating it from ``settings`` on first call."""
    global _client
    if _client is not None:
        return _client
    address = f"{settings.minio_endpoint}:{settings.minio_port}"
    _client = Minio(
        address,
        access_key=settings.minio_access_key,
        secret_key=settings.minio_secret_key,
        secure=settings.minio_use_ssl,
    )
    return _client
def _bucket(b: Optional[str] = None) -> str:
    """Return ``b`` when it is truthy, otherwise the configured default bucket."""
    if b:
        return b
    return settings.minio_bucket
def ensure_bucket(bucket: Optional[str] = None) -> None:
    """Create the bucket (the default one when ``bucket`` is omitted) if it does not exist yet."""
    target = _bucket(bucket)
    mc = client()
    if mc.bucket_exists(target):
        return
    mc.make_bucket(target)
def put_bytes(key: str, data: bytes, content_type: str = "application/octet-stream",
              bucket: Optional[str] = None) -> None:
    """Upload ``data`` as the object ``key``, making sure the bucket exists first."""
    ensure_bucket(bucket)
    body = io.BytesIO(data)
    client().put_object(
        _bucket(bucket), key, body, length=len(data), content_type=content_type
    )
def put_stream(key: str, stream, length: int, content_type: str = "application/octet-stream",
               bucket: Optional[str] = None) -> None:
    """Upload ``length`` bytes read from ``stream`` as the object ``key``, making sure the bucket exists first."""
    ensure_bucket(bucket)
    target = _bucket(bucket)
    client().put_object(target, key, stream, length=length, content_type=content_type)
def get_bytes(key: str, bucket: Optional[str] = None) -> bytes:
    """Read the whole object ``key`` into memory and return its bytes.

    The response is always closed and its connection released, even when the
    read fails.
    """
    response = client().get_object(_bucket(bucket), key)
    try:
        body = response.read()
    finally:
        response.close()
        response.release_conn()
    return body
def remove(key: str, bucket: Optional[str] = None) -> None:
    """Delete the object ``key``; an ``S3Error`` from the server is ignored."""
    target = _bucket(bucket)
    try:
        client().remove_object(target, key)
    except S3Error:
        return
def remove_prefix(prefix: str, bucket: Optional[str] = None) -> int:
    """Delete every object under ``prefix`` (recursively) and return how many were removed.

    Objects whose deletion raises ``S3Error`` are skipped and not counted.
    """
    target = _bucket(bucket)
    removed = 0
    for item in client().list_objects(target, prefix=prefix, recursive=True):
        try:
            client().remove_object(target, item.object_name)
        except S3Error:
            continue
        removed += 1
    return removed
def presigned_get(key: str, expires_seconds: int = 3600, bucket: Optional[str] = None) -> str:
    """Return a presigned GET URL for ``key`` that expires after ``expires_seconds``."""
    lifetime = timedelta(seconds=expires_seconds)
    target = _bucket(bucket)
    return client().presigned_get_object(target, key, expires=lifetime)
def list_prefix(prefix: str, bucket: Optional[str] = None) -> list[dict]:
    """List the objects under ``prefix`` (recursively) as plain dicts.

    Each dict has ``name``, ``size``, ``last_modified`` (ISO string or
    ``None``) and ``etag``.
    """
    listing = client().list_objects(_bucket(bucket), prefix=prefix, recursive=True)
    return [
        {
            "name": item.object_name,
            "size": item.size,
            "last_modified": item.last_modified.isoformat() if item.last_modified else None,
            "etag": item.etag,
        }
        for item in listing
    ]
def check() -> bool:
    """Return ``True`` when listing buckets succeeds, ``False`` on any exception."""
    try:
        client().list_buckets()
    except Exception:
        return False
    return True

90
ml/core/model_spec.py Normal file
View File

@@ -0,0 +1,90 @@
"""Parse e validazione del contratto `model.yml` nelle repo utente.
Schema sintetico (vedi piano):
name, type, version, python
train: {entrypoint, inputs, outputs, metrics}
test: {entrypoint, io, input_schema[], output_schema[]}
resources: {cpu, mem_mb, gpu}
"""
from __future__ import annotations
from typing import Any, Optional
import yaml
from pydantic import BaseModel, ValidationError
from core import gitea, redis_client
class _FieldSpec(BaseModel):
    """One entry of a test ``input_schema`` / ``output_schema`` list."""

    # Required.
    name: str
    dtype: str
    # Optional; all default to None.
    min: Optional[float] = None
    max: Optional[float] = None
    unit: Optional[str] = None
class _Train(BaseModel):
    """The ``train`` section of ``model.yml``."""

    # Required. The runner appends it to a shell command line; when it is
    # missing or empty the runner substitutes its own default.
    entrypoint: str
    # Free-form mappings; their contents are not validated here.
    inputs: dict = {}
    outputs: dict = {}
    metrics: dict = {}
class _Test(BaseModel):
    """The optional ``test`` section of ``model.yml``."""

    # Required whenever the section is present.
    entrypoint: str
    # I/O mode label; defaults to "stdio_json".
    io: str = "stdio_json"
    # Declared input and output fields; both default to empty lists.
    input_schema: list[_FieldSpec] = []
    output_schema: list[_FieldSpec] = []
class ModelSpec(BaseModel):
    """Top-level schema of the ``model.yml`` contract."""

    # Required.
    name: str
    type: str
    # Optional, with defaults.
    version: str = "0.1.0"
    python: str = "3.11"
    # The train section is mandatory; the test section may be omitted.
    train: _Train
    test: Optional[_Test] = None
    # Free-form mapping (the module docstring lists cpu, mem_mb, gpu); its
    # contents are not validated here.
    resources: dict = {}
def parse_yaml(content: bytes | str) -> dict:
    """Parse ``model.yml`` content into a validated plain dict.

    Args:
        content: YAML text, or UTF-8 encoded bytes.

    Returns:
        The validated spec as returned by ``ModelSpec.model_dump()``.

    Raises:
        ValueError: If the content is not valid UTF-8, is not valid YAML, is
            not a mapping at the top level, or does not satisfy ``ModelSpec``.
    """
    try:
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        raw = yaml.safe_load(content) or {}
    except (UnicodeDecodeError, yaml.YAMLError) as e:
        raise ValueError(f"invalid model.yml: {e}") from e
    # A YAML list or scalar would make ModelSpec(**raw) raise TypeError,
    # which is outside the ValueError contract documented above.
    if not isinstance(raw, dict):
        raise ValueError(
            f"invalid model.yml: top level must be a mapping, got {type(raw).__name__}"
        )
    try:
        spec = ModelSpec(**raw)
    except (ValidationError, TypeError) as e:
        # TypeError covers mappings with non-string keys.
        raise ValueError(f"invalid model.yml: {e}") from e
    return spec.model_dump()
async def fetch_and_parse_spec(owner_repo: str, ref: str) -> Optional[dict]:
    """Fetch ``model.yml`` from the repository at ``ref`` and parse it.

    Parsed specs are cached in Redis under ``ml:modelspec:{repo}:{ref}`` for
    one hour; cache read and write failures are ignored. ``model.yaml`` is
    tried when ``model.yml`` cannot be fetched.

    Returns:
        The parsed spec, or ``None`` when neither file can be fetched.

    Raises:
        ValueError: If the fetched file is not a valid spec.
    """
    import json

    cache_key = f"ml:modelspec:{owner_repo}:{ref}"
    try:
        hit = await redis_client.client().get(cache_key)
        if hit:
            return json.loads(hit)
    except Exception:
        pass

    for filename in ("model.yml", "model.yaml"):
        try:
            raw = await gitea.get_file_raw(owner_repo, ref, filename)
        except Exception:
            continue
        break
    else:
        # Neither file name could be fetched.
        return None

    spec = parse_yaml(raw)
    try:
        await redis_client.client().set(cache_key, json.dumps(spec), ex=3600)
    except Exception:
        pass
    return spec

29
ml/core/redis_client.py Normal file
View File

@@ -0,0 +1,29 @@
"""Client Redis asincrono (redis-py asyncio). Singleton semplice."""
from __future__ import annotations
from typing import Optional
import redis.asyncio as redis
from core.config import settings
# Async Redis client shared by the module; created on first use.
_client: Optional[redis.Redis] = None


def client() -> redis.Redis:
    """Return the shared async Redis client, creating it from ``settings`` on first call."""
    global _client
    if _client is not None:
        return _client
    options = dict(
        host=settings.redis_host,
        port=settings.redis_port,
        decode_responses=True,
        health_check_interval=30,
    )
    _client = redis.Redis(**options)
    return _client
async def close() -> None:
    """Close the shared Redis client, if one exists, and reset the module state."""
    global _client
    if _client is None:
        return
    await _client.aclose()
    _client = None

54
ml/core/worker.py Normal file
View File

@@ -0,0 +1,54 @@
"""Worker loop: BRPOP da ml:queue:train e dispatch al docker_runner.
Parte N task asincroni concorrenti (settings.train_concurrency).
"""
from __future__ import annotations
import asyncio
import logging
from core import redis_client
from core.config import settings
from core.docker_runner import run_training_job
log = logging.getLogger(__name__)
_tasks: list[asyncio.Task] = []
async def _worker_loop(idx: int):
    """Consume training ids from ``ml:queue:train`` forever and run each job.

    Args:
        idx: Worker index, used only in log messages.

    A failing BRPOP is logged and retried after two seconds; a failing job is
    logged with its traceback and the loop moves on to the next item.
    """
    conn = redis_client.client()
    log.info("ml worker[%d] started", idx)
    while True:
        try:
            popped = await conn.brpop("ml:queue:train", timeout=10)
        except Exception as exc:
            log.warning("brpop error: %s", exc)
            await asyncio.sleep(2)
            continue
        if popped is not None:
            # BRPOP yields a (queue name, value) pair; only the value matters.
            _, training_id = popped
            log.info("worker[%d] picked training %s", idx, training_id)
            try:
                await run_training_job(training_id)
            except Exception:
                log.exception("worker[%d] training %s crashed", idx, training_id)
def start_workers() -> None:
    """Spawn ``settings.train_concurrency`` worker tasks (at least one).

    The created tasks are appended to the module-level ``_tasks`` list.
    """
    global _tasks
    worker_count = max(1, settings.train_concurrency)
    _tasks.extend(
        asyncio.create_task(_worker_loop(i)) for i in range(worker_count)
    )
async def stop_workers() -> None:
    """Cancel every worker task, wait for all of them to finish, and clear the registry.

    Errors and cancellations raised by the individual tasks are collected and
    discarded so that shutdown always completes.
    """
    for t in _tasks:
        t.cancel()
    # Awaiting a cancelled task raises asyncio.CancelledError, which is a
    # BaseException (not an Exception) since Python 3.8, so an
    # ``except Exception`` guard would let it abort shutdown on the first task.
    # gather(return_exceptions=True) returns cancellations/errors as results.
    await asyncio.gather(*_tasks, return_exceptions=True)
    _tasks.clear()

View File

@@ -1,19 +1,90 @@
from fastapi import FastAPI, Request, Response, Header
from fastapi.responses import HTMLResponse, JSONResponse
import time
"""ml-service — FastAPI entrypoint.
Monta:
/ → RedirectResponse
/datasets /models /train /test /results → pagine Jinja
/api/datasets /api/models /api/repos /api/trainings /api/tests /api/results → JSON
/api/trainings/{id}/events → SSE
/health → check
/static/* → file statici
"""
from __future__ import annotations
import logging
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from core import db, minio_client, redis_client, worker
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
log = logging.getLogger(__name__)
STATIC_DIR = Path(__file__).resolve().parent / "static"
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: open resources on startup, release them on shutdown.

    Startup initialises the DB pool, tries to ensure the MinIO bucket (failure
    is only logged) and starts the training workers. Shutdown stops the workers
    and closes the DB pool and Redis client.
    """
    log.info("ml-service starting")
    await db.init_pool()
    try:
        try:
            minio_client.ensure_bucket()
        except Exception as e:
            log.warning("minio bucket ensure failed: %s", e)
        worker.start_workers()
        yield
    finally:
        # Nested finally blocks: a failure in one shutdown step (or an
        # exception thrown into the generator at ``yield``) must not prevent
        # the remaining resources from being released.
        log.info("ml-service stopping")
        try:
            await worker.stop_workers()
        finally:
            try:
                await db.close_pool()
            finally:
                await redis_client.close()
app = FastAPI(title="MEB ML Service", lifespan=lifespan)
# static
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
app = FastAPI()
@app.get("/health")
def health():
async def health():
pg_ok = True
try:
await db.fetchrow("SELECT 1")
except Exception:
pg_ok = False
redis_ok = True
try:
await redis_client.client().ping()
except Exception:
redis_ok = False
return {
"status": "ok",
"status": "ok" if (pg_ok and redis_ok) else "degraded",
"service": "ml",
"version": "1.0.0",
"build_number": "1",
"version_state": "dev"
"postgres": "connected" if pg_ok else "disconnected",
"redis": "connected" if redis_ok else "disconnected",
"minio": "connected" if minio_client.check() else "disconnected",
"version": "2.0.0",
}
@app.get("/")
def root():
return {"message": "ML Service"}
from routers import ( # noqa: E402
datasets,
models,
pages,
repos,
results,
tests,
trainings,
trainings_stream,
)
app.include_router(pages.router)
app.include_router(datasets.router)
app.include_router(models.router)
app.include_router(repos.router)
app.include_router(trainings.router)
app.include_router(trainings_stream.router)
app.include_router(tests.router)
app.include_router(results.router)

View File

@@ -1,3 +1,15 @@
fastapi
uvicorn
uvicorn[standard]
PyJWT
asyncpg
redis>=5
minio
influxdb-client
docker
PyYAML
pydantic>=2
python-multipart
jinja2
aiofiles
httpx
sse-starlette

160
ml/routers/datasets.py Normal file
View File

@@ -0,0 +1,160 @@
"""API datasets (ml.mebboat.it/api/datasets).
Upload/list/get/download/delete. Storage:
MinIO bucket "ml" con key "datasets/<uuid>.<ext>"
Postgres db "ml" tabella "datasets"
"""
from __future__ import annotations
import json
import uuid
from typing import Optional
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from core import db, minio_client
from core.auth import require_auth
router = APIRouter(prefix="/api/datasets", tags=["datasets"])
# Bucket MinIO fisso per tutti i dataset (no prefix nelle key).
BUCKET = "ml.datasets"
_EXT = {"csv": "csv", "json": "json", "netcdf": "nc"}
def _row(r) -> dict:
if r is None:
return None
d = dict(r)
# asyncpg ritorna JSONB come dict già; date/time come datetime
for k in ("created_at", "updated_at", "start_date", "end_date"):
if d.get(k) is not None and hasattr(d[k], "isoformat"):
d[k] = d[k].isoformat()
return d
@router.get("")
async def list_datasets(
    type: Optional[str] = Query(None),
    tags: Optional[str] = Query(None),
    mine: Optional[int] = Query(None),
    search: Optional[str] = Query(None),
    user=Depends(require_auth),
):
    """List up to 500 datasets, newest first, with optional filters.

    Filters: exact ``type``; ``tags`` (comma-separated, array overlap);
    ``mine`` (only rows created by the caller); ``search`` (case-insensitive
    substring on name or description).
    """
    params: list = []
    clauses: list = []

    def bind(value) -> str:
        """Register a query argument and return its positional placeholder."""
        params.append(value)
        return f"${len(params)}"

    if type:
        clauses.append(f"type = {bind(type)}")
    if tags:
        wanted = [t.strip() for t in tags.split(",") if t.strip()]
        if wanted:
            clauses.append(f"tags && {bind(wanted)}")
    if mine and user.get("username"):
        clauses.append(f"created_by = {bind(user['username'])}")
    if search:
        pattern = bind(f"%{search}%")
        clauses.append(f"(nome ILIKE {pattern} OR description ILIKE {pattern})")

    sql = "SELECT * FROM datasets"
    if clauses:
        sql += " WHERE " + " AND ".join(clauses)
    sql += " ORDER BY created_at DESC LIMIT 500"

    rows = await db.fetch(sql, *params)
    return {"count": len(rows), "datasets": [_row(r) for r in rows]}
@router.post("", status_code=201)
async def upload_dataset(
    file: UploadFile = File(...),
    metadata: str = Form("{}"),
    user=Depends(require_auth),
):
    """Store an uploaded file in MinIO and register it in the ``datasets`` table.

    Args:
        file: The uploaded dataset file.
        metadata: JSON object (as a form string) with optional descriptive fields.

    Returns:
        The inserted dataset row.

    Raises:
        HTTPException: 400 when ``metadata`` is not valid JSON or not a JSON object.
    """
    try:
        meta = json.loads(metadata or "{}")
    except json.JSONDecodeError as e:
        raise HTTPException(400, "metadata must be valid JSON") from e
    # A JSON array/scalar would otherwise fail on ``meta.get`` with a 500.
    if not isinstance(meta, dict):
        raise HTTPException(400, "metadata must be a JSON object")

    # Unknown formats fall back to csv.
    fmt = meta.get("format") or meta.get("type") or "csv"
    if fmt not in ("csv", "json", "netcdf"):
        fmt = "csv"
    ext = _EXT[fmt]

    ds_id = str(uuid.uuid4())
    file_key = f"{ds_id}.{ext}"
    data = await file.read()
    minio_client.put_bytes(file_key, data, content_type=file.content_type or "application/octet-stream", bucket=BUCKET)

    created_by = user.get("username") or meta.get("created_by") or "unknown"
    try:
        row = await db.fetchrow(
            """
            INSERT INTO datasets (
                id, file_key, nome, description, tags, type, format, notes,
                created_by, size_bytes, copernicus_id
            ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
            RETURNING *
            """,
            uuid.UUID(ds_id),
            file_key,
            meta.get("nome") or file.filename or file_key,
            meta.get("description"),
            meta.get("tags") or [],
            meta.get("dataset_type") or "custom",
            fmt,
            meta.get("notes"),
            created_by,
            len(data),
            meta.get("copernicus_id") or meta.get("copernicus_dataset_id"),
        )
    except Exception:
        # The object is already in MinIO; remove it (best effort) so a failed
        # insert does not leave an orphaned file with no DB row.
        try:
            minio_client.remove(file_key, bucket=BUCKET)
        except Exception:
            pass
        raise
    return _row(row)
@router.get("/{dataset_id}")
async def get_dataset(dataset_id: str, user=Depends(require_auth)):
    """Return one dataset row by id.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
@router.get("/{dataset_id}/download")
async def download_dataset(dataset_id: str, user=Depends(require_auth)):
    """Return a presigned MinIO URL (valid for one hour) for the dataset file.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT file_key FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    url = minio_client.presigned_get(row["file_key"], 3600, bucket=BUCKET)
    return {"url": url, "expires_in": 3600}
@router.patch("/{dataset_id}")
async def patch_dataset(dataset_id: str, body: dict, user=Depends(require_auth)):
    """Update the editable fields of a dataset and return the updated row.

    Only ``nome``, ``description``, ``tags`` and ``notes`` are accepted; other
    keys in ``body`` are ignored.

    Raises:
        HTTPException: 400 when no editable field is supplied; 404 when the id
            is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None

    allowed = {"nome", "description", "tags", "notes"}
    sets = []
    args: list = []
    for k, v in body.items():
        # Column names come only from the allow-list, so interpolating them
        # into the SQL text is safe; values stay positional parameters.
        if k in allowed:
            args.append(v)
            sets.append(f"{k} = ${len(args)}")
    if not sets:
        raise HTTPException(400, "no fields to update")
    # No updated_at trigger exists in the DB: set it manually.
    sets.append("updated_at = NOW()")
    args.append(ds_uuid)
    row = await db.fetchrow(
        f"UPDATE datasets SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
        *args,
    )
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
@router.delete("/{dataset_id}", status_code=204)
async def delete_dataset(dataset_id: str, user=Depends(require_auth)):
    """Delete a dataset: remove its file from MinIO, then its DB row.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT file_key FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    minio_client.remove(row["file_key"], bucket=BUCKET)
    await db.execute("DELETE FROM datasets WHERE id = $1", ds_uuid)
    return None

131
ml/routers/models.py Normal file
View File

@@ -0,0 +1,131 @@
"""API /api/models — registro modelli (repo Gitea + metadata)."""
from __future__ import annotations
import uuid
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException
from core import db
from core.auth import require_auth
from core.model_spec import fetch_and_parse_spec
router = APIRouter(prefix="/api/models", tags=["models"])
def _row(r) -> Optional[dict]:
if r is None:
return None
d = dict(r)
for k in ("created_at", "updated_at"):
if d.get(k) is not None and hasattr(d[k], "isoformat"):
d[k] = d[k].isoformat()
return d
@router.get("")
async def list_models(user=Depends(require_auth)):
    """List up to 500 registered models, newest first."""
    query = "SELECT * FROM models ORDER BY created_at DESC LIMIT 500"
    records = await db.fetch(query)
    models = [_row(rec) for rec in records]
    return {"count": len(records), "models": models}
@router.post("", status_code=201)
async def create_model(body: dict, user=Depends(require_auth)):
    """Register a model backed by a Gitea repository.

    ``name``, ``type`` and ``gitea_repo`` are required; ``default_branch``
    defaults to ``"main"``. The model spec is pre-loaded from that branch when
    possible; any failure there leaves the stored spec empty.

    Raises:
        HTTPException: 400 naming the first missing required field.
    """
    missing = next(
        (field for field in ("name", "type", "gitea_repo") if not body.get(field)),
        None,
    )
    if missing is not None:
        raise HTTPException(400, f"missing field: {missing}")

    branch = body.get("default_branch") or "main"

    # Try to pre-load model.yml from the default branch (non-fatal).
    try:
        spec = await fetch_and_parse_spec(body["gitea_repo"], branch)
    except Exception:
        spec = None

    author = user.get("username") or "unknown"
    row = await db.fetchrow(
        """
        INSERT INTO models (name, type, gitea_repo, default_branch, spec, created_by)
        VALUES ($1,$2,$3,$4,$5,$6)
        RETURNING *
        """,
        body["name"],
        body["type"],
        body["gitea_repo"],
        branch,
        spec,
        author,
    )
    return _row(row)
@router.get("/{model_id}")
async def get_model(model_id: str, user=Depends(require_auth)):
    """Return one model row by id.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
@router.patch("/{model_id}")
async def patch_model(model_id: str, body: dict, user=Depends(require_auth)):
    """Update the editable fields of a model and return the updated row.

    Only ``name``, ``type`` and ``default_branch`` are accepted; other keys in
    ``body`` are ignored.

    Raises:
        HTTPException: 400 when no editable field is supplied; 404 when the id
            is malformed or no row matches.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None

    allowed = {"name", "type", "default_branch"}
    sets = []
    args: list = []
    for k, v in body.items():
        # Column names come only from the allow-list; values stay parameters.
        if k in allowed:
            args.append(v)
            sets.append(f"{k} = ${len(args)}")
    if not sets:
        raise HTTPException(400, "no fields to update")
    args.append(model_uuid)
    row = await db.fetchrow(
        f"UPDATE models SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
        *args,
    )
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
@router.delete("/{model_id}", status_code=204)
async def delete_model(model_id: str, user=Depends(require_auth)):
    """Delete a model row by id. Deleting an unknown (well-formed) id is a no-op.

    Raises:
        HTTPException: 404 when the id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id previously escaped as ValueError and produced a 500.
        raise HTTPException(404, "not found") from None
    await db.execute("DELETE FROM models WHERE id = $1", model_uuid)
    return None
# ── Notes ──────────────────────────────────────────────────────────────────
@router.get("/{model_id}/notes")
async def list_notes(model_id: str, user=Depends(require_auth)):
    """Return the notes attached to a model, newest first.

    Raises:
        HTTPException: 404 when the id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id previously escaped as ValueError and produced a 500.
        raise HTTPException(404, "not found") from None
    rows = await db.fetch(
        "SELECT id, author, text, created_at FROM model_notes WHERE model_id = $1 ORDER BY created_at DESC",
        model_uuid,
    )
    return [
        {
            "id": str(r["id"]),
            "author": r["author"],
            "text": r["text"],
            "created_at": r["created_at"].isoformat(),
        }
        for r in rows
    ]
@router.post("/{model_id}/notes", status_code=201)
async def add_note(model_id: str, body: dict, user=Depends(require_auth)):
    """Attach a text note, authored by the caller, to a model.

    Raises:
        HTTPException: 400 when ``text`` is missing, blank or not a string;
            404 when the id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id previously escaped as ValueError and produced a 500.
        raise HTTPException(404, "not found") from None
    raw_text = body.get("text")
    # Non-string payloads (numbers, lists) have no .strip(); treat them as empty.
    text = raw_text.strip() if isinstance(raw_text, str) else ""
    if not text:
        raise HTTPException(400, "text required")
    row = await db.fetchrow(
        "INSERT INTO model_notes (model_id, author, text) VALUES ($1, $2, $3) RETURNING *",
        model_uuid,
        user.get("username") or "unknown",
        text,
    )
    return {
        "id": str(row["id"]),
        "author": row["author"],
        "text": row["text"],
        "created_at": row["created_at"].isoformat(),
    }

75
ml/routers/pages.py Normal file
View File

@@ -0,0 +1,75 @@
"""Pagine HTML servite direttamente da ml.mebboat.it.
Layout:
/ redirect a /datasets (o landing console)
/datasets lista/upload dataset
/models registro modelli
/train avvia training
/test esegue test su modello trainato
/results storico e confronto risultati
"""
from __future__ import annotations
from pathlib import Path
from fastapi import APIRouter, Depends, Request
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates
from core.auth import _verify
from core.config import settings
router = APIRouter(tags=["pages"])
TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
def _user_or_redirect(request: Request):
    """Authenticate a page request, or build a redirect to the login page.

    The token is read from the ``auth_token`` cookie, falling back to an
    ``Authorization: Bearer`` header.

    Returns:
        The user object from ``_verify`` when the token is accepted, otherwise
        a 302 ``RedirectResponse`` to ``settings.auth_login_url`` carrying the
        current URL in the ``redirect`` query parameter.
    """
    from urllib.parse import quote

    token = request.cookies.get("auth_token")
    auth = request.headers.get("authorization")
    if not token and auth and auth.startswith("Bearer "):
        token = auth[7:]
    user = _verify(token)
    if not user:
        # Percent-encode the target: a raw URL containing "?" or "&" would be
        # split into separate query parameters of the login URL.
        target = quote(str(request.url), safe="")
        return RedirectResponse(url=f"{settings.auth_login_url}?redirect={target}", status_code=302)
    return user
def _render(request: Request, template: str, **ctx):
    """Render ``template`` for an authenticated user, else return the login redirect.

    The template context always contains ``request`` and ``user``; extra
    keyword arguments are merged on top.
    """
    outcome = _user_or_redirect(request)
    if not isinstance(outcome, RedirectResponse):
        context = {"request": request, "user": outcome, **ctx}
        return templates.TemplateResponse(template, context)
    return outcome
@router.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Redirect the site root to the datasets page."""
    landing = "/datasets"
    return RedirectResponse(url=landing)
@router.get("/datasets", response_class=HTMLResponse)
async def page_datasets(request: Request):
    """Serve the datasets page (login redirect when unauthenticated)."""
    page_name = "datasets"
    return _render(request, f"{page_name}.html", page=page_name)
@router.get("/models", response_class=HTMLResponse)
async def page_models(request: Request):
    """Serve the models page (login redirect when unauthenticated)."""
    page_name = "models"
    return _render(request, f"{page_name}.html", page=page_name)
@router.get("/train", response_class=HTMLResponse)
async def page_train(request: Request):
    """Serve the training page (login redirect when unauthenticated)."""
    page_name = "train"
    return _render(request, f"{page_name}.html", page=page_name)
@router.get("/test", response_class=HTMLResponse)
async def page_test(request: Request):
    """Serve the test page (login redirect when unauthenticated)."""
    page_name = "test"
    return _render(request, f"{page_name}.html", page=page_name)
@router.get("/results", response_class=HTMLResponse)
async def page_results(request: Request):
    """Serve the results page (login redirect when unauthenticated)."""
    page_name = "results"
    return _render(request, f"{page_name}.html", page=page_name)

51
ml/routers/repos.py Normal file
View File

@@ -0,0 +1,51 @@
"""API /api/repos — proxy autenticato verso Gitea."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query
from core import gitea
from core.auth import require_auth
from core.model_spec import fetch_and_parse_spec
router = APIRouter(prefix="/api/repos", tags=["repos"])
@router.get("")
async def list_repos(user=Depends(require_auth)):
    """Return the repository list from Gitea; any Gitea failure becomes a 502."""
    try:
        repos = await gitea.list_repos()
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return repos
@router.get("/{owner}/{repo}/branches")
async def branches(owner: str, repo: str, user=Depends(require_auth)):
    """Return the branches of ``owner/repo``; any Gitea failure becomes a 502."""
    full_name = f"{owner}/{repo}"
    try:
        found = await gitea.list_branches(full_name)
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return found
@router.get("/{owner}/{repo}/commits")
async def commits(owner: str, repo: str, branch: str = Query("main"), user=Depends(require_auth)):
    """Return the commits of ``branch`` in ``owner/repo``; Gitea failures become a 502."""
    full_name = f"{owner}/{repo}"
    try:
        history = await gitea.list_commits(full_name, branch)
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return history
@router.get("/{owner}/{repo}/file")
async def file_raw(owner: str, repo: str, ref: str, path: str, user=Depends(require_auth)):
    """Return a repository file as text plus its size in bytes.

    The content is decoded as UTF-8 with undecodable bytes replaced. Any
    failure while fetching or decoding is reported as a 404.
    """
    full_name = f"{owner}/{repo}"
    try:
        blob = await gitea.get_file_raw(full_name, ref, path)
        payload = {
            "content": blob.decode("utf-8", errors="replace"),
            "size": len(blob),
        }
    except Exception as e:
        raise HTTPException(404, f"file not found: {e}")
    return payload
@router.get("/{owner}/{repo}/spec")
async def spec(owner: str, repo: str, ref: str = Query("main"), user=Depends(require_auth)):
    """Return the parsed model spec of ``owner/repo`` at ``ref``.

    Raises:
        HTTPException: 404 when no spec file is found at the ref; 422 when the
            file exists but is not a valid model spec.
    """
    try:
        s = await fetch_and_parse_spec(f"{owner}/{repo}", ref)
    except ValueError as e:
        # fetch_and_parse_spec lets the parser's ValueError through for an
        # invalid file; report it to the client instead of failing with a 500.
        raise HTTPException(422, str(e)) from e
    if s is None:
        raise HTTPException(404, "model.yml not found at ref")
    return s

89
ml/routers/results.py Normal file
View File

@@ -0,0 +1,89 @@
"""API /api/results — lista trainings/tests + compare multi-training."""
from __future__ import annotations
import uuid
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from core import db, influx_client
from core.auth import require_auth
from core.config import settings
router = APIRouter(prefix="/api/results", tags=["results"])
def _row(r):
if r is None:
return None
d = dict(r)
for k in ("queued_at", "started_at", "finished_at", "started_at", "ended_at"):
if d.get(k) is not None and hasattr(d[k], "isoformat"):
d[k] = d[k].isoformat()
return d
@router.get("")
async def list_results(
    model_id: Optional[str] = Query(None),
    user=Depends(require_auth),
):
    """List up to 200 trainings, most recently finished first.

    Args:
        model_id: Optional model UUID to restrict the listing to.

    Raises:
        HTTPException: 400 when ``model_id`` is not a valid UUID.
    """
    where = []
    args: list = []
    if model_id:
        try:
            args.append(uuid.UUID(model_id))
        except ValueError:
            # Previously escaped as ValueError and produced a 500.
            raise HTTPException(400, "invalid model_id") from None
        where.append(f"model_id = ${len(args)}")
    sql = "SELECT * FROM trainings"
    if where:
        sql += " WHERE " + " AND ".join(where)
    sql += " ORDER BY finished_at DESC NULLS LAST, queued_at DESC LIMIT 200"
    rows = await db.fetch(sql, *args)
    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
async def _training_timeseries(training_uuid):
    """Query Influx for the ``ml_training`` points of one training (last 90 days).

    The canonical UUID string is interpolated into the Flux filter. Any query
    failure yields an empty list.
    """
    flux = (
        f'from(bucket:"{settings.influx_bucket}") '
        f'|> range(start:-90d) '
        f'|> filter(fn: (r) => r._measurement == "ml_training" and r.training_id == "{training_uuid}")'
    )
    try:
        return await influx_client.query_flux(flux)
    except Exception:
        return []


# NOTE: "/compare" must be registered BEFORE "/{training_id}". Routes are
# matched in declaration order, so otherwise "compare" is captured as a
# training id and the endpoint is unreachable.
@router.get("/compare")
async def compare(
    trainings: str = Query(..., description="comma-separated training IDs"),
    user=Depends(require_auth),
):
    """Return row + timeseries for each requested training, for side-by-side comparison.

    Ids that are malformed or unknown are silently skipped.

    Raises:
        HTTPException: 400 when fewer than two ids are supplied.
    """
    ids = [s.strip() for s in trainings.split(",") if s.strip()]
    if len(ids) < 2:
        raise HTTPException(400, "at least 2 training IDs required")
    out = []
    for tid in ids:
        try:
            tid_uuid = uuid.UUID(tid)
        except ValueError:
            continue
        row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", tid_uuid)
        if not row:
            continue
        ts = await _training_timeseries(tid_uuid)
        out.append({"training": _row(row), "timeseries": ts})
    return {"results": out}


@router.get("/{training_id}")
async def get_result(training_id: str, user=Depends(require_auth)):
    """Return one training row together with its Influx timeseries.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")
    ts = await _training_timeseries(training_uuid)
    return {"training": _row(row), "timeseries": ts}

109
ml/routers/tests.py Normal file
View File

@@ -0,0 +1,109 @@
"""API /api/tests — sessioni di test su training esistente (max 2 utenti simultanei)."""
from __future__ import annotations
import json
import time
import uuid
from typing import Optional
import httpx
from fastapi import APIRouter, Depends, HTTPException
from core import api_client, db, minio_client
from core.auth import require_auth
from core.docker_runner import run_test_once
router = APIRouter(prefix="/api/tests", tags=["tests"])
def _row(r):
if r is None:
return None
d = dict(r)
for k in ("started_at", "ended_at"):
if d.get(k) is not None and hasattr(d[k], "isoformat"):
d[k] = d[k].isoformat()
return d
@router.post("/sessions", status_code=201)
async def start_session(body: dict, user=Depends(require_auth)):
    """Open a test session on a successfully completed training.

    A slot is reserved through ``api_client.page_connect`` before the session
    row is inserted.

    Raises:
        HTTPException: 400 when ``training_id`` is missing or not a valid UUID;
            404 when the training does not exist; 409 when it has not
            succeeded; 429 when all test slots are taken; 502 on other
            upstream HTTP errors.
    """
    training_id = body.get("training_id")
    if not training_id:
        raise HTTPException(400, "training_id required")
    try:
        training_uuid = uuid.UUID(training_id)
    except (ValueError, TypeError, AttributeError):
        # uuid.UUID raises ValueError for bad strings and TypeError /
        # AttributeError for non-string JSON values; all used to become a 500.
        raise HTTPException(400, "invalid training_id") from None
    tr = await db.fetchrow(
        "SELECT id, status FROM trainings WHERE id = $1", training_uuid
    )
    if not tr:
        raise HTTPException(404, "training not found")
    if tr["status"] != "succeeded":
        raise HTTPException(409, "training not completed")

    sid = str(uuid.uuid4())
    username = user.get("username") or "unknown"
    try:
        await api_client.page_connect("test", username, sid)
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 429:
            raise HTTPException(429, "test slots full (max 2 users)")
        raise HTTPException(502, f"api: {e}")

    try:
        row = await db.fetchrow(
            "INSERT INTO tests (id, training_id, user_id) VALUES ($1,$2,$3) RETURNING *",
            uuid.UUID(sid),
            training_uuid,
            username,
        )
    except Exception:
        # The slot is already reserved upstream; release it (best effort) so a
        # failed insert does not keep one of the limited slots occupied.
        try:
            await api_client.page_disconnect(sid)
        except Exception:
            pass
        raise
    return _row(row)
@router.post("/sessions/{session_id}/ping")
async def ping_session(session_id: str, user=Depends(require_auth)):
    """Forward a keep-alive ping for the session to the api-service.

    An upstream HTTP error is relayed with its original status code and body.
    """
    try:
        await api_client.page_ping(session_id)
    except httpx.HTTPStatusError as e:
        upstream = e.response
        raise HTTPException(upstream.status_code, upstream.text)
    return {"ok": True}
@router.post("/sessions/{session_id}/runs", status_code=201)
async def run_test(session_id: str, body: dict, user=Depends(require_auth)):
    """Execute one test run in a session and append it to the session's ``runs``.

    Args:
        session_id: UUID of an existing test session.
        body: JSON object; its ``inputs`` entry (default ``{}``) is passed to the runner.

    Returns:
        The recorded run: inputs, outputs, wall-clock duration in ms, resource
        peaks reported by the runner, and a Unix timestamp.

    Raises:
        HTTPException: 404 when the session id is malformed or unknown;
            500 when the runner fails.
    """
    try:
        session_uuid = uuid.UUID(session_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a bare 500.
        raise HTTPException(404, "session not found") from None
    row = await db.fetchrow("SELECT * FROM tests WHERE id = $1", session_uuid)
    if not row:
        raise HTTPException(404, "session not found")
    inputs = body.get("inputs") or {}
    t0 = time.monotonic()
    try:
        result = await run_test_once(str(row["training_id"]), inputs)
    except Exception as e:
        raise HTTPException(500, f"test run failed: {e}")
    dt_ms = int((time.monotonic() - t0) * 1000)
    run = {
        "inputs": inputs,
        "outputs": result.get("outputs", {}),
        "duration_ms": dt_ms,
        "cpu_peak": result.get("cpu_peak"),
        "mem_peak_mb": result.get("mem_peak_mb"),
        "ts": time.time(),
    }
    # Append as a one-element JSON array so ``||`` concatenates onto ``runs``.
    await db.execute(
        "UPDATE tests SET runs = runs || $1::jsonb WHERE id = $2",
        json.dumps([run]),
        session_uuid,
    )
    return run
@router.delete("/sessions/{session_id}", status_code=204)
async def end_session(session_id: str, user=Depends(require_auth)):
    """Mark a test session as ended and release its slot upstream.

    The upstream disconnect is best effort: its failures are ignored.

    Raises:
        HTTPException: 404 when the session id is not a valid UUID.
    """
    try:
        session_uuid = uuid.UUID(session_id)
    except ValueError:
        # A malformed id previously escaped as ValueError and produced a 500.
        raise HTTPException(404, "session not found") from None
    await db.execute(
        "UPDATE tests SET ended_at = NOW() WHERE id = $1 AND ended_at IS NULL",
        session_uuid,
    )
    try:
        await api_client.page_disconnect(session_id)
    except Exception:
        pass
    return None

129
ml/routers/trainings.py Normal file
View File

@@ -0,0 +1,129 @@
"""API /api/trainings — enqueue, list, get, artifacts."""
from __future__ import annotations
import json
import uuid
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from core import db, minio_client, redis_client, api_client
from core.auth import require_auth
router = APIRouter(prefix="/api/trainings", tags=["trainings"])
def _row(r) -> Optional[dict]:
if r is None:
return None
d = dict(r)
for k in ("queued_at", "started_at", "finished_at"):
if d.get(k) is not None and hasattr(d[k], "isoformat"):
d[k] = d[k].isoformat()
return d
@router.get("")
async def list_trainings(
    model_id: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
    # ge=0: a negative LIMIT is rejected by the database and used to surface as a 500.
    limit: int = Query(100, ge=0, le=500),
    user=Depends(require_auth),
):
    """List trainings, most recently queued first.

    Args:
        model_id: Optional model UUID filter.
        status: Optional exact status filter.
        limit: Maximum number of rows (0-500, default 100).

    Raises:
        HTTPException: 400 when ``model_id`` is not a valid UUID.
    """
    where = []
    args: list = []
    if model_id:
        try:
            args.append(uuid.UUID(model_id))
        except ValueError:
            # Previously escaped as ValueError and produced a 500.
            raise HTTPException(400, "invalid model_id") from None
        where.append(f"model_id = ${len(args)}")
    if status:
        args.append(status)
        where.append(f"status = ${len(args)}")
    sql = "SELECT * FROM trainings"
    if where:
        sql += " WHERE " + " AND ".join(where)
    args.append(limit)
    sql += f" ORDER BY queued_at DESC LIMIT ${len(args)}"
    rows = await db.fetch(sql, *args)
    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
@router.post("", status_code=202)
async def enqueue_training(body: dict, user=Depends(require_auth)):
    """Create a training row and enqueue it for the local workers.

    Required body fields: ``model_id``, ``version``, ``patch``, ``dataset_id``.
    The job is also registered with the api-service; failure there is logged
    and does not stop the enqueue.

    Returns:
        The inserted training row (status ``queued``).

    Raises:
        HTTPException: 400 for a missing field or malformed UUID; 404 when the
            model or dataset does not exist; 409 when the insert fails.
    """
    for k in ("model_id", "version", "patch", "dataset_id"):
        value = body.get(k)
        # Treat None/"" as missing, but accept a numeric 0.
        # NOTE(review): assumes version/patch may be sent as numbers — confirm.
        if not value and value != 0:
            raise HTTPException(400, f"missing field: {k}")

    try:
        model_uuid = uuid.UUID(body["model_id"])
        dataset_uuid = uuid.UUID(body["dataset_id"])
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string ids used to escape and produce a 500.
        raise HTTPException(400, "model_id and dataset_id must be valid UUIDs") from None

    model_row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_uuid)
    if not model_row:
        raise HTTPException(404, "model not found")
    ds_row = await db.fetchrow("SELECT id FROM datasets WHERE id = $1", dataset_uuid)
    if not ds_row:
        raise HTTPException(404, "dataset not found")

    started_by = user.get("username") or "unknown"
    try:
        training_row = await db.fetchrow(
            """
            INSERT INTO trainings (model_id, version, patch, dataset_id, started_by, status)
            VALUES ($1,$2,$3,$4,$5,'queued')
            RETURNING *
            """,
            model_uuid,
            body["version"],
            body["patch"],
            dataset_uuid,
            started_by,
        )
    except Exception as e:
        raise HTTPException(409, f"training already exists or invalid: {e}")
    training_id = str(training_row["id"])

    # Register the job with the api-service (cross-service registry).
    try:
        await api_client.create_job(
            "train",
            created_by=started_by,
            payload={
                "training_id": training_id,
                "model_id": body["model_id"],
                "version": body["version"],
                "patch": body["patch"],
                "dataset_id": body["dataset_id"],
            },
        )
    except Exception as e:
        # Non-fatal: the local worker can still proceed; log and continue.
        import logging
        logging.warning("create_job failed: %s", e)

    # Initialise the status hash BEFORE publishing the job: once the id is on
    # the queue a worker may pick it up immediately, and writing "queued"
    # afterwards could overwrite whatever state the runner has already set.
    r = redis_client.client()
    status_key = f"ml:train:{training_id}"
    await r.hset(
        status_key,
        mapping={"status": "queued", "progress": "0", "message": "queued"},
    )
    await r.expire(status_key, 48 * 3600)
    await r.lpush("ml:queue:train", training_id)
    return _row(training_row)
@router.get("/{training_id}")
async def get_training(training_id: str, user=Depends(require_auth)):
    """Return one training row by id.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
@router.get("/{training_id}/artifacts")
async def list_artifacts(training_id: str, user=Depends(require_auth)):
    """List a training's artifacts, each with a presigned URL valid for one hour.

    Raises:
        HTTPException: 404 when the id is malformed, the training is unknown,
            or it has no artifacts prefix.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "no artifacts") from None
    row = await db.fetchrow(
        "SELECT artifacts_prefix FROM trainings WHERE id = $1", training_uuid
    )
    if not row or not row["artifacts_prefix"]:
        raise HTTPException(404, "no artifacts")
    objs = minio_client.list_prefix(row["artifacts_prefix"] + "/")
    for o in objs:
        o["url"] = minio_client.presigned_get(o["name"], 3600)
    return objs

View File

@@ -0,0 +1,64 @@
"""SSE endpoint per live progress del training.
GET /api/trainings/{id}/events
Streamma eventi dal Redis stream `ml:train:{id}:events` via Server-Sent Events.
Termina quando lo stato del training è terminale (succeeded/failed/cancelled).
"""
from __future__ import annotations
import asyncio
import json
import uuid
from fastapi import APIRouter, Depends, HTTPException
from sse_starlette.sse import EventSourceResponse
from core import db, redis_client
from core.auth import require_auth
router = APIRouter(prefix="/api/trainings", tags=["trainings-sse"])
_TERMINAL = {"succeeded", "failed", "cancelled"}
@router.get("/{training_id}/events")
async def training_events(training_id: str, user=Depends(require_auth)):
    """Stream a training's progress events as Server-Sent Events.

    Events are read from the Redis stream ``ml:train:{id}:events`` starting at
    the beginning. After each read the training status is checked (Redis hash
    first, DB as fallback); on a terminal status an ``end`` event is sent and
    the stream closes.

    Raises:
        HTTPException: 404 when the id is malformed or the training is unknown.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # A malformed id cannot match any row; answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    # Verify the training exists before opening the stream.
    row = await db.fetchrow("SELECT status FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")

    stream_key = f"ml:train:{training_id}:events"
    status_key = f"ml:train:{training_id}"

    async def gen():
        last_id = "0-0"
        r = redis_client.client()
        while True:
            try:
                # XREAD blocks at most 5s so the status check below runs regularly.
                resp = await r.xread({stream_key: last_id}, count=50, block=5000)
            except Exception as e:
                yield {"event": "error", "data": json.dumps({"error": str(e)})}
                await asyncio.sleep(1)
                continue
            if resp:
                for _stream, entries in resp:
                    for entry_id, fields in entries:
                        last_id = entry_id
                        yield {"event": "message", "id": entry_id, "data": json.dumps(fields)}
            # Check for a terminal state.
            state = await r.hget(status_key, "status")
            if not state:
                # Fall back to the DB when the Redis hash has expired.
                db_row = await db.fetchrow(
                    "SELECT status FROM trainings WHERE id = $1", training_uuid
                )
                state = db_row["status"] if db_row else "unknown"
            if state in _TERMINAL:
                yield {"event": "end", "data": json.dumps({"status": state})}
                return

    return EventSourceResponse(gen())

18
ml/runner/Dockerfile Normal file
View File

@@ -0,0 +1,18 @@
# Runner image: the container in which user training/test code is executed.
FROM python:3.11-slim
# System packages: git plus a compiler toolchain; apt lists are removed
# afterwards to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
build-essential \
&& rm -rf /var/lib/apt/lists/*
# Python libraries preinstalled for user code.
RUN pip install --no-cache-dir \
numpy pandas scikit-learn \
xgboost \
matplotlib \
pyyaml
# Install the SDK under the module name `meb_ml` and make it importable.
COPY sdk.py /opt/meb/meb_ml.py
ENV PYTHONPATH=/opt/meb
WORKDIR /workdir
# Default command only.
CMD ["bash"]

80
ml/runner/sdk.py Normal file
View File

@@ -0,0 +1,80 @@
"""meb_ml — SDK importabile dal codice utente dentro il container runner.
API:
from meb_ml import emit_metric, emit_series, emit_matrix, emit_log, save_artifact
emit_metric(iter=10, loss=0.23)
emit_series("roc_curve", x=fpr, y=tpr, kind="line")
emit_matrix("confusion", labels=[...], values=[[...],[...]])
emit_log("info", "epoch done")
Scrive righe JSON su stdout; il parent (ml-service) le inoltra su Redis/Influx.
Per risultati finali scrivere `out/metrics.json` con:
{"metrics": {...}, "plots": {"loss_curve": {"x": [...], "y": [...]}, ...}}
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
from typing import Any, Iterable, Sequence
def _print(obj: dict) -> None:
sys.stdout.write(json.dumps(obj, default=float) + "\n")
sys.stdout.flush()
def emit_metric(**fields: Any) -> None:
    """Emit a ``metric`` record carrying the given keyword fields.

    NOTE: a keyword named ``type`` overrides the record's ``"type"`` marker.
    """
    record = {"type": "metric"}
    record.update(fields)
    _print(record)
def emit_series(name: str, x: Sequence, y: Sequence, kind: str = "line") -> None:
    """Emit a named ``series`` record; ``x`` and ``y`` are materialised as lists."""
    xs = list(x)
    ys = list(y)
    record = {"type": "series", "name": name, "kind": kind, "x": xs, "y": ys}
    _print(record)
def emit_matrix(name: str, labels: Sequence, values: Sequence[Sequence]) -> None:
    """Emit a named ``matrix`` record; labels and every row are copied into lists."""
    label_list = list(labels)
    rows = []
    for row in values:
        rows.append(list(row))
    _print({"type": "matrix", "name": name, "labels": label_list, "values": rows})
def emit_log(level: str, message: str) -> None:
    """Emit a ``log`` record with the given level and message."""
    record = dict(type="log", level=level, message=message)
    _print(record)
def save_artifact(path: str) -> str:
    """Copy ``path`` into the artifacts directory and return the destination path.

    The directory is taken from ``MEB_ARTIFACTS_DIR`` (default ``/workdir/out``)
    and created if missing. The copy keeps the source file name.

    Args:
        path: File to copy.

    Returns:
        The destination path as a string.
    """
    import shutil

    dest_dir = Path(os.environ.get("MEB_ARTIFACTS_DIR", "/workdir/out"))
    dest_dir.mkdir(parents=True, exist_ok=True)
    src = Path(path)
    dest = dest_dir / src.name
    # File already lives in the artifacts directory: nothing to copy
    # (shutil.copyfile would raise SameFileError).
    if src.resolve() == dest.resolve():
        return str(dest)
    # Stream the copy instead of loading the whole file into memory; model
    # artifacts can be large.
    shutil.copyfile(src, dest)
    return str(dest)
def dataset_path() -> str:
    """Return the dataset location from ``MEB_DATASET_PATH`` (KeyError if unset)."""
    env = os.environ
    return env["MEB_DATASET_PATH"]
def artifacts_dir() -> str:
    """Return the artifacts directory: ``MEB_ARTIFACTS_DIR`` or ``/workdir/out``."""
    fallback = "/workdir/out"
    return os.getenv("MEB_ARTIFACTS_DIR", fallback)
def read_test_input() -> dict:
    """Read one line from stdin and decode it as JSON (for test scripts)."""
    line = sys.stdin.readline()
    return json.loads(line)
def write_test_output(outputs: dict) -> None:
    """Emit a ``result`` record wrapping the test outputs."""
    record = dict(type="result", outputs=outputs)
    _print(record)

146
ml/static/styles/ml.css Normal file
View File

@@ -0,0 +1,146 @@
.ml-nav {
display: flex;
gap: 16px;
align-items: center;
}
.ml-nav a {
text-decoration: none;
color: var(--text-secondary);
font-weight: 600;
padding: 8px 12px;
border-radius: var(--radius-md);
transition: all 0.2s ease;
}
.ml-nav a:hover { background: var(--accent-light); color: var(--accent-color); }
.ml-nav a.active { background: var(--accent-light); color: var(--accent-color); }
.container {
max-width: 1200px;
margin: 24px auto;
padding: 0 24px;
}
.page-head {
display: flex;
align-items: center;
justify-content: space-between;
margin-bottom: 20px;
}
.page-head h2 { font-size: 1.5rem; }
.list {
display: flex;
flex-direction: column;
gap: 8px;
}
.list .item {
display: flex;
align-items: center;
justify-content: space-between;
padding: 12px 16px;
border: 1px solid var(--header-border);
border-radius: var(--radius-lg);
background: #fff;
transition: box-shadow 0.12s ease;
}
.list .item:hover { box-shadow: var(--shadow-md); }
.list .meta { color: var(--text-secondary); font-size: 0.85rem; }
.form-row {
display: flex;
flex-wrap: wrap;
gap: 12px;
align-items: flex-end;
margin-bottom: 20px;
}
.form-row label {
display: flex;
flex-direction: column;
gap: 4px;
font-size: 0.85rem;
color: var(--text-secondary);
}
.form-row input, .form-row select, .form-row textarea {
padding: 8px 12px;
border: 1px solid var(--header-border);
border-radius: var(--radius-md);
font-family: inherit;
}
.hidden { display: none !important; }
.queue-info {
font-size: 0.9rem;
color: var(--text-secondary);
padding: 6px 12px;
background: var(--accent-light);
border-radius: var(--radius-md);
}
.charts {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 16px;
margin: 16px 0;
}
.logs {
background: #0f172a;
color: #cbd5e1;
padding: 12px;
border-radius: var(--radius-md);
font-family: ui-monospace, monospace;
font-size: 0.8rem;
max-height: 320px;
overflow: auto;
white-space: pre-wrap;
}
.detail {
border: 1px solid var(--header-border);
border-radius: var(--radius-lg);
padding: 16px;
margin-top: 16px;
background: #fff;
position: relative;
}
.detail #btn-close-detail {
position: absolute;
top: 8px;
right: 8px;
padding: 4px 10px;
}
dialog {
border: 1px solid var(--header-border);
border-radius: var(--radius-lg);
padding: 24px;
width: min(500px, 90vw);
}
dialog form { display: flex; flex-direction: column; gap: 12px; }
dialog label { display: flex; flex-direction: column; gap: 4px; font-size: 0.85rem; }
dialog menu { display: flex; justify-content: flex-end; gap: 8px; margin-top: 16px; padding: 0; }
table {
width: 100%;
border-collapse: collapse;
margin-top: 12px;
}
th, td { padding: 8px 12px; border-bottom: 1px solid var(--header-border); text-align: left; font-size: 0.9rem; }
code {
font-family: ui-monospace, monospace;
background: #f1f5f9;
padding: 2px 6px;
border-radius: 4px;
font-size: 0.85em;
}
pre {
background: #f8fafc;
padding: 12px;
border-radius: var(--radius-md);
overflow: auto;
font-family: ui-monospace, monospace;
font-size: 0.8rem;
}

33
ml/templates/_layout.html Normal file
View File

@@ -0,0 +1,33 @@
<!DOCTYPE html>
{# Base layout for the ML pages.
   Reads `page` (compared against the section slugs below) and `user.username`
   from the render context. Child templates fill the `title`, `content` and
   `scripts` blocks. #}
<html lang="it">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>ML — {% block title %}{{ page|capitalize }}{% endblock %}</title>
  <link href="/static/styles/style.css" rel="stylesheet">
  <link href="/static/styles/ml.css" rel="stylesheet">
</head>
<body>
  <div class="header">
    <h1>Modelli ML</h1>
    {# The current section gets both the visual `active` class and
       aria-current="page", so assistive technology can identify it too. #}
    <nav class="ml-nav">
      <a href="/datasets"{% if page == 'datasets' %} class="active" aria-current="page"{% endif %}>Datasets</a>
      <a href="/models"{% if page == 'models' %} class="active" aria-current="page"{% endif %}>Modelli</a>
      <a href="/train"{% if page == 'train' %} class="active" aria-current="page"{% endif %}>Train</a>
      <a href="/test"{% if page == 'test' %} class="active" aria-current="page"{% endif %}>Test</a>
      <a href="/results"{% if page == 'results' %} class="active" aria-current="page"{% endif %}>Results</a>
    </nav>
    <div class="profile">
      <p id="username">{{ user.username }}</p>
      <button type="button" id="logout-btn">Logout</button>
    </div>
  </div>
  <div class="container">
    {% block content %}{% endblock %}
  </div>
  {# common.js is loaded on every page; page-specific scripts follow it. #}
  <script src="/static/js/common.js"></script>
  {% block scripts %}{% endblock %}
</body>
</html>

View File

@@ -0,0 +1,39 @@
{% extends "_layout.html" %}
{% block title %}Datasets{% endblock %}
{% block content %}
  {# Page heading plus the upload trigger. Event handlers are not defined in
     this template; presumably they live in datasets.js. #}
  <div class="page-head">
    <h2>Datasets</h2>
    <button id="btn-upload" class="prominent">+ Carica CSV</button>
  </div>

  {# Empty placeholder for the dataset list. #}
  <div class="list" id="datasets-list"></div>

  {# Upload dialog: metadata fields plus the file itself. #}
  <dialog id="upload-dlg">
    <form method="dialog" id="upload-form">
      <h3>Carica dataset</h3>
      <label>Nome<input name="nome" type="text" required></label>
      <label>Tipo
        <select name="dataset_type">
          <option value="custom">custom</option>
          <option value="imported">imported</option>
        </select>
      </label>
      <label>Formato
        <select name="format">
          <option value="csv">csv</option>
          <option value="json">json</option>
        </select>
      </label>
      <label>Tags (virgola)<input name="tags" type="text"></label>
      <label>Descrizione<textarea name="description"></textarea></label>
      <label>File<input name="file" type="file" required></label>
      <menu>
        <button id="upload-cancel" type="button">Annulla</button>
        <button class="prominent" type="submit">Carica</button>
      </menu>
    </form>
  </dialog>
{% endblock %}
{% block scripts %}
  <script src="/static/js/datasets.js"></script>
{% endblock %}

57
ml/templates/models.html Normal file
View File

@@ -0,0 +1,57 @@
{% extends "_layout.html" %}
{% block title %}Modelli{% endblock %}
{% block content %}
  <div class="page-head">
    <h2>Modelli</h2>
    <button class="prominent" id="btn-add-model">+ Aggiungi modello</button>
  </div>

  {# Empty placeholder for the model list; presumably filled by models.js. #}
  <div id="models-list" class="list"></div>

  {# Detail pane for one model; starts hidden. #}
  <div id="model-detail" class="detail hidden">
    {# Icon-only button: the aria-label gives it an accessible name, otherwise
       a screen reader would only announce the "×" glyph. #}
    <button type="button" id="btn-close-detail" aria-label="Chiudi">×</button>
    <h3 id="md-name"></h3>
    <p id="md-meta"></p>
    <section>
      <h4>Branch / Commits</h4>
      <select id="md-branch"></select>
      <ul id="md-commits"></ul>
    </section>
    <section>
      <h4>model.yml</h4>
      <pre id="md-spec"></pre>
    </section>
    <section>
      <h4>Note</h4>
      <ul id="md-notes"></ul>
      <form id="md-note-form">
        {# A placeholder is not an accessible name, so the field is labelled explicitly. #}
        <textarea name="text" placeholder="Nuova nota" aria-label="Nuova nota"></textarea>
        <button type="submit" class="prominent">Aggiungi</button>
      </form>
    </section>
  </div>

  {# Dialog for registering a new model. #}
  <dialog id="add-model-dlg">
    <form id="add-model-form" method="dialog">
      <h3>Nuovo modello</h3>
      <label>Nome<input type="text" name="name" required></label>
      <label>Tipo
        <select name="type">
          <option>xgboost</option>
          <option>lstm</option>
          <option>sklearn</option>
          <option>other</option>
        </select>
      </label>
      <label>Repo Gitea (owner/repo)<input type="text" name="gitea_repo" required></label>
      <label>Branch<input type="text" name="default_branch" value="main"></label>
      <menu>
        <button type="button" id="add-model-cancel">Annulla</button>
        <button type="submit" class="prominent">Crea</button>
      </menu>
    </form>
  </dialog>
{% endblock %}
{% block scripts %}
  <script src="/static/js/models.js"></script>
{% endblock %}

View File

@@ -1,89 +1,33 @@
<!DOCTYPE html>
{% extends "_layout.html" %}
{% block title %}Risultati{% endblock %}
{% block content %}
{# Page heading with the button for comparing the selected results. #}
<div class="page-head">
  <h2>Risultati training</h2>
  <button class="prominent" id="btn-compare">Confronta selezionati</button>
</div>
<html>
<head>
<title>Risultati</title>
<link href="../static/styles/style.css" rel="stylesheet">
<div id="results-list" class="list"></div>
<style>
.container {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
height: 100%;
}
{# Comparison view; starts hidden. #}
<section id="compare-panel" class="hidden">
  <h3>Confronto</h3>
  <div class="charts">
    {# The canvas is presumably drawn with Chart.js (loaded in the scripts
       block). Its responsive sizing reads the parent element, which must be
       relatively positioned and contain only that canvas. #}
    <div style="position: relative; min-width: 0;">
      <canvas id="cmp-loss"></canvas>
    </div>
  </div>
  <table id="cmp-table"></table>
  <div id="cmp-plots"></div>
</section>
.picker {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
height: 100%;
}
.picker .header {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
height: 100%;
}
</style>
</head>
<body>
<div class="header">
<h1>Risultati</h1>
<div class="profile">
<p>Utente</p>
<button>Logout</button>
</div>
</div>
<div class="container">
<div class="picker">
<div class="header">
<h2>
Seleziona
</h2>
<p>
una sessione di training eseguita per visualizzarne i risultati
</p>
</div>
<div class="grid">
<div class="card">
<h3>sessione 1</h3>
<div class="train-info">
<p>24/03/2026</p>
<p>12:00</p>
<p>dataset: d-1</p>
</div>
</div>
<div class="card">
<h3>sessione 2</h3>
<p>24/03/2026</p>
</div>
</div>
</div>
</div>
</body>
<script>
</script>
</html>
{# Detail view for a single training; starts hidden. #}
<section id="detail-panel" class="hidden">
  <h3>Dettaglio training <code id="dt-id"></code></h3>
  <div id="dt-meta"></div>
  <div class="charts">
    {# Each canvas gets its own relatively positioned wrapper. Chart.js
       (loaded in the scripts block) sizes a responsive chart from the canvas's
       parent, and that parent must not be shared with another canvas. #}
    <div style="position: relative; min-width: 0;">
      <canvas id="dt-loss"></canvas>
    </div>
    <div style="position: relative; min-width: 0;">
      <canvas id="dt-res"></canvas>
    </div>
  </div>
  <div id="dt-plots"></div>
</section>
{% endblock %}
{% block scripts %}
{# Pinned to the 4.x line: the unversioned URL always resolves to the latest
   release, so a future major version could break the page with no deploy. #}
<script src="https://cdn.jsdelivr.net/npm/chart.js@4"></script>
<script src="/static/js/results.js"></script>
{% endblock %}

View File

@@ -0,0 +1,33 @@
{% extends "_layout.html" %}
{% block title %}Test{% endblock %}
{% block content %}
  {# Heading with the slot counter; the span starts empty. #}
  <div class="page-head">
    <h2>Test modello</h2>
    <div class="queue-info" id="slot-info">Slot: <span id="slot-count"></span>/2</div>
  </div>

  {# Notice for when both test slots are taken; starts hidden. #}
  <div class="info-panel hidden" id="slot-full">
    <div class="icon">🚧</div>
    <h3>Slot test pieni</h3>
    <p>Massimo 2 utenti possono eseguire test contemporaneamente. Riprova tra qualche minuto.</p>
  </div>

  {# Model / training picker that starts a test session. #}
  <form class="form-row" id="test-start">
    <label>Modello<select id="t-model"></select></label>
    <label>Training<select id="t-training"></select></label>
    <button class="prominent" type="submit">Avvia sessione</button>
  </form>

  {# Active session area; starts hidden. The inputs form is empty in the
     template, presumably populated by test.js. #}
  <section class="hidden" id="test-session">
    <h3>Sessione <code id="ts-id"></code></h3>
    <form id="inputs-form"></form>
    <button class="prominent" id="btn-run">Esegui test</button>
    <button id="btn-end">Chiudi sessione</button>
    <h4>Risultati</h4>
    <div id="runs-list"></div>
  </section>
{% endblock %}
{% block scripts %}
  <script src="/static/js/test.js"></script>
{% endblock %}

View File

@@ -0,0 +1,35 @@
{% extends "_layout.html" %}
{% block title %}Train{% endblock %}
{% block content %}
  {# Heading with the queue counter; the span starts empty. #}
  <div class="page-head">
    <h2>Avvia training</h2>
    <div class="queue-info">Coda: <span id="queue-count"></span></div>
  </div>

  {# Launch form. The selects are empty in the template, presumably
     populated by train.js. #}
  <form id="train-form" class="form-row">
    <label>Modello<select name="model_id" id="f-model"></select></label>
    <label>Branch<select name="branch" id="f-branch"></select></label>
    <label>Commit<select name="patch" id="f-patch"></select></label>
    <label>Versione<input type="text" name="version" placeholder="1.0.0" required></label>
    <label>Dataset<select name="dataset_id" id="f-dataset"></select></label>
    <button type="submit" class="prominent">Avvia</button>
  </form>

  {# Live view of a training; starts hidden. #}
  <section id="live-panel" class="hidden">
    <h3>Training <code id="live-id"></code><span id="live-status">queued</span></h3>
    <div class="charts">
      {# Each canvas gets its own relatively positioned wrapper. Chart.js sizes
         a responsive chart from the canvas's parent, and that parent must not
         be shared with another canvas. #}
      <div style="position: relative; min-width: 0;">
        <canvas id="chart-loss"></canvas>
      </div>
      <div style="position: relative; min-width: 0;">
        <canvas id="chart-cpu"></canvas>
      </div>
    </div>
    <pre id="live-logs" class="logs"></pre>
  </section>

  <section>
    <h3>Recenti</h3>
    <div id="recent-trainings" class="list"></div>
  </section>
{% endblock %}
{% block scripts %}
  {# Pinned to the 4.x line: the unversioned URL always resolves to the latest
     release, so a future major version could break the page with no deploy. #}
  <script src="https://cdn.jsdelivr.net/npm/chart.js@4"></script>
  <script src="/static/js/train.js"></script>
{% endblock %}