feat: Add new API endpoints and HTML pages for ML model management
- Implemented HTML pages for datasets, models, training, testing, and results. - Created API endpoints for managing repositories, results, tests, and training sessions. - Added functionality for streaming training progress via Server-Sent Events (SSE). - Introduced a Dockerfile for the ML runner with necessary dependencies. - Developed an SDK for user code execution within the runner container. - Enhanced CSS styles for improved UI layout and navigation. - Established a layout template for consistent HTML structure across pages. - Added JavaScript for dynamic interactions on the models page. - Implemented WebSocket handling for real-time communication with kiosk devices and controllers. - Implemented model registration and management API at /api/models - Added Gitea proxy API for repository interactions at /api/repos - Created results API for listing and comparing training results at /api/results - Developed training management API for enqueueing and retrieving training jobs at /api/trainings - Introduced SSE endpoint for live training progress updates - Added HTML pages for models, datasets, and training management - Created a Dockerfile for the ML runner with necessary dependencies - Developed SDK for user code execution within the runner container - Enhanced CSS styles for improved UI/UX - Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
PORT=3007
|
||||
|
||||
# Auth condiviso
|
||||
JWT_SECRET=change-me
|
||||
INTERNAL_API_KEY=change-me
|
||||
AUTH_LOGIN_URL=https://auth.mebboat.it/login
|
||||
|
||||
# Postgres (db ml)
|
||||
PG_HOST=meb-postgres
|
||||
PG_PORT=5432
|
||||
DB_USER=meb
|
||||
DB_PASSWORD=meb
|
||||
ML_DB=ml
|
||||
|
||||
# Redis
|
||||
REDIS_HOST=meb-redis
|
||||
REDIS_PORT=6379
|
||||
|
||||
# MinIO (bucket unico)
|
||||
MINIO_ENDPOINT=minio
|
||||
MINIO_PORT=9000
|
||||
MINIO_USE_SSL=false
|
||||
MINIO_ACCESS_KEY=
|
||||
MINIO_SECRET_KEY=
|
||||
MINIO_BUCKET=ml
|
||||
|
||||
# InfluxDB
|
||||
INFLUX_URL=http://meb-influx:8086
|
||||
INFLUX_TOKEN=
|
||||
INFLUX_ORG=meb
|
||||
INFLUX_BUCKET=ml_metrics
|
||||
|
||||
# Gitea (self-hosted esterno)
|
||||
GITEA_URL=https://git.mebboat.it
|
||||
GITEA_TOKEN=
|
||||
|
||||
# API service
|
||||
API_URL=http://api:3003
|
||||
|
||||
# Training runtime
|
||||
ML_TRAIN_CONCURRENCY=1
|
||||
ML_RUNNER_IMAGE=meb-ml-runner:latest
|
||||
ML_RUNNER_TMP=/var/ml/tmp
|
||||
ML_GITCACHE_DIR=/var/ml/gitcache
|
||||
ML_MAX_UPLOAD_MB=500
|
||||
|
||||
@@ -3,6 +3,9 @@ FROM python:3.11-slim
|
||||
WORKDIR /app
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY ./requirements.txt .
|
||||
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
72
ml/core/api_client.py
Normal file
72
ml/core/api_client.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Client HTTP verso l'api-service (service-to-service via x-api-key).
|
||||
|
||||
Espone accesso a:
|
||||
/jobs ciclo di vita job
|
||||
/queue stato coda
|
||||
/pageconnections registro sessioni di pagina (enforcement /test max 2)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from core.config import settings
|
||||
|
||||
|
||||
def _headers() -> dict:
    """Build the headers attached to every api-service request.

    Returns:
        A dict with the internal API key under ``x-api-key`` and a JSON
        ``Content-Type``.
    """
    api_key = settings.internal_api_key
    return {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }
|
||||
|
||||
|
||||
async def _req(method: str, path: str, json: Optional[dict] = None, params: Optional[dict] = None) -> Any:
    """Send a single HTTP request to the api-service and decode the reply.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        path: Path appended verbatim to ``settings.api_url``.
        json: Optional JSON body.
        params: Optional query-string parameters.

    Returns:
        The decoded JSON body, or ``None`` when the response is 204 or has
        no content.

    Raises:
        httpx.HTTPStatusError: via ``raise_for_status()`` on 4xx/5xx replies.
    """
    target = "{}{}".format(settings.api_url, path)
    # A short-lived client is opened per call, with a 10 s timeout.
    async with httpx.AsyncClient(timeout=10.0) as http:
        response = await http.request(
            method,
            target,
            json=json,
            params=params,
            headers=_headers(),
        )
        response.raise_for_status()
        has_body = response.status_code != 204 and bool(response.content)
        return response.json() if has_body else None
|
||||
|
||||
|
||||
# ── jobs ────────────────────────────────────────────────────────────────────
|
||||
async def create_job(type_: str, created_by: str, payload: dict) -> dict:
    """POST a new job to ``/jobs`` and return the api-service reply.

    Args:
        type_: Sent as the ``type`` field of the request body.
        created_by: Sent as the ``created_by`` field.
        payload: Sent as the ``payload`` field.
    """
    body = {
        "type": type_,
        "created_by": created_by,
        "payload": payload,
    }
    return await _req("POST", "/jobs", json=body)
|
||||
|
||||
|
||||
async def update_job(job_id: str, **fields) -> dict:
    """PATCH ``/jobs/{job_id}`` with the given keyword fields as JSON body."""
    endpoint = f"/jobs/{job_id}"
    return await _req("PATCH", endpoint, json=fields)
|
||||
|
||||
|
||||
async def get_job(job_id: str) -> dict:
    """GET ``/jobs/{job_id}`` and return the decoded reply."""
    endpoint = f"/jobs/{job_id}"
    return await _req("GET", endpoint)
|
||||
|
||||
|
||||
async def list_jobs(type_: Optional[str] = None, status: Optional[str] = None, limit: int = 50) -> list:
    """GET ``/jobs`` with optional filters.

    Args:
        type_: When truthy, sent as the ``type`` query parameter.
        status: When truthy, sent as the ``status`` query parameter.
        limit: Always sent, as a string, in the ``limit`` query parameter.

    Returns:
        The decoded reply, or an empty list when the reply is empty/falsy.
    """
    optional_filters = {"type": type_, "status": status}
    query = {"limit": str(limit)}
    # Only filters that were actually supplied go on the wire.
    query.update({name: value for name, value in optional_filters.items() if value})
    found = await _req("GET", "/jobs", params=query)
    return found or []
|
||||
|
||||
|
||||
# ── queue ───────────────────────────────────────────────────────────────────
|
||||
async def queue_status(type_: str = "train") -> dict:
    """GET ``/queue`` for the given job type (``type`` query parameter)."""
    query = {"type": type_}
    return await _req("GET", "/queue", params=query)
|
||||
|
||||
|
||||
# ── page connections ───────────────────────────────────────────────────────
|
||||
async def page_connect(page: str, user_id: str, session_id: str) -> dict:
    """POST a page session to ``/pageconnections``.

    The body carries ``page``, ``user_id`` and ``session_id`` unchanged.
    """
    body = {
        "page": page,
        "user_id": user_id,
        "session_id": session_id,
    }
    return await _req("POST", "/pageconnections", json=body)
|
||||
|
||||
|
||||
async def page_ping(session_id: str) -> dict:
    """POST to ``/pageconnections/{session_id}/ping`` (no request body)."""
    endpoint = f"/pageconnections/{session_id}/ping"
    return await _req("POST", endpoint)
|
||||
|
||||
|
||||
async def page_disconnect(session_id: str) -> None:
    """DELETE ``/pageconnections/{session_id}``; the reply is discarded."""
    endpoint = f"/pageconnections/{session_id}"
    await _req("DELETE", endpoint)
|
||||
|
||||
|
||||
async def page_count(page: str) -> dict:
    """GET ``/pageconnections/{page}`` and return the decoded reply."""
    endpoint = f"/pageconnections/{page}"
    return await _req("GET", endpoint)
|
||||
64
ml/core/config.py
Normal file
64
ml/core/config.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Configurazione centralizzata del servizio ML, letta da env."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
def _b(name: str, default: bool = False) -> bool:
    """Read environment variable ``name`` as a boolean flag.

    Args:
        name: Environment variable to look up.
        default: Value used (via its ``str()`` form) when the variable is
            not set.

    Returns:
        ``True`` when the case-insensitive value is one of ``1``, ``true``,
        ``yes`` or ``on``; ``False`` for anything else.
    """
    raw = os.environ.get(name)
    if raw is None:
        raw = str(default)
    truthy = {"1", "true", "yes", "on"}
    return raw.lower() in truthy
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Settings:
    """Immutable bundle of ML-service settings read from environment variables.

    Each field default is an expression in the class body, so the environment
    is read when this class body executes, not on each ``Settings()`` call.
    Integer fields go through ``int(...)`` and raise ``ValueError`` if the
    variable is set to a non-numeric value.
    """

    # Postgres ("ml" database)
    pg_host: str = os.environ.get("PG_HOST", "meb-postgres")
    pg_port: int = int(os.environ.get("PG_PORT", "5432"))
    pg_user: str = os.environ.get("DB_USER", "meb")
    pg_password: str = os.environ.get("DB_PASSWORD", "meb")
    pg_db: str = os.environ.get("ML_DB", "ml")

    # Redis
    redis_host: str = os.environ.get("REDIS_HOST", "meb-redis")
    redis_port: int = int(os.environ.get("REDIS_PORT", "6379"))

    # MinIO (single bucket)
    minio_endpoint: str = os.environ.get("MINIO_ENDPOINT", "minio")
    minio_port: int = int(os.environ.get("MINIO_PORT", "9000"))
    minio_use_ssl: bool = _b("MINIO_USE_SSL", False)
    minio_access_key: str = os.environ.get("MINIO_ACCESS_KEY", "")
    minio_secret_key: str = os.environ.get("MINIO_SECRET_KEY", "")
    minio_bucket: str = os.environ.get("MINIO_BUCKET", "ml")

    # InfluxDB — accepts both INFLUX_* and INFLX_* so it lines up with the
    # variables already used by the other services (realtime, api) without
    # duplicating the config. A set-but-empty INFLUX_* falls through to INFLX_*.
    influx_url: str = os.environ.get("INFLUX_URL") or os.environ.get("INFLX_URL", "http://meb-influx:8086")
    influx_token: str = os.environ.get("INFLUX_TOKEN") or os.environ.get("INFLX_TOKEN", "")
    influx_org: str = os.environ.get("INFLUX_ORG") or os.environ.get("INFLX_ORG", "meb")
    # Bucket dedicated to ML training/test metrics, kept separate from logs
    # and weather data. ML_INFLUX_BUCKET takes precedence over INFLUX_BUCKET.
    influx_bucket: str = os.environ.get("ML_INFLUX_BUCKET") or os.environ.get("INFLUX_BUCKET", "ml_metrics")

    # Gitea (installed externally)
    gitea_url: str = os.environ.get("GITEA_URL", "")
    gitea_token: str = os.environ.get("GITEA_TOKEN", "")

    # API service (for jobs/queue/pageconnections)
    api_url: str = os.environ.get("API_URL", "http://api:3003")
    internal_api_key: str = os.environ.get("INTERNAL_API_KEY", "")

    # Auth (shared)
    jwt_secret: str = os.environ.get("JWT_SECRET", "")
    auth_login_url: str = os.environ.get("AUTH_LOGIN_URL", "https://auth.mebboat.it/login")

    # Training execution
    train_concurrency: int = int(os.environ.get("ML_TRAIN_CONCURRENCY", "1"))
    runner_image: str = os.environ.get("ML_RUNNER_IMAGE", "meb-ml-runner:latest")
    runner_tmp_dir: str = os.environ.get("ML_RUNNER_TMP", "/var/ml/tmp")
    gitcache_dir: str = os.environ.get("ML_GITCACHE_DIR", "/var/ml/gitcache")

    # Runtime limits
    max_upload_mb: int = int(os.environ.get("ML_MAX_UPLOAD_MB", "500"))
|
||||
|
||||
|
||||
# Module-level Settings instance, created when this module is imported.
settings = Settings()
|
||||
53
ml/core/db.py
Normal file
53
ml/core/db.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Connessione asyncpg al database ml. Pool singleton."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncpg
|
||||
from typing import Optional
|
||||
|
||||
from core.config import settings
|
||||
|
||||
# Module-wide asyncpg pool; None until init_pool() creates it, reset by close_pool().
_pool: Optional[asyncpg.Pool] = None
|
||||
|
||||
|
||||
async def init_pool() -> asyncpg.Pool:
    """Create the module-wide asyncpg pool on first call and return it.

    Later calls return the already-created pool without reconnecting.
    Connection parameters come from ``settings``; the pool holds between 1
    and 10 connections and uses a 30 s command timeout.
    """
    global _pool
    if _pool is not None:
        return _pool

    pool_options = dict(
        host=settings.pg_host,
        port=settings.pg_port,
        user=settings.pg_user,
        password=settings.pg_password,
        database=settings.pg_db,
        min_size=1,
        max_size=10,
        command_timeout=30,
    )
    _pool = await asyncpg.create_pool(**pool_options)
    return _pool
|
||||
|
||||
|
||||
async def close_pool() -> None:
    """Close the module-wide pool, if one exists, and reset it to ``None``."""
    global _pool
    if _pool is None:
        return
    await _pool.close()
    _pool = None
|
||||
|
||||
|
||||
def pool() -> asyncpg.Pool:
    """Return the initialized pool.

    Raises:
        RuntimeError: if ``init_pool()`` has not created the pool yet.
    """
    active = _pool
    if active is not None:
        return active
    raise RuntimeError("DB pool not initialized — call init_pool() at startup")
|
||||
|
||||
|
||||
async def fetch(sql: str, *args):
    """Run ``sql`` with positional ``args`` on a pooled connection.

    Returns whatever the connection's ``fetch`` returns.
    """
    async with pool().acquire() as conn:
        rows = await conn.fetch(sql, *args)
    return rows
|
||||
|
||||
|
||||
async def fetchrow(sql: str, *args):
    """Run ``sql`` with positional ``args`` on a pooled connection.

    Returns whatever the connection's ``fetchrow`` returns.
    """
    async with pool().acquire() as conn:
        row = await conn.fetchrow(sql, *args)
    return row
|
||||
|
||||
|
||||
async def execute(sql: str, *args):
    """Execute ``sql`` with positional ``args`` on a pooled connection.

    Returns whatever the connection's ``execute`` returns.
    """
    async with pool().acquire() as conn:
        outcome = await conn.execute(sql, *args)
    return outcome
|
||||
439
ml/core/docker_runner.py
Normal file
439
ml/core/docker_runner.py
Normal file
@@ -0,0 +1,439 @@
|
||||
"""Runner Docker per train e test.
|
||||
|
||||
train:
|
||||
- clone repo Gitea @ sha
|
||||
- prepara workdir /var/ml/tmp/{training_id}
|
||||
- scarica dataset da MinIO in workdir/data.<ext>
|
||||
- docker run meb-ml-runner con mount tmp, env, limits da model.yml
|
||||
- legge stdout JSON → Redis stream + Influx; docker stats ogni 5s
|
||||
- a fine: collect outputs, upload su MinIO prefix artifacts_prefix
|
||||
- UPDATE trainings
|
||||
|
||||
test:
|
||||
- analogo ma sincrono, stdin JSON → stdout JSON
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import docker
|
||||
from influxdb_client import Point
|
||||
|
||||
from core import db, gitea, influx_client, minio_client, redis_client
|
||||
from core.config import settings
|
||||
from core.model_spec import fetch_and_parse_spec
|
||||
|
||||
# Logger named after this module.
log = logging.getLogger(__name__)

# Cached Docker SDK client; created lazily by _docker_client().
_docker = None
|
||||
|
||||
|
||||
def _docker_client():
    """Return the shared Docker client, creating it from the environment on first use."""
    global _docker
    if _docker is not None:
        return _docker
    _docker = docker.from_env()
    return _docker
|
||||
|
||||
|
||||
async def _emit(stream_key: str, payload: dict) -> None:
    """Append ``payload`` (JSON-encoded) to the Redis stream ``stream_key``.

    The entry is stored under the single field ``payload`` and the stream is
    capped at 10 000 entries. Failures are logged as warnings and never
    propagated to the caller.
    """
    try:
        stream = redis_client.client()
        entry = {"payload": json.dumps(payload)}
        await stream.xadd(stream_key, entry, maxlen=10_000)
    except Exception as exc:
        log.warning("xadd failed: %s", exc)
|
||||
|
||||
|
||||
async def _clone_repo(owner_repo: str, sha: str, dest: Path) -> None:
    """Clone ``owner_repo`` from Gitea into ``dest`` and check out ``sha``.

    The clone is shallow (``--depth 50``, no ``--branch``). If ``sha`` cannot
    be checked out from that history, it is fetched explicitly from
    ``origin`` and the checkout is retried once.

    The clone URL may embed the Gitea token, so git's stderr is scrubbed of
    the token before being placed in an exception message: callers persist
    and publish those messages.

    Args:
        owner_repo: Repository in ``owner/name`` form, passed to
            ``gitea.clone_url``.
        sha: Revision to check out.
        dest: Target directory; created if missing.

    Raises:
        RuntimeError: if ``git clone`` or the final ``git checkout`` fails.
    """
    dest.mkdir(parents=True, exist_ok=True)
    url = gitea.clone_url(owner_repo)

    def _scrub(raw: bytes) -> str:
        # Keep the token out of errors that end up in logs, Postgres and Redis.
        text = raw.decode(errors="replace")
        token = settings.gitea_token
        if token:
            text = text.replace(token, "***")
        return text[:400]

    async def _git(*args: str) -> tuple[int, str]:
        # Run one git command; return (exit code, scrubbed stderr).
        proc = await asyncio.create_subprocess_exec(
            "git", *args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )
        _, err = await proc.communicate()
        return proc.returncode, _scrub(err)

    code, err = await _git("clone", "--depth", "50", url, str(dest))
    if code != 0:
        raise RuntimeError(f"git clone failed: {err}")

    code, err = await _git("-C", str(dest), "checkout", sha)
    if code != 0:
        # The revision is not in the shallow history (older commit or another
        # branch): ask the server for it directly, then retry the checkout.
        fetch_code, _ = await _git("-C", str(dest), "fetch", "--depth", "1", "origin", sha)
        if fetch_code == 0:
            code, err = await _git("-C", str(dest), "checkout", sha)
    if code != 0:
        raise RuntimeError(f"git checkout failed: {err}")
|
||||
|
||||
|
||||
async def _download_dataset(dataset_id: str, dest: Path) -> str:
    """Download a dataset from MinIO into ``dest`` and return the file path.

    The ``datasets`` row supplies the object key and format; the file is
    written as ``data.<ext>`` where ``<ext>`` is ``csv``, ``json``, ``nc``
    (for ``netcdf``) or ``bin`` for any other format.

    The MinIO download and the disk write are synchronous calls, so both run
    in a worker thread to keep the event loop responsive while a large
    dataset is transferred.

    Args:
        dataset_id: UUID (string form) of the ``datasets`` row.
        dest: Existing directory to write the file into.

    Returns:
        Path of the written file, as a string.

    Raises:
        RuntimeError: if no dataset row matches ``dataset_id``.
    """
    row = await db.fetchrow(
        "SELECT file_key, format FROM datasets WHERE id = $1", uuid.UUID(dataset_id)
    )
    if not row:
        raise RuntimeError("dataset not found")

    ext = {"csv": "csv", "json": "json", "netcdf": "nc"}.get(row["format"], "bin")
    out = dest / f"data.{ext}"

    # NOTE(review): the bucket is hard-coded to "ml.datasets" while the rest of
    # the service appears to use the single settings.minio_bucket with a
    # "datasets/" prefix — confirm against the dataset upload code.
    data = await asyncio.to_thread(minio_client.get_bytes, row["file_key"], bucket="ml.datasets")
    await asyncio.to_thread(out.write_bytes, data)
    return str(out)
|
||||
|
||||
|
||||
def _stats_loop_sync(container, training_id: str, model_id: str, samples: list, stop_evt: asyncio.Event, loop: asyncio.AbstractEventLoop):
    """Blocking sampler meant to run in a worker thread.

    About every 5 seconds, until ``stop_evt`` is set, it reads one
    ``container.stats`` snapshot, appends ``(cpu_pct, mem_mb)`` to
    ``samples`` and schedules an Influx write of the same values on ``loop``.

    Args:
        container: Docker container object exposing ``stats(stream=False)``.
        training_id: Written as the ``training_id`` tag.
        model_id: Written as the ``model_id`` tag.
        samples: List mutated in place with one tuple per successful sample.
        stop_evt: Event polled via ``is_set()`` to end the loop.
        loop: Event loop on which the Influx write coroutine is scheduled.
    """
    interval_s = 5.0
    # Poll the stop flag in short slices so the thread exits promptly after
    # the training ends instead of sleeping out a full interval.
    tick_s = 0.25

    def _report_write_failure(fut) -> None:
        # Surface Influx write errors that would otherwise vanish with the future.
        if fut.cancelled():
            return
        exc = fut.exception()
        if exc is not None:
            log.warning("influx write stats failed: %s", exc)

    # NOTE(review): asyncio.Event is not documented as thread-safe; only
    # is_set() is read from this thread. A threading.Event would be cleaner
    # but would change what callers must pass.
    while not stop_evt.is_set():
        try:
            stats = container.stats(stream=False)
            cpu_now = stats["cpu_stats"]
            cpu_prev = stats["precpu_stats"]
            # CPU% = container CPU delta over system CPU delta, scaled by CPU count.
            cpu_delta = cpu_now["cpu_usage"]["total_usage"] - cpu_prev["cpu_usage"]["total_usage"]
            sys_delta = cpu_now.get("system_cpu_usage", 0) - cpu_prev.get("system_cpu_usage", 0)
            online = cpu_now.get("online_cpus") or len(cpu_now["cpu_usage"].get("percpu_usage") or [1])
            cpu_pct = (cpu_delta / sys_delta) * online * 100.0 if sys_delta > 0 else 0.0
            mem_mb = (stats["memory_stats"].get("usage") or 0) / (1024 * 1024)

            samples.append((cpu_pct, mem_mb))
            point = (
                Point("ml_training")
                .tag("training_id", training_id)
                .tag("model_id", model_id)
                .field("cpu_pct", float(cpu_pct))
                .field("mem_mb", float(mem_mb))
            )
            fut = asyncio.run_coroutine_threadsafe(influx_client.write_points([point]), loop)
            fut.add_done_callback(_report_write_failure)
        except Exception as e:
            log.warning("stats loop error: %s", e)

        deadline = time.monotonic() + interval_s
        while not stop_evt.is_set() and time.monotonic() < deadline:
            time.sleep(tick_s)
|
||||
|
||||
|
||||
async def _stream_container_logs(container, training_id: str, model_id: str, stream_key: str):
    """Follow the container's stdout/stderr and publish each line as an event.

    Every non-empty line goes to the Redis stream ``stream_key``: a line that
    is a JSON object is published as-is, anything else is wrapped as
    ``{"type": "log", "level": "info", "message": <line>}``. Events with
    ``type == "metric"`` are also written to Influx, one field per numeric
    value.

    The log stream yields chunks, and this code does not assume a chunk is
    exactly one line: chunks are buffered and split on newlines, so several
    lines in one chunk, or one line spread over several chunks, are still
    parsed line by line.

    Args:
        container: Docker container object exposing ``logs(...)``.
        training_id: Written as the ``training_id`` tag on metric points.
        model_id: Written as the ``model_id`` tag on metric points.
        stream_key: Redis stream key that receives the events.
    """
    loop = asyncio.get_running_loop()
    chunks = await loop.run_in_executor(
        None,
        lambda: container.logs(stream=True, follow=True, stdout=True, stderr=True),
    )

    async def _handle_line(text: str) -> None:
        # Publish one decoded log line; forward metric events to Influx.
        text = text.rstrip("\r")
        if not text:
            return
        # Non-JSON lines become plain log events.
        payload: dict = {"type": "log", "level": "info", "message": text}
        if text.startswith("{") and text.endswith("}"):
            try:
                payload = json.loads(text)
            except json.JSONDecodeError:
                pass

        await _emit(stream_key, payload)

        if payload.get("type") != "metric":
            return
        p = Point("ml_training").tag("training_id", training_id).tag("model_id", model_id)
        for k, v in payload.items():
            if k == "type":
                continue
            if isinstance(v, (int, float)):
                p = p.field(k, float(v))
        try:
            await influx_client.write_points([p])
        except Exception as e:
            log.warning("influx write metric failed: %s", e)

    pending = b""
    while True:
        # next() blocks on the Docker socket, so it runs in the executor;
        # the None default marks the end of the stream.
        chunk = await loop.run_in_executor(None, next, chunks, None)
        if chunk is None:
            break
        if not isinstance(chunk, (bytes, bytearray)):
            continue
        pending += bytes(chunk)
        # Everything before the last newline is complete; the rest waits for
        # the next chunk.
        *complete, pending = pending.split(b"\n")
        for raw in complete:
            await _handle_line(raw.decode("utf-8", errors="replace"))

    if pending:
        await _handle_line(pending.decode("utf-8", errors="replace"))
|
||||
|
||||
|
||||
async def run_training_job(training_id: str) -> None:
    """Run one training job end-to-end and record its outcome.

    Steps: load the ``trainings`` and ``models`` rows, mark the job running
    (Postgres row and Redis hash), clone the repo at the training's ``patch``
    revision, read the model spec, download the dataset, start the runner
    container, stream its logs and resource stats, then upload everything
    under ``out/`` plus a JSONL copy of the Redis event stream to MinIO and
    write the final status, results and resource summary to Postgres.

    Errors raised inside the pipeline are caught, logged and stored as
    ``status='failed'``; they are not re-raised. The container, the stats
    thread and the work directory are always cleaned up.

    Args:
        training_id: UUID (string form) of the ``trainings`` row to execute.
    """
    r = redis_client.client()
    state_key = f"ml:train:{training_id}"
    stream_key = f"ml:train:{training_id}:events"

    tr = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", uuid.UUID(training_id))
    if not tr:
        log.error("training %s not found", training_id)
        return
    model = await db.fetchrow("SELECT * FROM models WHERE id = $1", tr["model_id"])
    if not model:
        await db.execute(
            "UPDATE trainings SET status='failed', error=$2 WHERE id=$1",
            uuid.UUID(training_id), "model not found",
        )
        return

    await db.execute(
        "UPDATE trainings SET status='running', started_at=NOW() WHERE id=$1",
        uuid.UUID(training_id),
    )
    await r.hset(state_key, mapping={"status": "running", "progress": "0", "message": "starting"})

    workdir = Path(settings.runner_tmp_dir) / training_id
    artifacts_prefix = f"models/{tr['model_id']}/{tr['version']}/{tr['patch']}"
    error: Optional[str] = None
    samples: list[tuple[float, float]] = []
    # Created before the try block so the finally clause can always clean up.
    container = None
    stop_evt = asyncio.Event()
    try:
        workdir.mkdir(parents=True, exist_ok=True)
        await _emit(stream_key, {"type": "log", "level": "info", "message": "cloning repo"})
        await _clone_repo(model["gitea_repo"], tr["patch"], workdir / "repo")

        await _emit(stream_key, {"type": "log", "level": "info", "message": "parsing model.yml"})
        spec = await fetch_and_parse_spec(model["gitea_repo"], tr["patch"]) or {}
        # "or {}" also covers a spec whose "train" key is present but null.
        train_spec = spec.get("train") or {}
        entrypoint = train_spec.get("entrypoint") or "python -m src.train"
        resources = spec.get("resources", {}) or {}

        await _emit(stream_key, {"type": "log", "level": "info", "message": "downloading dataset"})
        dataset_path = await _download_dataset(str(tr["dataset_id"]), workdir)

        out_dir = workdir / "out"
        out_dir.mkdir(exist_ok=True)

        # docker run
        dc = _docker_client()
        await _emit(stream_key, {"type": "log", "level": "info", "message": "starting container"})
        # Only keyword arguments accepted by containers.run are passed: the
        # SDK rejects unknown ones with a TypeError.
        container = dc.containers.run(
            settings.runner_image,
            command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt 2>&1 || true && {entrypoint}"],
            detach=True,
            working_dir="/workdir/repo",
            environment={
                "MEB_DATASET_PATH": f"/workdir/{Path(dataset_path).name}",
                "MEB_ARTIFACTS_DIR": "/workdir/out",
                "MEB_TRAINING_ID": training_id,
            },
            volumes={str(workdir): {"bind": "/workdir", "mode": "rw"}},
            network_mode="none",
            mem_limit=f"{int(resources.get('mem_mb', 2048))}m",
            nano_cpus=int(float(resources.get("cpu", 1)) * 1e9),
            read_only=False,
            tty=False,
        )

        loop = asyncio.get_running_loop()
        stats_task = loop.run_in_executor(
            None, _stats_loop_sync, container, training_id, str(tr["model_id"]), samples, stop_evt, loop
        )
        log_task = asyncio.create_task(
            _stream_container_logs(container, training_id, str(tr["model_id"]), stream_key)
        )

        # Wait for the container to exit.
        exit_code = await loop.run_in_executor(None, lambda: container.wait()["StatusCode"])
        stop_evt.set()
        await log_task
        # Cancelling only detaches the asyncio future; the worker thread
        # stops by itself once it sees stop_evt.
        stats_task.cancel()

        if exit_code != 0:
            error = f"container exited with code {exit_code}"

        # Collect outputs.
        results: dict = {}
        final_metrics_path = out_dir / "metrics.json"
        if final_metrics_path.exists():
            try:
                results = json.loads(final_metrics_path.read_text())
            except Exception:
                results = {"raw": final_metrics_path.read_text()[:10000]}

        # Upload artifacts (the whole out/ folder).
        for p in out_dir.rglob("*"):
            if p.is_file():
                rel = p.relative_to(out_dir).as_posix()
                key = f"{artifacts_prefix}/{rel}"
                minio_client.put_bytes(key, p.read_bytes())

        # Archive the Redis event stream as JSONL on MinIO for persistence.
        try:
            entries = await r.xrange(stream_key, min="-", max="+")
            lines = "\n".join(
                json.dumps({"id": i, **({"payload": json.loads(f.get("payload", "{}"))} if "payload" in f else f)})
                for i, f in entries
            )
            minio_client.put_bytes(f"trainings/{training_id}/logs.jsonl", lines.encode("utf-8"), "application/x-ndjson")
        except Exception as e:
            log.warning("log archive failed: %s", e)

        cpu_avg = sum(s[0] for s in samples) / len(samples) if samples else 0.0
        cpu_peak = max((s[0] for s in samples), default=0.0)
        mem_avg = sum(s[1] for s in samples) / len(samples) if samples else 0.0
        mem_peak = max((s[1] for s in samples), default=0.0)
        resource_summary = {
            "cpu_avg": round(cpu_avg, 2),
            "cpu_peak": round(cpu_peak, 2),
            "mem_avg_mb": round(mem_avg, 2),
            "mem_peak_mb": round(mem_peak, 2),
            "samples": len(samples),
        }

        status = "failed" if error else "succeeded"
        await db.execute(
            """
            UPDATE trainings SET
            status=$2,
            finished_at=NOW(),
            duration_ms=EXTRACT(EPOCH FROM (NOW() - started_at))*1000,
            artifacts_prefix=$3,
            results=$4::jsonb,
            resource_summary=$5::jsonb,
            error=$6
            WHERE id=$1
            """,
            uuid.UUID(training_id),
            status,
            artifacts_prefix,
            json.dumps(results),
            json.dumps(resource_summary),
            error,
        )
        await r.hset(state_key, mapping={"status": status, "progress": "100", "message": error or "done"})
        await _emit(stream_key, {"type": "end", "status": status, "error": error})

        # Flush the Influx points accumulated during training (batched writes).
        await influx_client.flush()

    except Exception as e:
        log.exception("training %s failed: %s", training_id, e)
        await db.execute(
            "UPDATE trainings SET status='failed', finished_at=NOW(), error=$2 WHERE id=$1",
            uuid.UUID(training_id), str(e)[:1000],
        )
        await r.hset(state_key, mapping={"status": "failed", "message": str(e)[:200]})
        await _emit(stream_key, {"type": "end", "status": "failed", "error": str(e)[:400]})
    finally:
        # Stop the stats thread on every path, including failures.
        stop_evt.set()
        # Remove the container even when the pipeline failed midway.
        if container is not None:
            try:
                container.remove(force=True)
            except Exception as e:
                log.warning("container remove failed for training %s: %s", training_id, e)
        # Clean up the work directory.
        shutil.rmtree(workdir, ignore_errors=True)
|
||||
|
||||
|
||||
async def run_test_once(training_id: str, inputs: dict) -> dict:
    """Run a single prediction in a freshly spawned runner container.

    The repo is cloned at the training's ``patch`` revision, the training's
    artifacts (if any) are downloaded from MinIO into ``artifacts/``, and the
    container receives ``{"inputs": inputs}`` as one JSON line on stdin. The
    first stdout line that is a JSON object with an ``outputs`` key supplies
    the result.

    The container and the work directory are removed on every path,
    including failures.

    Args:
        training_id: UUID (string form) of the training whose model is used.
        inputs: JSON-serializable inputs forwarded to the container.

    Returns:
        Dict with ``outputs``, ``exit_code``, ``cpu_peak``, ``mem_peak_mb``
        and ``raw_log`` (last 2000 characters of stdout).

    Raises:
        RuntimeError: if the training row does not exist; clone errors from
            ``_clone_repo`` propagate as well.
    """
    tr = await db.fetchrow(
        "SELECT t.*, m.gitea_repo FROM trainings t JOIN models m ON t.model_id = m.id WHERE t.id=$1",
        uuid.UUID(training_id),
    )
    if not tr:
        raise RuntimeError("training not found")

    spec = await fetch_and_parse_spec(tr["gitea_repo"], tr["patch"]) or {}
    test_spec = spec.get("test") or {}
    entrypoint = test_spec.get("entrypoint") or "python -m src.predict"

    workdir = Path(settings.runner_tmp_dir) / f"test-{uuid.uuid4()}"
    workdir.mkdir(parents=True, exist_ok=True)
    # Bound before the try block so the finally clause can always clean up.
    container = None
    stop = False
    try:
        await _clone_repo(tr["gitea_repo"], tr["patch"], workdir / "repo")

        # Download artifacts.
        if tr["artifacts_prefix"]:
            art_dir = workdir / "artifacts"
            art_dir.mkdir(exist_ok=True)
            for obj in minio_client.list_prefix(tr["artifacts_prefix"] + "/"):
                rel = obj["name"][len(tr["artifacts_prefix"]) + 1:]
                out_path = art_dir / rel
                out_path.parent.mkdir(parents=True, exist_ok=True)
                out_path.write_bytes(minio_client.get_bytes(obj["name"]))

        dc = _docker_client()
        payload = json.dumps({"inputs": inputs}).encode()
        container = dc.containers.run(
            settings.runner_image,
            command=["sh", "-c", f"cd /workdir/repo && pip install -q -r requirements.txt 2>&1 >/dev/null || true && {entrypoint}"],
            detach=True,
            working_dir="/workdir/repo",
            environment={
                "MEB_ARTIFACTS_DIR": "/workdir/artifacts",
                "MEB_TRAINING_ID": training_id,
            },
            volumes={str(workdir): {"bind": "/workdir", "mode": "ro"}},
            network_mode="none",
            mem_limit="2048m",
            nano_cpus=int(1e9),
            stdin_open=True,
            tty=False,
        )

        # Write the inputs to the container's stdin through the attach socket.
        sock = container.attach_socket(params={"stdin": 1, "stream": 1})
        try:
            sock._sock.sendall(payload + b"\n")
        except Exception as e:
            # Still best-effort, but a lost input explains an empty result,
            # so it is logged instead of silently ignored.
            log.warning("test %s: writing inputs to container stdin failed: %s", training_id, e)
        finally:
            try:
                sock.close()
            except Exception as e:
                log.debug("test %s: closing attach socket failed: %s", training_id, e)

        loop = asyncio.get_running_loop()
        # Peak resource usage, updated from a worker thread.
        peak_cpu = 0.0
        peak_mem = 0.0

        def _stats():
            nonlocal peak_cpu, peak_mem, stop
            for st in container.stats(stream=True, decode=True):
                if stop:
                    return
                try:
                    cpu_delta = st["cpu_stats"]["cpu_usage"]["total_usage"] - st["precpu_stats"]["cpu_usage"]["total_usage"]
                    sys_delta = st["cpu_stats"].get("system_cpu_usage", 0) - st["precpu_stats"].get("system_cpu_usage", 0)
                    online = st["cpu_stats"].get("online_cpus") or 1
                    cpu_pct = (cpu_delta / sys_delta) * online * 100 if sys_delta > 0 else 0
                    mem_mb = (st["memory_stats"].get("usage") or 0) / (1024 * 1024)
                    peak_cpu = max(peak_cpu, cpu_pct)
                    peak_mem = max(peak_mem, mem_mb)
                except Exception:
                    # Incomplete stats frames are skipped.
                    pass

        # Fire-and-forget: the thread ends when the stats stream closes or
        # when it observes stop on its next frame.
        loop.run_in_executor(None, _stats)

        exit_info = await loop.run_in_executor(None, container.wait)
        stop = True
        # Reading the logs is a blocking HTTP call; keep it off the event loop.
        raw_logs = await loop.run_in_executor(
            None, lambda: container.logs(stdout=True, stderr=False)
        )
        logs = raw_logs.decode("utf-8", errors="replace")

        outputs: dict = {}
        for line in logs.strip().splitlines():
            line = line.strip()
            if line.startswith("{") and line.endswith("}"):
                try:
                    obj = json.loads(line)
                    if "outputs" in obj:
                        outputs = obj["outputs"]
                        break
                except json.JSONDecodeError:
                    continue

        return {
            "outputs": outputs,
            "exit_code": exit_info.get("StatusCode"),
            "cpu_peak": round(peak_cpu, 2),
            "mem_peak_mb": round(peak_mem, 2),
            "raw_log": logs[-2000:],
        }
    finally:
        stop = True
        # Remove the container on every path so failed tests do not pile up.
        if container is not None:
            try:
                container.remove(force=True)
            except Exception as e:
                log.warning("container remove failed for test of %s: %s", training_id, e)
        shutil.rmtree(workdir, ignore_errors=True)
|
||||
57
ml/core/gitea.py
Normal file
57
ml/core/gitea.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Client Gitea: browse repo, branches, commits, file raw, clone URL autenticato."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from core.config import settings
|
||||
|
||||
|
||||
def _headers() -> dict:
    """Build the request headers for the Gitea API.

    Always asks for JSON; adds ``Authorization: token <token>`` only when
    ``settings.gitea_token`` is non-empty.
    """
    headers = {"Accept": "application/json"}
    token = settings.gitea_token
    if not token:
        return headers
    headers["Authorization"] = f"token {token}"
    return headers
|
||||
|
||||
|
||||
def clone_url(owner_repo: str) -> str:
    """Return the HTTP(S) clone URL for ``owner_repo`` — server-side use ONLY.

    When a Gitea token is configured it is embedded as
    ``oauth2:<token>@host``. The token is percent-encoded so reserved
    characters cannot corrupt the URL, and any userinfo already present in
    ``GITEA_URL`` is replaced rather than stacked.

    Args:
        owner_repo: Repository in ``owner/name`` form.

    Returns:
        ``<base>/<owner_repo>.git``, with credentials if a token is set and
        the base URL uses ``http`` or ``https``.

    Raises:
        RuntimeError: if ``GITEA_URL`` is not configured.
    """
    from urllib.parse import quote, urlsplit, urlunsplit

    if not settings.gitea_url:
        raise RuntimeError("GITEA_URL not configured")

    base = settings.gitea_url.rstrip("/")
    token = settings.gitea_token
    if token:
        parts = urlsplit(base)
        if parts.scheme in ("http", "https") and parts.netloc:
            # Drop any existing "user[:pass]@" prefix before adding ours.
            host = parts.netloc.rpartition("@")[2]
            netloc = f"oauth2:{quote(token, safe='')}@{host}"
            base = urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))
    return f"{base}/{owner_repo}.git"
|
||||
|
||||
|
||||
async def _get(path: str, params: Optional[dict] = None) -> list | dict:
    """GET ``<gitea_url>/api/v1<path>`` and return the decoded JSON body.

    Args:
        path: API path appended after ``/api/v1``.
        params: Optional query-string parameters.

    Raises:
        httpx.HTTPStatusError: via ``raise_for_status()`` on 4xx/5xx replies.
    """
    endpoint = "{}/api/v1{}".format(settings.gitea_url.rstrip("/"), path)
    # A short-lived client is opened per call, with a 15 s timeout.
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.get(endpoint, params=params, headers=_headers())
        response.raise_for_status()
        return response.json()
|
||||
|
||||
|
||||
async def list_repos(limit: int = 50) -> list[dict]:
    """Query ``/repos/search`` and return the list under its ``data`` key.

    Returns an empty list when the reply is not a JSON object or carries no
    ``data`` key.
    """
    found = await _get("/repos/search", params={"limit": str(limit)})
    if not isinstance(found, dict):
        return []
    return found.get("data", [])
|
||||
|
||||
|
||||
async def list_branches(owner_repo: str) -> list[dict]:
    """Return the reply of ``/repos/{owner_repo}/branches``."""
    endpoint = f"/repos/{owner_repo}/branches"
    return await _get(endpoint)
|
||||
|
||||
|
||||
async def list_commits(owner_repo: str, branch: str = "main", limit: int = 50) -> list[dict]:
    """Return the reply of ``/repos/{owner_repo}/commits``.

    ``branch`` is sent as the ``sha`` query parameter and ``limit`` as a
    string in ``limit``.
    """
    endpoint = f"/repos/{owner_repo}/commits"
    query = {"sha": branch, "limit": str(limit)}
    return await _get(endpoint, params=query)
|
||||
|
||||
|
||||
async def get_file_raw(owner_repo: str, ref: str, path: str) -> bytes:
    """Download the raw bytes of ``path`` at revision ``ref``.

    Calls ``/api/v1/repos/{owner_repo}/raw/{path}`` with ``ref`` as a query
    parameter.

    Raises:
        httpx.HTTPStatusError: via ``raise_for_status()`` on 4xx/5xx replies.
    """
    root = settings.gitea_url.rstrip("/")
    endpoint = f"{root}/api/v1/repos/{owner_repo}/raw/{path}"
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.get(endpoint, params={"ref": ref}, headers=_headers())
        response.raise_for_status()
        return response.content
|
||||
75
ml/core/influx_client.py
Normal file
75
ml/core/influx_client.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Client InfluxDB (influxdb-client sync wrapper in thread-pool per async).
|
||||
|
||||
Le scritture usano il batching async dell'SDK invece di SYNCHRONOUS.
|
||||
Le metriche di training arrivano in burst (logs container, stats loop ogni 5s):
|
||||
con SYNCHRONOUS ogni write era una HTTP request bloccante. Con WriteOptions
|
||||
batched, l'SDK accumula i Point e fa flush periodico in background, senza
|
||||
perdere durabilità (flush forzato a fine training).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from influxdb_client import InfluxDBClient, Point, WriteOptions
|
||||
|
||||
from core.config import settings
|
||||
|
||||
# Shared InfluxDB client; None until client() builds it.
_client: Optional[InfluxDBClient] = None

# Batching write API bound to _client; set by client() together with it.
_write_api = None
|
||||
|
||||
|
||||
def client() -> InfluxDBClient:
    """Return the shared InfluxDB client, building it on first use.

    The first call also creates the module's batching write API: batches of
    200 points, 2000 ms flush interval, 200 ms jitter, 2000 ms retry interval
    and up to 3 retries.
    """
    global _client, _write_api
    if _client is not None:
        return _client

    _client = InfluxDBClient(
        url=settings.influx_url,
        token=settings.influx_token,
        org=settings.influx_org,
    )
    batching = WriteOptions(
        batch_size=200,
        flush_interval=2_000,
        jitter_interval=200,
        retry_interval=2_000,
        max_retries=3,
    )
    _write_api = _client.write_api(write_options=batching)
    return _client
|
||||
|
||||
|
||||
def _wa():
    """Return the batching write API, initializing the client if needed."""
    # client() populates _write_api as a side effect of its first call.
    client()
    writer = _write_api
    return writer
|
||||
|
||||
|
||||
async def write_points(points: Iterable[Point]) -> None:
    """Hand ``points`` to the batching write API from a worker thread.

    The iterable is materialized into a list first; the write targets
    ``settings.influx_bucket`` in ``settings.influx_org``.
    """
    writer = _wa()
    batch = list(points)
    bucket, org = settings.influx_bucket, settings.influx_org
    await asyncio.to_thread(writer.write, bucket, org, batch)
|
||||
|
||||
|
||||
async def flush() -> None:
    """Force a flush of the batched write buffer.

    Meant to be called at the end of a training so that every collected
    metric is persisted. Does nothing when the write API was never created.
    A failing flush does not raise, but it is logged: a failure here means
    buffered metrics may not have been written.
    """
    import logging

    if _write_api is None:
        return
    try:
        await asyncio.to_thread(_write_api.flush)
    except Exception as exc:
        logging.getLogger(__name__).warning("influx flush failed: %s", exc)
|
||||
|
||||
|
||||
async def query_flux(flux: str) -> list[dict]:
    """Run a Flux query in a worker thread and flatten the records.

    Args:
        flux: Flux query text, executed in ``settings.influx_org``.

    Returns:
        One dict per record with ``time`` (ISO-8601 string or ``None``),
        ``measurement``, ``field``, ``value`` and ``tags``. ``tags`` holds
        every record value whose key does not start with ``_`` and is not
        ``result`` or ``table``.
    """
    influx = client()
    excluded_keys = ("result", "table")

    def _record_to_dict(rec) -> dict:
        stamp = rec.get_time()
        tags = {
            key: val
            for key, val in rec.values.items()
            if not key.startswith("_") and key not in excluded_keys
        }
        return {
            "time": stamp.isoformat() if stamp else None,
            "measurement": rec.get_measurement(),
            "field": rec.get_field(),
            "value": rec.get_value(),
            "tags": tags,
        }

    def _run_query() -> list:
        tables = influx.query_api().query(flux, org=settings.influx_org)
        return [_record_to_dict(rec) for table in tables for rec in table.records]

    return await asyncio.to_thread(_run_query)
|
||||
118
ml/core/minio_client.py
Normal file
118
ml/core/minio_client.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Wrapper MinIO: bucket unico (settings.minio_bucket) con prefissi logici.
|
||||
|
||||
Prefissi usati:
|
||||
datasets/<uuid>.<ext>
|
||||
models/<model_id>/spec.yml
|
||||
models/<model_id>/<version>/<patch>/... (artefatti training)
|
||||
trainings/<training_id>/logs.jsonl
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from datetime import timedelta
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
|
||||
from core.config import settings
|
||||
|
||||
|
||||
_client: Optional[Minio] = None
|
||||
|
||||
|
||||
def client() -> Minio:
    """Return the shared MinIO client, building it from settings on first use."""
    global _client
    if _client is not None:
        return _client
    endpoint = f"{settings.minio_endpoint}:{settings.minio_port}"
    _client = Minio(
        endpoint,
        access_key=settings.minio_access_key,
        secret_key=settings.minio_secret_key,
        secure=settings.minio_use_ssl,
    )
    return _client
|
||||
|
||||
|
||||
def _bucket(b: Optional[str] = None) -> str:
|
||||
return b or settings.minio_bucket
|
||||
|
||||
|
||||
def ensure_bucket(bucket: Optional[str] = None) -> None:
    """Create the bucket (default: the configured one) if it does not exist."""
    target = _bucket(bucket)
    minio = client()
    already_there = minio.bucket_exists(target)
    if already_there:
        return
    minio.make_bucket(target)
|
||||
|
||||
|
||||
def put_bytes(key: str, data: bytes, content_type: str = "application/octet-stream",
              bucket: Optional[str] = None) -> None:
    """Upload an in-memory payload under *key*, creating the bucket if needed."""
    ensure_bucket(bucket)
    minio = client()
    target = _bucket(bucket)
    payload = io.BytesIO(data)
    minio.put_object(target, key, payload, length=len(data), content_type=content_type)
|
||||
|
||||
|
||||
def put_stream(key: str, stream, length: int, content_type: str = "application/octet-stream",
               bucket: Optional[str] = None) -> None:
    """Upload *length* bytes read from *stream* under *key*, creating the bucket if needed."""
    ensure_bucket(bucket)
    minio = client()
    target = _bucket(bucket)
    minio.put_object(target, key, stream, length=length, content_type=content_type)
|
||||
|
||||
|
||||
def get_bytes(key: str, bucket: Optional[str] = None) -> bytes:
    """Download the object at *key* fully into memory.

    The response is always closed and its connection released, even when the
    read fails.
    """
    response = client().get_object(_bucket(bucket), key)
    try:
        payload = response.read()
    finally:
        response.close()
        response.release_conn()
    return payload
|
||||
|
||||
|
||||
def remove(key: str, bucket: Optional[str] = None) -> None:
    """Delete one object; ``S3Error`` is deliberately ignored (best-effort)."""
    try:
        minio = client()
        minio.remove_object(_bucket(bucket), key)
    except S3Error:
        return
|
||||
|
||||
|
||||
def remove_prefix(prefix: str, bucket: Optional[str] = None) -> int:
    """Delete every object under *prefix* and return how many deletions succeeded.

    Objects whose removal raises ``S3Error`` are skipped and not counted.
    """
    target = _bucket(bucket)
    deleted = 0
    listing = client().list_objects(target, prefix=prefix, recursive=True)
    for entry in listing:
        try:
            client().remove_object(target, entry.object_name)
        except S3Error:
            continue
        deleted += 1
    return deleted
|
||||
|
||||
|
||||
def presigned_get(key: str, expires_seconds: int = 3600, bucket: Optional[str] = None) -> str:
    """Return a presigned GET URL for *key* valid for *expires_seconds* seconds."""
    minio = client()
    target = _bucket(bucket)
    ttl = timedelta(seconds=expires_seconds)
    return minio.presigned_get_object(target, key, expires=ttl)
|
||||
|
||||
|
||||
def list_prefix(prefix: str, bucket: Optional[str] = None) -> list[dict]:
    """List objects under *prefix* (recursively) as plain dicts.

    Each dict has ``name``, ``size``, ``last_modified`` (ISO string or None)
    and ``etag``.
    """

    def _describe(obj) -> dict:
        modified = obj.last_modified
        return {
            "name": obj.object_name,
            "size": obj.size,
            "last_modified": modified.isoformat() if modified else None,
            "etag": obj.etag,
        }

    listing = client().list_objects(_bucket(bucket), prefix=prefix, recursive=True)
    return [_describe(obj) for obj in listing]
|
||||
|
||||
|
||||
def check() -> bool:
    """Health probe: True when listing buckets succeeds, False on any error."""
    try:
        client().list_buckets()
    except Exception:
        return False
    return True
|
||||
90
ml/core/model_spec.py
Normal file
90
ml/core/model_spec.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Parse e validazione del contratto `model.yml` nelle repo utente.
|
||||
|
||||
Schema sintetico (vedi piano):
|
||||
name, type, version, python
|
||||
train: {entrypoint, inputs, outputs, metrics}
|
||||
test: {entrypoint, io, input_schema[], output_schema[]}
|
||||
resources: {cpu, mem_mb, gpu}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
from core import gitea, redis_client
|
||||
|
||||
|
||||
class _FieldSpec(BaseModel):
    """One entry of ``test.input_schema`` / ``test.output_schema`` in ``model.yml``."""

    name: str
    dtype: str
    # Optional numeric bounds; None when the spec does not declare them.
    min: Optional[float] = None
    max: Optional[float] = None
    # NOTE(review): presumably the unit of measure of the value — confirm.
    unit: Optional[str] = None
|
||||
|
||||
|
||||
class _Train(BaseModel):
    """The ``train`` section of ``model.yml``: entrypoint plus free-form dicts."""

    # Required; the only mandatory key of the section.
    entrypoint: str
    # Free-form mappings, empty when omitted. Their inner schema is not
    # validated here.
    inputs: dict = {}
    outputs: dict = {}
    metrics: dict = {}
|
||||
|
||||
|
||||
class _Test(BaseModel):
    """The optional ``test`` section of ``model.yml``."""

    entrypoint: str
    # I/O protocol identifier; defaults to "stdio_json".
    io: str = "stdio_json"
    # Declared input/output fields, each validated as a _FieldSpec.
    input_schema: list[_FieldSpec] = []
    output_schema: list[_FieldSpec] = []
|
||||
|
||||
|
||||
class ModelSpec(BaseModel):
    """Top-level schema of the ``model.yml`` contract found in user repos."""

    name: str
    type: str
    # Defaults applied when the spec omits these keys.
    version: str = "0.1.0"
    python: str = "3.11"
    # ``train`` is mandatory, ``test`` is optional.
    train: _Train
    test: Optional[_Test] = None
    # Free-form; the module docstring lists cpu, mem_mb and gpu as the
    # expected keys, but they are not validated here.
    resources: dict = {}
|
||||
|
||||
|
||||
def parse_yaml(content: bytes | str) -> dict:
    """Parse a ``model.yml`` document and validate it against ``ModelSpec``.

    Args:
        content: Raw YAML text; bytes are decoded as UTF-8.

    Returns:
        The validated spec as a plain dict (``ModelSpec.model_dump()``).

    Raises:
        ValueError: If the content is not valid UTF-8, is not valid YAML, or
            does not satisfy the ``ModelSpec`` schema. This includes a
            top-level value that is not a mapping.
    """
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    try:
        raw = yaml.safe_load(content) or {}
        # model_validate (rather than ModelSpec(**raw)) turns a non-mapping
        # top level, e.g. a YAML list or scalar, into a ValidationError
        # instead of an uncaught TypeError.
        spec = ModelSpec.model_validate(raw)
        return spec.model_dump()
    except (yaml.YAMLError, ValidationError) as e:
        raise ValueError(f"invalid model.yml: {e}") from e
|
||||
|
||||
|
||||
async def fetch_and_parse_spec(owner_repo: str, ref: str) -> Optional[dict]:
    """Fetch ``model.yml`` (or ``model.yaml``) from the repo at *ref* and parse it.

    The result is cached in Redis under ``ml:modelspec:{repo}:{ref}`` for one
    hour. Cache reads and writes are best-effort. Returns None when neither
    file can be fetched; a parse failure propagates from ``parse_yaml``.
    """
    import json

    cache_key = f"ml:modelspec:{owner_repo}:{ref}"
    try:
        hit = await redis_client.client().get(cache_key)
        if hit:
            return json.loads(hit)
    except Exception:
        pass  # cache unavailable or corrupt entry: fall through to Gitea

    for filename in ("model.yml", "model.yaml"):
        try:
            raw = await gitea.get_file_raw(owner_repo, ref, filename)
        except Exception:
            continue
        break
    else:
        # Neither file name could be fetched.
        return None

    spec = parse_yaml(raw)

    try:
        await redis_client.client().set(cache_key, json.dumps(spec), ex=3600)
    except Exception:
        pass  # a failed cache write must not fail the request
    return spec
|
||||
29
ml/core/redis_client.py
Normal file
29
ml/core/redis_client.py
Normal file
@@ -0,0 +1,29 @@
|
||||
"""Client Redis asincrono (redis-py asyncio). Singleton semplice."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
from core.config import settings
|
||||
|
||||
_client: Optional[redis.Redis] = None
|
||||
|
||||
|
||||
def client() -> redis.Redis:
    """Return the shared async Redis client, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    options = {
        "host": settings.redis_host,
        "port": settings.redis_port,
        "decode_responses": True,
        "health_check_interval": 30,
    }
    _client = redis.Redis(**options)
    return _client
|
||||
|
||||
|
||||
async def close() -> None:
    """Close the shared client, if one exists, and reset the singleton."""
    global _client
    if _client is None:
        return
    await _client.aclose()
    _client = None
|
||||
54
ml/core/worker.py
Normal file
54
ml/core/worker.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Worker loop: BRPOP da ml:queue:train e dispatch al docker_runner.
|
||||
|
||||
Parte N task asincroni concorrenti (settings.train_concurrency).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from core import redis_client
|
||||
from core.config import settings
|
||||
from core.docker_runner import run_training_job
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_tasks: list[asyncio.Task] = []
|
||||
|
||||
|
||||
async def _worker_loop(idx: int):
    """Pop training ids from ``ml:queue:train`` forever and run them one at a time.

    BRPOP errors are logged and retried after 2 seconds. A crash inside a
    training job is logged and the loop moves on to the next id.
    """
    queue = redis_client.client()
    log.info("ml worker[%d] started", idx)
    while True:
        try:
            popped = await queue.brpop("ml:queue:train", timeout=10)
        except Exception as e:
            log.warning("brpop error: %s", e)
            await asyncio.sleep(2)
            continue
        if popped is None:
            # BRPOP timed out with an empty queue: poll again.
            continue
        _queue_name, training_id = popped
        log.info("worker[%d] picked training %s", idx, training_id)
        try:
            await run_training_job(training_id)
        except Exception:
            log.exception("worker[%d] training %s crashed", idx, training_id)
|
||||
|
||||
|
||||
def start_workers() -> None:
    """Spawn the worker tasks: ``settings.train_concurrency`` of them, at least one."""
    count = max(1, settings.train_concurrency)
    _tasks.extend(asyncio.create_task(_worker_loop(i)) for i in range(count))
|
||||
|
||||
|
||||
async def stop_workers() -> None:
    """Cancel every worker task and wait until all of them have finished.

    ``asyncio.CancelledError`` derives from ``BaseException``, so awaiting a
    cancelled task inside ``except Exception`` re-raises it and aborts the
    shutdown. ``gather(..., return_exceptions=True)`` collects each task's
    outcome (cancellation included) instead of raising.
    """
    pending = list(_tasks)
    # Clear first so the registry is reset even if waiting is interrupted.
    _tasks.clear()
    for task in pending:
        task.cancel()
    if pending:
        await asyncio.gather(*pending, return_exceptions=True)
|
||||
95
ml/main.py
95
ml/main.py
@@ -1,19 +1,90 @@
|
||||
from fastapi import FastAPI, Request, Response, Header
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
import time
|
||||
"""ml-service — FastAPI entrypoint.
|
||||
|
||||
Monta:
|
||||
/ → RedirectResponse
|
||||
/datasets /models /train /test /results → pagine Jinja
|
||||
/api/datasets /api/models /api/repos /api/trainings /api/tests /api/results → JSON
|
||||
/api/trainings/{id}/events → SSE
|
||||
/health → check
|
||||
/static/* → file statici
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from core import db, minio_client, redis_client, worker
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
STATIC_DIR = Path(__file__).resolve().parent / "static"
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start the dependencies, then tear them down.

    Startup opens the DB pool, tries to ensure the MinIO bucket (a failure is
    only logged) and starts the training workers. Shutdown runs in a
    ``finally`` block so it still happens when an exception is thrown into
    the generator at ``yield``.
    """
    log.info("ml-service starting")
    await db.init_pool()
    try:
        minio_client.ensure_bucket()
    except Exception as e:
        log.warning("minio bucket ensure failed: %s", e)
    worker.start_workers()
    try:
        yield
    finally:
        log.info("ml-service stopping")
        await worker.stop_workers()
        await db.close_pool()
        await redis_client.close()
|
||||
|
||||
|
||||
app = FastAPI(title="MEB ML Service", lifespan=lifespan)
|
||||
|
||||
# static
|
||||
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
async def health():
|
||||
pg_ok = True
|
||||
try:
|
||||
await db.fetchrow("SELECT 1")
|
||||
except Exception:
|
||||
pg_ok = False
|
||||
redis_ok = True
|
||||
try:
|
||||
await redis_client.client().ping()
|
||||
except Exception:
|
||||
redis_ok = False
|
||||
return {
|
||||
"status": "ok",
|
||||
"status": "ok" if (pg_ok and redis_ok) else "degraded",
|
||||
"service": "ml",
|
||||
"version": "1.0.0",
|
||||
"build_number": "1",
|
||||
"version_state": "dev"
|
||||
"postgres": "connected" if pg_ok else "disconnected",
|
||||
"redis": "connected" if redis_ok else "disconnected",
|
||||
"minio": "connected" if minio_client.check() else "disconnected",
|
||||
"version": "2.0.0",
|
||||
}
|
||||
|
||||
@app.get("/")
|
||||
def root():
|
||||
return {"message": "ML Service"}
|
||||
|
||||
from routers import ( # noqa: E402
|
||||
datasets,
|
||||
models,
|
||||
pages,
|
||||
repos,
|
||||
results,
|
||||
tests,
|
||||
trainings,
|
||||
trainings_stream,
|
||||
)
|
||||
|
||||
app.include_router(pages.router)
|
||||
app.include_router(datasets.router)
|
||||
app.include_router(models.router)
|
||||
app.include_router(repos.router)
|
||||
app.include_router(trainings.router)
|
||||
app.include_router(trainings_stream.router)
|
||||
app.include_router(tests.router)
|
||||
app.include_router(results.router)
|
||||
|
||||
@@ -1,3 +1,15 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
uvicorn[standard]
|
||||
PyJWT
|
||||
asyncpg
|
||||
redis>=5
|
||||
minio
|
||||
influxdb-client
|
||||
docker
|
||||
PyYAML
|
||||
pydantic>=2
|
||||
python-multipart
|
||||
jinja2
|
||||
aiofiles
|
||||
httpx
|
||||
sse-starlette
|
||||
|
||||
160
ml/routers/datasets.py
Normal file
160
ml/routers/datasets.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""API datasets (ml.mebboat.it/api/datasets).
|
||||
|
||||
Upload/list/get/download/delete. Storage:
|
||||
MinIO bucket "ml.datasets" with key "<uuid>.<ext>" (no key prefix)
|
||||
Postgres db "ml" tabella "datasets"
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
|
||||
|
||||
from core import db, minio_client
|
||||
from core.auth import require_auth
|
||||
|
||||
router = APIRouter(prefix="/api/datasets", tags=["datasets"])
|
||||
|
||||
# Bucket MinIO fisso per tutti i dataset (no prefix nelle key).
|
||||
BUCKET = "ml.datasets"
|
||||
_EXT = {"csv": "csv", "json": "json", "netcdf": "nc"}
|
||||
|
||||
|
||||
def _row(r) -> dict:
|
||||
if r is None:
|
||||
return None
|
||||
d = dict(r)
|
||||
# asyncpg ritorna JSONB come dict già; date/time come datetime
|
||||
for k in ("created_at", "updated_at", "start_date", "end_date"):
|
||||
if d.get(k) is not None and hasattr(d[k], "isoformat"):
|
||||
d[k] = d[k].isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@router.get("")
async def list_datasets(
    type: Optional[str] = Query(None),
    tags: Optional[str] = Query(None),
    mine: Optional[int] = Query(None),
    search: Optional[str] = Query(None),
    user=Depends(require_auth),
):
    """List up to 500 datasets, newest first, with optional filters.

    Filters: exact ``type``; ``tags`` (comma-separated, matches any overlap);
    ``mine`` (only rows created by the caller); ``search`` (case-insensitive
    substring on nome or description).
    """
    args: list = []
    where = []

    def _bind(value) -> str:
        # Register a query argument and return its positional placeholder.
        args.append(value)
        return f"${len(args)}"

    if type:
        where.append(f"type = {_bind(type)}")
    if tags:
        tag_arr = [part.strip() for part in tags.split(",") if part.strip()]
        if tag_arr:
            where.append(f"tags && {_bind(tag_arr)}")
    if mine and user.get("username"):
        where.append(f"created_by = {_bind(user['username'])}")
    if search:
        placeholder = _bind(f"%{search}%")
        where.append(f"(nome ILIKE {placeholder} OR description ILIKE {placeholder})")

    sql = "SELECT * FROM datasets"
    if where:
        sql += " WHERE " + " AND ".join(where)
    sql += " ORDER BY created_at DESC LIMIT 500"
    rows = await db.fetch(sql, *args)
    return {"count": len(rows), "datasets": [_row(rec) for rec in rows]}
|
||||
|
||||
|
||||
@router.post("", status_code=201)
async def upload_dataset(
    file: UploadFile = File(...),
    metadata: str = Form("{}"),
    user=Depends(require_auth),
):
    """Store an uploaded file in MinIO and register it in the ``datasets`` table.

    ``metadata`` is a JSON object sent as a form field. The file format comes
    from its ``format`` or ``type`` key; unknown values fall back to ``csv``.

    Raises:
        HTTPException: 400 when ``metadata`` is not valid JSON or is not a
            JSON object.
    """
    try:
        meta = json.loads(metadata or "{}")
    except json.JSONDecodeError:
        raise HTTPException(400, "metadata must be valid JSON")
    # Valid JSON that is not an object (list, number, ...) has no .get().
    if not isinstance(meta, dict):
        raise HTTPException(400, "metadata must be a JSON object")

    fmt = meta.get("format") or meta.get("type") or "csv"
    if fmt not in ("csv", "json", "netcdf"):
        fmt = "csv"
    ext = _EXT[fmt]
    ds_id = str(uuid.uuid4())
    file_key = f"{ds_id}.{ext}"

    data = await file.read()
    minio_client.put_bytes(file_key, data, content_type=file.content_type or "application/octet-stream", bucket=BUCKET)

    created_by = user.get("username") or meta.get("created_by") or "unknown"
    try:
        row = await db.fetchrow(
            """
            INSERT INTO datasets (
                id, file_key, nome, description, tags, type, format, notes,
                created_by, size_bytes, copernicus_id
            ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
            RETURNING *
            """,
            uuid.UUID(ds_id),
            file_key,
            meta.get("nome") or file.filename or file_key,
            meta.get("description"),
            meta.get("tags") or [],
            meta.get("dataset_type") or "custom",
            fmt,
            meta.get("notes"),
            created_by,
            len(data),
            meta.get("copernicus_id") or meta.get("copernicus_dataset_id"),
        )
    except Exception:
        # The object is already in MinIO: remove it so that a failed INSERT
        # does not leave an orphan file with no DB row.
        minio_client.remove(file_key, bucket=BUCKET)
        raise
    return _row(row)
|
||||
|
||||
|
||||
@router.get("/{dataset_id}")
async def get_dataset(dataset_id: str, user=Depends(require_auth)):
    """Return one dataset row.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
|
||||
|
||||
|
||||
@router.get("/{dataset_id}/download")
async def download_dataset(dataset_id: str, user=Depends(require_auth)):
    """Return a presigned MinIO download URL valid for one hour.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT file_key FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    url = minio_client.presigned_get(row["file_key"], 3600, bucket=BUCKET)
    return {"url": url, "expires_in": 3600}
|
||||
|
||||
|
||||
@router.patch("/{dataset_id}")
async def patch_dataset(dataset_id: str, body: dict, user=Depends(require_auth)):
    """Update the editable columns of a dataset (nome, description, tags, notes).

    Keys outside the allow-list are ignored. Column names in the SQL come
    only from that allow-list; values are always bound as parameters.

    Raises:
        HTTPException: 400 when no editable field is supplied; 404 when the
            id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    allowed = {"nome", "description", "tags", "notes"}
    sets = []
    args: list = []
    for k, v in body.items():
        if k in allowed:
            args.append(v)
            sets.append(f"{k} = ${len(args)}")
    if not sets:
        raise HTTPException(400, "no fields to update")
    # There is no updated_at trigger in the DB: bump the column manually.
    sets.append("updated_at = NOW()")
    args.append(ds_uuid)
    row = await db.fetchrow(
        f"UPDATE datasets SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
        *args,
    )
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
|
||||
|
||||
|
||||
@router.delete("/{dataset_id}", status_code=204)
async def delete_dataset(dataset_id: str, user=Depends(require_auth)):
    """Delete a dataset: first the MinIO object, then the DB row.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        ds_uuid = uuid.UUID(dataset_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT file_key FROM datasets WHERE id = $1", ds_uuid)
    if not row:
        raise HTTPException(404, "not found")
    minio_client.remove(row["file_key"], bucket=BUCKET)
    await db.execute("DELETE FROM datasets WHERE id = $1", ds_uuid)
    return None
|
||||
131
ml/routers/models.py
Normal file
131
ml/routers/models.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""API /api/models — registro modelli (repo Gitea + metadata)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from core import db
|
||||
from core.auth import require_auth
|
||||
from core.model_spec import fetch_and_parse_spec
|
||||
|
||||
router = APIRouter(prefix="/api/models", tags=["models"])
|
||||
|
||||
|
||||
def _row(r) -> Optional[dict]:
|
||||
if r is None:
|
||||
return None
|
||||
d = dict(r)
|
||||
for k in ("created_at", "updated_at"):
|
||||
if d.get(k) is not None and hasattr(d[k], "isoformat"):
|
||||
d[k] = d[k].isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@router.get("")
async def list_models(user=Depends(require_auth)):
    """Return up to 500 registered models, newest first."""
    records = await db.fetch("SELECT * FROM models ORDER BY created_at DESC LIMIT 500")
    models = [_row(rec) for rec in records]
    return {"count": len(records), "models": models}
|
||||
|
||||
|
||||
@router.post("", status_code=201)
async def create_model(body: dict, user=Depends(require_auth)):
    """Register a model backed by a Gitea repo.

    ``name``, ``type`` and ``gitea_repo`` are required (400 otherwise).
    ``default_branch`` falls back to ``main``. The model.yml of that branch
    is pre-loaded when possible; any failure there just stores a null spec.
    """
    missing = next((k for k in ("name", "type", "gitea_repo") if not body.get(k)), None)
    if missing is not None:
        raise HTTPException(400, f"missing field: {missing}")

    branch = body.get("default_branch") or "main"

    # Pre-loading the spec is optional: never let it fail the registration.
    try:
        spec = await fetch_and_parse_spec(body["gitea_repo"], branch)
    except Exception:
        spec = None

    created_by = user.get("username") or "unknown"
    row = await db.fetchrow(
        """
        INSERT INTO models (name, type, gitea_repo, default_branch, spec, created_by)
        VALUES ($1,$2,$3,$4,$5,$6)
        RETURNING *
        """,
        body["name"],
        body["type"],
        body["gitea_repo"],
        branch,
        spec,
        created_by,
    )
    return _row(row)
|
||||
|
||||
|
||||
@router.get("/{model_id}")
async def get_model(model_id: str, user=Depends(require_auth)):
    """Return one model row.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
|
||||
|
||||
|
||||
@router.patch("/{model_id}")
async def patch_model(model_id: str, body: dict, user=Depends(require_auth)):
    """Update the editable columns of a model (name, type, default_branch).

    Keys outside the allow-list are ignored. Column names in the SQL come
    only from that allow-list; values are always bound as parameters.

    Raises:
        HTTPException: 400 when no editable field is supplied; 404 when the
            id is malformed or no row matches.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    allowed = {"name", "type", "default_branch"}
    sets = []
    args: list = []
    for k, v in body.items():
        if k in allowed:
            args.append(v)
            sets.append(f"{k} = ${len(args)}")
    if not sets:
        raise HTTPException(400, "no fields to update")
    # NOTE(review): unlike patch_dataset, updated_at is not bumped here —
    # confirm whether the models table maintains it on its own.
    args.append(model_uuid)
    row = await db.fetchrow(
        f"UPDATE models SET {', '.join(sets)} WHERE id = ${len(args)} RETURNING *",
        *args,
    )
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
|
||||
|
||||
|
||||
@router.delete("/{model_id}", status_code=204)
async def delete_model(model_id: str, user=Depends(require_auth)):
    """Delete a model row; answers 204 whether or not a row existed.

    Raises:
        HTTPException: 404 when the id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    await db.execute("DELETE FROM models WHERE id = $1", model_uuid)
    return None
|
||||
|
||||
|
||||
# ── Notes ──────────────────────────────────────────────────────────────────
|
||||
@router.get("/{model_id}/notes")
async def list_notes(model_id: str, user=Depends(require_auth)):
    """Return the notes attached to a model, newest first.

    Raises:
        HTTPException: 404 when the id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    rows = await db.fetch(
        "SELECT id, author, text, created_at FROM model_notes WHERE model_id = $1 ORDER BY created_at DESC",
        model_uuid,
    )
    return [
        {
            "id": str(r["id"]),
            "author": r["author"],
            "text": r["text"],
            "created_at": r["created_at"].isoformat(),
        }
        for r in rows
    ]
|
||||
|
||||
|
||||
@router.post("/{model_id}/notes", status_code=201)
async def add_note(model_id: str, body: dict, user=Depends(require_auth)):
    """Attach a note, authored by the caller, to a model.

    Raises:
        HTTPException: 400 when ``text`` is missing or blank; 404 when the
            id is not a valid UUID.
    """
    try:
        model_uuid = uuid.UUID(model_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    text = (body.get("text") or "").strip()
    if not text:
        raise HTTPException(400, "text required")
    row = await db.fetchrow(
        "INSERT INTO model_notes (model_id, author, text) VALUES ($1, $2, $3) RETURNING *",
        model_uuid,
        user.get("username") or "unknown",
        text,
    )
    return {
        "id": str(row["id"]),
        "author": row["author"],
        "text": row["text"],
        "created_at": row["created_at"].isoformat(),
    }
|
||||
75
ml/routers/pages.py
Normal file
75
ml/routers/pages.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Pagine HTML servite direttamente da ml.mebboat.it.
|
||||
|
||||
Layout:
|
||||
/ redirect a /datasets (o landing console)
|
||||
/datasets lista/upload dataset
|
||||
/models registro modelli
|
||||
/train avvia training
|
||||
/test esegue test su modello trainato
|
||||
/results storico e confronto risultati
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from core.auth import _verify
|
||||
from core.config import settings
|
||||
|
||||
router = APIRouter(tags=["pages"])
|
||||
|
||||
TEMPLATES_DIR = Path(__file__).resolve().parent.parent / "templates"
|
||||
templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
|
||||
|
||||
|
||||
def _user_or_redirect(request: Request):
    """Authenticate a page request, or build a redirect to the login page.

    The token is read from the ``auth_token`` cookie, falling back to an
    ``Authorization: Bearer`` header.

    Returns:
        The user dict when the token verifies, otherwise a 302
        ``RedirectResponse`` to the login URL carrying the current URL in
        the ``redirect`` query parameter.
    """
    from urllib.parse import quote

    token = request.cookies.get("auth_token")
    auth = request.headers.get("authorization")
    if not token and auth and auth.startswith("Bearer "):
        token = auth[7:]
    user = _verify(token)
    if not user:
        # Percent-encode the target: a raw URL with its own "?" or "&" would
        # be cut short or leak parameters into the login URL.
        target = quote(str(request.url), safe="")
        return RedirectResponse(url=f"{settings.auth_login_url}?redirect={target}", status_code=302)
    return user
|
||||
|
||||
|
||||
def _render(request: Request, template: str, **ctx):
    """Render *template* for an authenticated user, or return the login redirect."""
    outcome = _user_or_redirect(request)
    if not isinstance(outcome, RedirectResponse):
        context = {"request": request, "user": outcome, **ctx}
        return templates.TemplateResponse(template, context)
    return outcome
|
||||
|
||||
|
||||
@router.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Redirect the site root to the datasets page."""
    destination = "/datasets"
    return RedirectResponse(url=destination)
|
||||
|
||||
|
||||
@router.get("/datasets", response_class=HTMLResponse)
async def page_datasets(request: Request):
    """Serve the datasets page, or the login redirect when unauthenticated."""
    response = _render(request, "datasets.html", page="datasets")
    return response
|
||||
|
||||
|
||||
@router.get("/models", response_class=HTMLResponse)
async def page_models(request: Request):
    """Serve the models page, or the login redirect when unauthenticated."""
    response = _render(request, "models.html", page="models")
    return response
|
||||
|
||||
|
||||
@router.get("/train", response_class=HTMLResponse)
async def page_train(request: Request):
    """Serve the training page, or the login redirect when unauthenticated."""
    response = _render(request, "train.html", page="train")
    return response
|
||||
|
||||
|
||||
@router.get("/test", response_class=HTMLResponse)
async def page_test(request: Request):
    """Serve the test page, or the login redirect when unauthenticated."""
    response = _render(request, "test.html", page="test")
    return response
|
||||
|
||||
|
||||
@router.get("/results", response_class=HTMLResponse)
async def page_results(request: Request):
    """Serve the results page, or the login redirect when unauthenticated."""
    response = _render(request, "results.html", page="results")
    return response
|
||||
51
ml/routers/repos.py
Normal file
51
ml/routers/repos.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""API /api/repos — proxy autenticato verso Gitea."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from core import gitea
|
||||
from core.auth import require_auth
|
||||
from core.model_spec import fetch_and_parse_spec
|
||||
|
||||
router = APIRouter(prefix="/api/repos", tags=["repos"])
|
||||
|
||||
|
||||
@router.get("")
async def list_repos(user=Depends(require_auth)):
    """Proxy the Gitea repository list; any upstream failure becomes a 502."""
    try:
        repos = await gitea.list_repos()
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return repos
|
||||
|
||||
|
||||
@router.get("/{owner}/{repo}/branches")
async def branches(owner: str, repo: str, user=Depends(require_auth)):
    """Proxy the branch list of ``owner/repo``; any upstream failure becomes a 502."""
    full_name = f"{owner}/{repo}"
    try:
        listing = await gitea.list_branches(full_name)
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return listing
|
||||
|
||||
|
||||
@router.get("/{owner}/{repo}/commits")
async def commits(owner: str, repo: str, branch: str = Query("main"), user=Depends(require_auth)):
    """Proxy the commit list of a branch (default ``main``); upstream failure becomes a 502."""
    full_name = f"{owner}/{repo}"
    try:
        history = await gitea.list_commits(full_name, branch)
    except Exception as e:
        raise HTTPException(502, f"gitea: {e}")
    return history
|
||||
|
||||
|
||||
@router.get("/{owner}/{repo}/file")
async def file_raw(owner: str, repo: str, ref: str, path: str, user=Depends(require_auth)):
    """Return a repo file at *ref* as text plus its size in bytes.

    The content is decoded as UTF-8 with undecodable bytes replaced. Any
    failure while fetching or decoding is reported as a 404.
    """
    full_name = f"{owner}/{repo}"
    try:
        blob = await gitea.get_file_raw(full_name, ref, path)
        text = blob.decode("utf-8", errors="replace")
        payload = {"content": text, "size": len(blob)}
    except Exception as e:
        raise HTTPException(404, f"file not found: {e}")
    return payload
|
||||
|
||||
|
||||
@router.get("/{owner}/{repo}/spec")
async def spec(owner: str, repo: str, ref: str = Query("main"), user=Depends(require_auth)):
    """Return the parsed model.yml of ``owner/repo`` at *ref* (default ``main``).

    Raises:
        HTTPException: 404 when no model.yml / model.yaml can be fetched at
            the ref; 422 when the file exists but fails validation.
    """
    try:
        s = await fetch_and_parse_spec(f"{owner}/{repo}", ref)
    except ValueError as e:
        # parse_yaml signals an invalid spec with ValueError: report it as a
        # client-visible validation error instead of a 500.
        raise HTTPException(422, str(e)) from e
    if s is None:
        raise HTTPException(404, "model.yml not found at ref")
    return s
|
||||
89
ml/routers/results.py
Normal file
89
ml/routers/results.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""API /api/results — lista trainings/tests + compare multi-training."""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from core import db, influx_client
|
||||
from core.auth import require_auth
|
||||
from core.config import settings
|
||||
|
||||
router = APIRouter(prefix="/api/results", tags=["results"])
|
||||
|
||||
|
||||
def _row(r):
|
||||
if r is None:
|
||||
return None
|
||||
d = dict(r)
|
||||
for k in ("queued_at", "started_at", "finished_at", "started_at", "ended_at"):
|
||||
if d.get(k) is not None and hasattr(d[k], "isoformat"):
|
||||
d[k] = d[k].isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@router.get("")
async def list_results(
    model_id: Optional[str] = Query(None),
    user=Depends(require_auth),
):
    """List up to 200 trainings, most recently finished first.

    Args:
        model_id: Optional model UUID used to filter the trainings.

    Raises:
        HTTPException: 400 when ``model_id`` is not a valid UUID.
    """
    where = []
    args: list = []
    if model_id:
        try:
            args.append(uuid.UUID(model_id))
        except ValueError:
            # A malformed filter is a client error, not a server crash.
            raise HTTPException(400, "invalid model_id") from None
        where.append(f"model_id = ${len(args)}")
    sql = "SELECT * FROM trainings"
    if where:
        sql += " WHERE " + " AND ".join(where)
    sql += " ORDER BY finished_at DESC NULLS LAST, queued_at DESC LIMIT 200"
    rows = await db.fetch(sql, *args)
    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
|
||||
|
||||
|
||||
async def _load_timeseries(training_uuid: uuid.UUID) -> list:
    """Fetch the last 90 days of ``ml_training`` points for one training.

    Best-effort: any InfluxDB error yields an empty list. The id is rendered
    in canonical UUID form, so no caller-supplied text reaches the Flux query.
    NOTE(review): presumably points are tagged with the canonical id string —
    confirm against the writer.
    """
    flux = (
        f'from(bucket:"{settings.influx_bucket}") '
        f'|> range(start:-90d) '
        f'|> filter(fn: (r) => r._measurement == "ml_training" and r.training_id == "{training_uuid}")'
    )
    try:
        return await influx_client.query_flux(flux)
    except Exception:
        return []


# Must be registered BEFORE "/{training_id}": routes are matched in
# declaration order, and the parametrised route would otherwise capture
# "/compare" as a training id.
@router.get("/compare")
async def compare(
    trainings: str = Query(..., description="comma-separated training IDs"),
    user=Depends(require_auth),
):
    """Return row and timeseries for each of several trainings.

    Malformed or unknown ids are skipped silently.

    Raises:
        HTTPException: 400 when fewer than two ids are supplied.
    """
    ids = [s.strip() for s in trainings.split(",") if s.strip()]
    if len(ids) < 2:
        raise HTTPException(400, "at least 2 training IDs required")
    out = []
    for tid in ids:
        try:
            tid_uuid = uuid.UUID(tid)
        except ValueError:
            continue
        row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", tid_uuid)
        if not row:
            continue
        ts = await _load_timeseries(tid_uuid)
        out.append({"training": _row(row), "timeseries": ts})
    return {"results": out}


@router.get("/{training_id}")
async def get_result(training_id: str, user=Depends(require_auth)):
    """Return one training row plus its InfluxDB timeseries.

    Raises:
        HTTPException: 404 when the id is malformed or no row matches.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # A malformed id cannot match any row: answer 404 instead of a 500.
        raise HTTPException(404, "not found") from None
    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")
    ts = await _load_timeseries(training_uuid)
    return {"training": _row(row), "timeseries": ts}
|
||||
109
ml/routers/tests.py
Normal file
109
ml/routers/tests.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""API /api/tests — sessioni di test su training esistente (max 2 utenti simultanei)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from core import api_client, db, minio_client
|
||||
from core.auth import require_auth
|
||||
from core.docker_runner import run_test_once
|
||||
|
||||
router = APIRouter(prefix="/api/tests", tags=["tests"])
|
||||
|
||||
|
||||
def _row(r):
|
||||
if r is None:
|
||||
return None
|
||||
d = dict(r)
|
||||
for k in ("started_at", "ended_at"):
|
||||
if d.get(k) is not None and hasattr(d[k], "isoformat"):
|
||||
d[k] = d[k].isoformat()
|
||||
return d
|
||||
|
||||
|
||||
@router.post("/sessions", status_code=201)
async def start_session(body: dict, user=Depends(require_auth)):
    """Open a test session on a succeeded training.

    A test slot is reserved through ``api_client.page_connect`` before the
    session row is inserted.

    Raises:
        HTTPException: 400 when ``training_id`` is missing or not a valid
            UUID; 404 when the training does not exist; 409 when it has not
            succeeded; 429 when all test slots are taken; 502 on any other
            upstream HTTP error.
    """
    training_id = body.get("training_id")
    if not training_id:
        raise HTTPException(400, "training_id required")
    try:
        # str() first: a non-string JSON value (e.g. a number) would make
        # uuid.UUID raise AttributeError instead of ValueError.
        training_uuid = uuid.UUID(str(training_id))
    except ValueError:
        raise HTTPException(400, "invalid training_id") from None

    tr = await db.fetchrow(
        "SELECT id, status FROM trainings WHERE id = $1", training_uuid
    )
    if not tr:
        raise HTTPException(404, "training not found")
    if tr["status"] != "succeeded":
        raise HTTPException(409, "training not completed")

    sid = str(uuid.uuid4())
    try:
        await api_client.page_connect("test", user.get("username") or "unknown", sid)
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 429:
            raise HTTPException(429, "test slots full (max 2 users)")
        raise HTTPException(502, f"api: {e}")

    row = await db.fetchrow(
        "INSERT INTO tests (id, training_id, user_id) VALUES ($1,$2,$3) RETURNING *",
        uuid.UUID(sid),
        training_uuid,
        user.get("username") or "unknown",
    )
    return _row(row)
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/ping")
async def ping_session(session_id: str, user=Depends(require_auth)):
    """Forward a ping for ``session_id`` via ``api_client.page_ping``.

    An HTTP error status from the upstream call is relayed to the caller
    with the same status code and response text.
    """
    try:
        await api_client.page_ping(session_id)
    except httpx.HTTPStatusError as upstream_error:
        upstream = upstream_error.response
        raise HTTPException(upstream.status_code, upstream.text)
    else:
        return {"ok": True}
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/runs", status_code=201)
async def run_test(session_id: str, body: dict, user=Depends(require_auth)):
    """Execute one test run inside an open session and append it to the log.

    ``body["inputs"]`` (default ``{}``) is handed to ``run_test_once`` along
    with the session's training ID. The run record is appended to the
    session's ``runs`` JSONB column and returned.

    Raises:
        HTTPException: 400 if ``session_id`` is not a UUID, 404 if the
            session does not exist, 409 if it has already been ended,
            500 if the run itself fails.
    """
    try:
        session_uuid = uuid.UUID(session_id)
    except ValueError:
        raise HTTPException(400, "invalid session_id") from None

    row = await db.fetchrow("SELECT * FROM tests WHERE id = $1", session_uuid)
    if not row:
        raise HTTPException(404, "session not found")
    if row["ended_at"] is not None:
        # end_session has already released the slot for this session.
        raise HTTPException(409, "session already ended")

    inputs = body.get("inputs") or {}
    t0 = time.monotonic()
    try:
        result = await run_test_once(str(row["training_id"]), inputs)
    except Exception as e:
        raise HTTPException(500, f"test run failed: {e}") from e
    dt_ms = int((time.monotonic() - t0) * 1000)

    run = {
        "inputs": inputs,
        "outputs": result.get("outputs", {}),
        "duration_ms": dt_ms,
        "cpu_peak": result.get("cpu_peak"),
        "mem_peak_mb": result.get("mem_peak_mb"),
        "ts": time.time(),
    }
    # Wrapped in a list so that `||` appends one element to the JSONB array.
    await db.execute(
        "UPDATE tests SET runs = runs || $1::jsonb WHERE id = $2",
        json.dumps([run]),
        session_uuid,
    )
    return run
|
||||
|
||||
|
||||
@router.delete("/sessions/{session_id}", status_code=204)
async def end_session(session_id: str, user=Depends(require_auth)):
    """Mark a session as ended and release its page slot.

    ``ended_at`` is only set when it is still NULL, so repeating the call
    does not move the timestamp. Releasing the slot is best-effort: a
    failure is logged and does not fail the request.

    Raises:
        HTTPException: 400 if ``session_id`` is not a UUID.
    """
    try:
        session_uuid = uuid.UUID(session_id)
    except ValueError:
        raise HTTPException(400, "invalid session_id") from None

    await db.execute(
        "UPDATE tests SET ended_at = NOW() WHERE id = $1 AND ended_at IS NULL",
        session_uuid,
    )
    try:
        await api_client.page_disconnect(session_id)
    except Exception as e:
        # Still best-effort, but leave a trace instead of hiding the failure.
        import logging

        logging.warning("page_disconnect failed for session %s: %s", session_id, e)
    return None
|
||||
129
ml/routers/trainings.py
Normal file
129
ml/routers/trainings.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""API /api/trainings — enqueue, list, get, artifacts."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from core import db, minio_client, redis_client, api_client
|
||||
from core.auth import require_auth
|
||||
|
||||
router = APIRouter(prefix="/api/trainings", tags=["trainings"])
|
||||
|
||||
|
||||
def _row(r) -> Optional[dict]:
    """Convert a DB record into a plain dict with ISO-formatted timestamps.

    ``None`` is passed through. ``queued_at``, ``started_at`` and
    ``finished_at`` are replaced by their ``isoformat()`` text when they are
    set and expose that method; all other values are copied unchanged.
    """
    if r is None:
        return None
    data = dict(r)
    iso_stamps = {
        key: data[key].isoformat()
        for key in ("queued_at", "started_at", "finished_at")
        if data.get(key) is not None and hasattr(data[key], "isoformat")
    }
    # Only keys already present are updated, so the column order is kept.
    data.update(iso_stamps)
    return data
|
||||
|
||||
|
||||
@router.get("")
async def list_trainings(
    model_id: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
    limit: int = Query(100, ge=0, le=500),
    user=Depends(require_auth),
):
    """List trainings, newest ``queued_at`` first, with optional filters.

    Args:
        model_id: Only return trainings of this model (UUID text).
        status: Only return trainings with exactly this status.
        limit: Maximum number of rows, 0 to 500. The lower bound keeps a
            negative value from reaching the SQL ``LIMIT`` clause.

    Returns:
        ``{"count": <n>, "trainings": [...]}``.

    Raises:
        HTTPException: 400 if ``model_id`` is not a valid UUID.
    """
    where = []
    args: list = []
    if model_id:
        try:
            args.append(uuid.UUID(model_id))
        except ValueError:
            raise HTTPException(400, "invalid model_id") from None
        where.append(f"model_id = ${len(args)}")
    if status:
        args.append(status)
        where.append(f"status = ${len(args)}")

    # Only positional placeholders are formatted into the SQL text; every
    # value travels as a bound argument.
    sql = "SELECT * FROM trainings"
    if where:
        sql += " WHERE " + " AND ".join(where)
    args.append(limit)
    sql += f" ORDER BY queued_at DESC LIMIT ${len(args)}"
    rows = await db.fetch(sql, *args)
    return {"count": len(rows), "trainings": [_row(r) for r in rows]}
|
||||
|
||||
|
||||
@router.post("", status_code=202)
async def enqueue_training(body: dict, user=Depends(require_auth)):
    """Create a training row in status ``queued`` and enqueue it.

    Required body fields: ``model_id``, ``version``, ``patch``,
    ``dataset_id``. After the row is inserted, a job is registered on the
    api-service (non-fatal on failure) and the training ID is pushed onto
    the Redis list ``ml:queue:train``.

    Returns:
        The inserted ``trainings`` row as a dict.

    Raises:
        HTTPException: 400 on a missing field or malformed UUID, 404 if the
            model or dataset does not exist, 409 if the insert fails.
    """
    for k in ("model_id", "version", "patch", "dataset_id"):
        if not body.get(k):
            raise HTTPException(400, f"missing field: {k}")

    try:
        # str() so that non-string JSON values fail as ValueError too.
        model_uuid = uuid.UUID(str(body["model_id"]))
        dataset_uuid = uuid.UUID(str(body["dataset_id"]))
    except ValueError:
        raise HTTPException(400, "model_id and dataset_id must be valid UUIDs") from None

    model_row = await db.fetchrow("SELECT * FROM models WHERE id = $1", model_uuid)
    if not model_row:
        raise HTTPException(404, "model not found")

    ds_row = await db.fetchrow("SELECT id FROM datasets WHERE id = $1", dataset_uuid)
    if not ds_row:
        raise HTTPException(404, "dataset not found")

    username = user.get("username") or "unknown"
    try:
        training_row = await db.fetchrow(
            """
            INSERT INTO trainings (model_id, version, patch, dataset_id, started_by, status)
            VALUES ($1,$2,$3,$4,$5,'queued')
            RETURNING *
            """,
            model_uuid,
            body["version"],
            body["patch"],
            dataset_uuid,
            username,
        )
    except Exception as e:
        raise HTTPException(409, f"training already exists or invalid: {e}") from e

    training_id = str(training_row["id"])

    # Create the job on the api-service side (cross-service registry).
    try:
        await api_client.create_job(
            "train",
            created_by=username,
            payload={
                "training_id": training_id,
                "model_id": body["model_id"],
                "version": body["version"],
                "patch": body["patch"],
                "dataset_id": body["dataset_id"],
            },
        )
    except Exception as e:
        # Non-fatal: the local worker can still proceed; log and continue.
        import logging

        logging.warning("create_job failed: %s", e)

    # Write the status hash BEFORE pushing the ID onto the queue. Otherwise
    # a consumer that pops the ID immediately could have its own status
    # update overwritten by this "queued" write.
    # NOTE(review): presumably the worker updates this same hash — confirm.
    r = redis_client.client()
    status_key = f"ml:train:{training_id}"
    await r.hset(
        status_key,
        mapping={"status": "queued", "progress": "0", "message": "queued"},
    )
    await r.expire(status_key, 48 * 3600)
    # Enqueue in Redis (the local worker picks it up).
    await r.lpush("ml:queue:train", training_id)

    return _row(training_row)
|
||||
|
||||
|
||||
@router.get("/{training_id}")
async def get_training(training_id: str, user=Depends(require_auth)):
    """Return a single training row as a dict.

    Raises:
        HTTPException: 400 if ``training_id`` is not a valid UUID,
            404 if no such training exists.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # Without this the ValueError would surface as a 500.
        raise HTTPException(400, "invalid training_id") from None
    row = await db.fetchrow("SELECT * FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")
    return _row(row)
|
||||
|
||||
|
||||
@router.get("/{training_id}/artifacts")
async def list_artifacts(training_id: str, user=Depends(require_auth)):
    """List the stored artifacts of a training, each with a download URL.

    Objects under ``<artifacts_prefix>/`` are listed through
    ``minio_client.list_prefix`` and each gets a ``url`` entry from
    ``minio_client.presigned_get(name, 3600)``.

    Raises:
        HTTPException: 400 if ``training_id`` is not a valid UUID, 404 if
            the training does not exist or has no ``artifacts_prefix``.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        # Without this the ValueError would surface as a 500.
        raise HTTPException(400, "invalid training_id") from None
    row = await db.fetchrow(
        "SELECT artifacts_prefix FROM trainings WHERE id = $1", training_uuid
    )
    if not row or not row["artifacts_prefix"]:
        raise HTTPException(404, "no artifacts")
    objs = minio_client.list_prefix(row["artifacts_prefix"] + "/")
    for o in objs:
        o["url"] = minio_client.presigned_get(o["name"], 3600)
    return objs
|
||||
64
ml/routers/trainings_stream.py
Normal file
64
ml/routers/trainings_stream.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""SSE endpoint per live progress del training.
|
||||
|
||||
GET /api/trainings/{id}/events
|
||||
Streamma eventi dal Redis stream `ml:train:{id}:events` via Server-Sent Events.
|
||||
Termina quando lo stato del training è terminale (succeeded/failed/cancelled).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sse_starlette.sse import EventSourceResponse
|
||||
|
||||
from core import db, redis_client
|
||||
from core.auth import require_auth
|
||||
|
||||
router = APIRouter(prefix="/api/trainings", tags=["trainings-sse"])
|
||||
|
||||
_TERMINAL = {"succeeded", "failed", "cancelled"}
|
||||
|
||||
|
||||
@router.get("/{training_id}/events")
async def training_events(training_id: str, user=Depends(require_auth)):
    """Stream a training's progress events as Server-Sent Events.

    Entries of the Redis stream ``ml:train:{id}:events`` are forwarded as
    ``message`` events, starting from the beginning of the stream. The
    response finishes with an ``end`` event once the training status is
    terminal, or once the training row no longer exists.

    Raises:
        HTTPException: 400 if ``training_id`` is not a valid UUID,
            404 if no such training exists.
    """
    try:
        training_uuid = uuid.UUID(training_id)
    except ValueError:
        raise HTTPException(400, "invalid training_id") from None

    # Check that the training exists.
    row = await db.fetchrow("SELECT status FROM trainings WHERE id = $1", training_uuid)
    if not row:
        raise HTTPException(404, "not found")

    # Build the keys from the canonical UUID text so that alternative
    # spellings of the same ID (uppercase, braces) address the same keys.
    canonical_id = str(training_uuid)
    stream_key = f"ml:train:{canonical_id}:events"
    status_key = f"ml:train:{canonical_id}"

    def _messages(resp):
        """Yield one SSE ``message`` dict per stream entry in an XREAD reply."""
        for _stream, entries in resp:
            for entry_id, fields in entries:
                yield {"event": "message", "id": entry_id, "data": json.dumps(fields)}

    async def gen():
        last_id = "0-0"
        r = redis_client.client()
        while True:
            try:
                # XREAD blocks for 5s so the connection is not idle too long.
                resp = await r.xread({stream_key: last_id}, count=50, block=5000)
            except Exception as e:
                yield {"event": "error", "data": json.dumps({"error": str(e)})}
                await asyncio.sleep(1)
                continue

            for msg in _messages(resp or []):
                last_id = msg["id"]
                yield msg

            # Check for a terminal status.
            state = await r.hget(status_key, "status")
            if not state:
                # Fall back to the DB if the Redis key has expired.
                db_row = await db.fetchrow(
                    "SELECT status FROM trainings WHERE id = $1", training_uuid
                )
                if db_row is None:
                    # The row vanished: nothing will ever turn terminal, so
                    # end the stream instead of polling forever.
                    yield {"event": "end", "data": json.dumps({"status": "unknown"})}
                    return
                state = db_row["status"]
            if state not in _TERMINAL:
                continue

            # Drain whatever is still in the stream before closing: entries
            # beyond the 50-entry batch, or written between the XREAD above
            # and the status check, would otherwise be lost.
            try:
                while True:
                    tail = await r.xread({stream_key: last_id}, count=50)
                    if not tail:
                        break
                    for msg in _messages(tail):
                        last_id = msg["id"]
                        yield msg
            except Exception:
                pass  # best-effort drain; still deliver the final "end"
            yield {"event": "end", "data": json.dumps({"status": state})}
            return

    return EventSourceResponse(gen())
|
||||
18
ml/runner/Dockerfile
Normal file
18
ml/runner/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
||||
# Base image of the ML runner container.
FROM python:3.11-slim

# System packages; the apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      build-essential \
      git \
 && rm -rf /var/lib/apt/lists/*

# Python libraries baked into the image.
RUN pip install --no-cache-dir \
      matplotlib \
      numpy \
      pandas \
      pyyaml \
      scikit-learn \
      xgboost

# Install the SDK so it is importable as `meb_ml`.
COPY sdk.py /opt/meb/meb_ml.py
ENV PYTHONPATH=/opt/meb

WORKDIR /workdir
CMD ["bash"]
|
||||
80
ml/runner/sdk.py
Normal file
80
ml/runner/sdk.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""meb_ml — SDK importabile dal codice utente dentro il container runner.
|
||||
|
||||
API:
|
||||
from meb_ml import emit_metric, emit_series, emit_matrix, emit_log, save_artifact
|
||||
|
||||
emit_metric(iter=10, loss=0.23)
|
||||
emit_series("roc_curve", x=fpr, y=tpr, kind="line")
|
||||
emit_matrix("confusion", labels=[...], values=[[...],[...]])
|
||||
emit_log("info", "epoch done")
|
||||
|
||||
Scrive righe JSON su stdout; il parent (ml-service) le inoltra su Redis/Influx.
|
||||
Per risultati finali scrivere `out/metrics.json` con:
|
||||
{"metrics": {...}, "plots": {"loss_curve": {"x": [...], "y": [...]}, ...}}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Sequence
|
||||
|
||||
|
||||
def _print(obj: dict) -> None:
    """Write ``obj`` to stdout as one JSON line and flush right away.

    Values the JSON encoder cannot serialize natively are passed through
    ``float`` (the ``default`` hook of ``json.dumps``).
    """
    line = json.dumps(obj, default=float)
    stream = sys.stdout
    stream.write(f"{line}\n")
    stream.flush()
|
||||
|
||||
|
||||
def emit_metric(**fields: Any) -> None:
    """Emit a ``metric`` message carrying the given keyword fields.

    The fields are merged after the ``type`` key, so a field that is itself
    named ``type`` replaces the message type.
    """
    payload = {"type": "metric"}
    payload.update(fields)
    _print(payload)
|
||||
|
||||
|
||||
def emit_series(name: str, x: Sequence, y: Sequence, kind: str = "line") -> None:
    """Emit a ``series`` message with the x/y values copied into lists.

    Args:
        name: Name of the series.
        x: X values; materialized with ``list()``.
        y: Y values; materialized with ``list()``.
        kind: Value of the ``kind`` field, ``"line"`` by default.
    """
    payload = {"type": "series", "name": name, "kind": kind}
    payload["x"] = list(x)
    payload["y"] = list(y)
    _print(payload)
|
||||
|
||||
|
||||
def emit_matrix(name: str, labels: Sequence, values: Sequence[Sequence]) -> None:
    """Emit a ``matrix`` message with labels and rows copied into lists.

    Args:
        name: Name of the matrix.
        labels: Labels; materialized with ``list()``.
        values: Rows of the matrix; each row is materialized with ``list()``.
    """
    header = list(labels)
    rows = [list(row) for row in values]
    _print({"type": "matrix", "name": name, "labels": header, "values": rows})
|
||||
|
||||
|
||||
def emit_log(level: str, message: str) -> None:
    """Emit a ``log`` message with the given level and text."""
    record = dict(type="log", level=level, message=message)
    _print(record)
|
||||
|
||||
|
||||
def save_artifact(path: str) -> str:
    """Copy ``path`` into the artifacts directory and return the destination.

    The directory comes from ``MEB_ARTIFACTS_DIR`` (default
    ``/workdir/out``) and is created if missing. The copy keeps the source
    file name and overwrites an existing file of that name.

    The file is copied in chunks rather than read fully into memory, so
    large artifacts do not inflate the process's memory use.

    Raises:
        OSError: If the source cannot be read or the destination written.
    """
    import shutil

    dest_dir = Path(os.environ.get("MEB_ARTIFACTS_DIR", "/workdir/out"))
    dest_dir.mkdir(parents=True, exist_ok=True)
    src = Path(path)
    dest = dest_dir / src.name
    try:
        shutil.copyfile(src, dest)
    except shutil.SameFileError:
        # The file already lives in the artifacts directory: nothing to copy.
        pass
    return str(dest)
|
||||
|
||||
|
||||
def dataset_path() -> str:
    """Return the value of the ``MEB_DATASET_PATH`` environment variable.

    Raises:
        KeyError: If the variable is not set.
    """
    environment = os.environ
    return environment["MEB_DATASET_PATH"]
|
||||
|
||||
|
||||
def artifacts_dir() -> str:
    """Return ``MEB_ARTIFACTS_DIR``, or ``/workdir/out`` when it is unset."""
    return os.getenv("MEB_ARTIFACTS_DIR", "/workdir/out")
|
||||
|
||||
|
||||
def read_test_input() -> dict:
    """Read one line from stdin and parse it as JSON (for test scripts)."""
    raw_line = sys.stdin.readline()
    return json.loads(raw_line)
|
||||
|
||||
|
||||
def write_test_output(outputs: dict) -> None:
    """Emit a ``result`` message carrying ``outputs``."""
    message = {"type": "result"}
    message["outputs"] = outputs
    _print(message)
|
||||
146
ml/static/styles/ml.css
Normal file
146
ml/static/styles/ml.css
Normal file
@@ -0,0 +1,146 @@
|
||||
.ml-nav {
|
||||
display: flex;
|
||||
gap: 16px;
|
||||
align-items: center;
|
||||
}
|
||||
.ml-nav a {
|
||||
text-decoration: none;
|
||||
color: var(--text-secondary);
|
||||
font-weight: 600;
|
||||
padding: 8px 12px;
|
||||
border-radius: var(--radius-md);
|
||||
transition: all 0.2s ease;
|
||||
}
|
||||
.ml-nav a:hover { background: var(--accent-light); color: var(--accent-color); }
|
||||
.ml-nav a.active { background: var(--accent-light); color: var(--accent-color); }
|
||||
|
||||
.container {
|
||||
max-width: 1200px;
|
||||
margin: 24px auto;
|
||||
padding: 0 24px;
|
||||
}
|
||||
|
||||
.page-head {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.page-head h2 { font-size: 1.5rem; }
|
||||
|
||||
.list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
}
|
||||
.list .item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 12px 16px;
|
||||
border: 1px solid var(--header-border);
|
||||
border-radius: var(--radius-lg);
|
||||
background: #fff;
|
||||
transition: box-shadow 0.12s ease;
|
||||
}
|
||||
.list .item:hover { box-shadow: var(--shadow-md); }
|
||||
.list .meta { color: var(--text-secondary); font-size: 0.85rem; }
|
||||
|
||||
.form-row {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 12px;
|
||||
align-items: flex-end;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.form-row label {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
font-size: 0.85rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
.form-row input, .form-row select, .form-row textarea {
|
||||
padding: 8px 12px;
|
||||
border: 1px solid var(--header-border);
|
||||
border-radius: var(--radius-md);
|
||||
font-family: inherit;
|
||||
}
|
||||
|
||||
.hidden { display: none !important; }
|
||||
|
||||
.queue-info {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-secondary);
|
||||
padding: 6px 12px;
|
||||
background: var(--accent-light);
|
||||
border-radius: var(--radius-md);
|
||||
}
|
||||
|
||||
.charts {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 16px;
|
||||
margin: 16px 0;
|
||||
}
|
||||
|
||||
.logs {
|
||||
background: #0f172a;
|
||||
color: #cbd5e1;
|
||||
padding: 12px;
|
||||
border-radius: var(--radius-md);
|
||||
font-family: ui-monospace, monospace;
|
||||
font-size: 0.8rem;
|
||||
max-height: 320px;
|
||||
overflow: auto;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
.detail {
|
||||
border: 1px solid var(--header-border);
|
||||
border-radius: var(--radius-lg);
|
||||
padding: 16px;
|
||||
margin-top: 16px;
|
||||
background: #fff;
|
||||
position: relative;
|
||||
}
|
||||
.detail #btn-close-detail {
|
||||
position: absolute;
|
||||
top: 8px;
|
||||
right: 8px;
|
||||
padding: 4px 10px;
|
||||
}
|
||||
|
||||
dialog {
|
||||
border: 1px solid var(--header-border);
|
||||
border-radius: var(--radius-lg);
|
||||
padding: 24px;
|
||||
width: min(500px, 90vw);
|
||||
}
|
||||
dialog form { display: flex; flex-direction: column; gap: 12px; }
|
||||
dialog label { display: flex; flex-direction: column; gap: 4px; font-size: 0.85rem; }
|
||||
dialog menu { display: flex; justify-content: flex-end; gap: 8px; margin-top: 16px; padding: 0; }
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-top: 12px;
|
||||
}
|
||||
th, td { padding: 8px 12px; border-bottom: 1px solid var(--header-border); text-align: left; font-size: 0.9rem; }
|
||||
|
||||
code {
|
||||
font-family: ui-monospace, monospace;
|
||||
background: #f1f5f9;
|
||||
padding: 2px 6px;
|
||||
border-radius: 4px;
|
||||
font-size: 0.85em;
|
||||
}
|
||||
|
||||
pre {
|
||||
background: #f8fafc;
|
||||
padding: 12px;
|
||||
border-radius: var(--radius-md);
|
||||
overflow: auto;
|
||||
font-family: ui-monospace, monospace;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
33
ml/templates/_layout.html
Normal file
33
ml/templates/_layout.html
Normal file
@@ -0,0 +1,33 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="it">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>ML — {% block title %}{{ page|capitalize }}{% endblock %}</title>
|
||||
<link href="/static/styles/style.css" rel="stylesheet">
|
||||
<link href="/static/styles/ml.css" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<h1>Modelli ML</h1>
|
||||
<nav class="ml-nav">
|
||||
<a href="/datasets" class="{% if page=='datasets' %}active{% endif %}">Datasets</a>
|
||||
<a href="/models" class="{% if page=='models' %}active{% endif %}">Modelli</a>
|
||||
<a href="/train" class="{% if page=='train' %}active{% endif %}">Train</a>
|
||||
<a href="/test" class="{% if page=='test' %}active{% endif %}">Test</a>
|
||||
<a href="/results" class="{% if page=='results' %}active{% endif %}">Results</a>
|
||||
</nav>
|
||||
<div class="profile">
|
||||
<p id="username">{{ user.username }}</p>
|
||||
<button id="logout-btn">Logout</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="container">
|
||||
{% block content %}{% endblock %}
|
||||
</div>
|
||||
|
||||
<script src="/static/js/common.js"></script>
|
||||
{% block scripts %}{% endblock %}
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,39 @@
|
||||
{% extends "_layout.html" %}
|
||||
{% block title %}Datasets{% endblock %}
|
||||
{% block content %}
|
||||
<div class="page-head">
|
||||
<h2>Datasets</h2>
|
||||
<button class="prominent" id="btn-upload">+ Carica CSV</button>
|
||||
</div>
|
||||
|
||||
<div id="datasets-list" class="list"></div>
|
||||
|
||||
<dialog id="upload-dlg">
|
||||
<form id="upload-form" method="dialog">
|
||||
<h3>Carica dataset</h3>
|
||||
<label>Nome<input type="text" name="nome" required></label>
|
||||
<label>Tipo
|
||||
<select name="dataset_type">
|
||||
<option value="custom">custom</option>
|
||||
<option value="imported">imported</option>
|
||||
</select>
|
||||
</label>
|
||||
<label>Formato
|
||||
<select name="format">
|
||||
<option value="csv">csv</option>
|
||||
<option value="json">json</option>
|
||||
</select>
|
||||
</label>
|
||||
<label>Tags (virgola)<input type="text" name="tags"></label>
|
||||
<label>Descrizione<textarea name="description"></textarea></label>
|
||||
<label>File<input type="file" name="file" required></label>
|
||||
<menu>
|
||||
<button type="button" id="upload-cancel">Annulla</button>
|
||||
<button type="submit" class="prominent">Carica</button>
|
||||
</menu>
|
||||
</form>
|
||||
</dialog>
|
||||
{% endblock %}
|
||||
{% block scripts %}
|
||||
<script src="/static/js/datasets.js"></script>
|
||||
{% endblock %}
|
||||
|
||||
57
ml/templates/models.html
Normal file
57
ml/templates/models.html
Normal file
@@ -0,0 +1,57 @@
|
||||
{% extends "_layout.html" %}
|
||||
{% block title %}Modelli{% endblock %}
|
||||
{% block content %}
|
||||
<div class="page-head">
|
||||
<h2>Modelli</h2>
|
||||
<button class="prominent" id="btn-add-model">+ Aggiungi modello</button>
|
||||
</div>
|
||||
|
||||
<div id="models-list" class="list"></div>
|
||||
|
||||
<div id="model-detail" class="detail hidden">
|
||||
<button id="btn-close-detail">×</button>
|
||||
<h3 id="md-name"></h3>
|
||||
<p id="md-meta"></p>
|
||||
<section>
|
||||
<h4>Branch / Commits</h4>
|
||||
<select id="md-branch"></select>
|
||||
<ul id="md-commits"></ul>
|
||||
</section>
|
||||
<section>
|
||||
<h4>model.yml</h4>
|
||||
<pre id="md-spec"></pre>
|
||||
</section>
|
||||
<section>
|
||||
<h4>Note</h4>
|
||||
<ul id="md-notes"></ul>
|
||||
<form id="md-note-form">
|
||||
<textarea name="text" placeholder="Nuova nota"></textarea>
|
||||
<button type="submit" class="prominent">Aggiungi</button>
|
||||
</form>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<dialog id="add-model-dlg">
|
||||
<form id="add-model-form" method="dialog">
|
||||
<h3>Nuovo modello</h3>
|
||||
<label>Nome<input type="text" name="name" required></label>
|
||||
<label>Tipo
|
||||
<select name="type">
|
||||
<option>xgboost</option>
|
||||
<option>lstm</option>
|
||||
<option>sklearn</option>
|
||||
<option>other</option>
|
||||
</select>
|
||||
</label>
|
||||
<label>Repo Gitea (owner/repo)<input type="text" name="gitea_repo" required></label>
|
||||
<label>Branch<input type="text" name="default_branch" value="main"></label>
|
||||
<menu>
|
||||
<button type="button" id="add-model-cancel">Annulla</button>
|
||||
<button type="submit" class="prominent">Crea</button>
|
||||
</menu>
|
||||
</form>
|
||||
</dialog>
|
||||
{% endblock %}
|
||||
{% block scripts %}
|
||||
<script src="/static/js/models.js"></script>
|
||||
{% endblock %}
|
||||
@@ -1,89 +1,33 @@
|
||||
<!DOCTYPE html>
|
||||
{% extends "_layout.html" %}
|
||||
{% block title %}Risultati{% endblock %}
|
||||
{% block content %}
|
||||
<div class="page-head">
|
||||
<h2>Risultati training</h2>
|
||||
<button id="btn-compare" class="prominent">Confronta selezionati</button>
|
||||
</div>
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>Risultati</title>
|
||||
<link href="../static/styles/style.css" rel="stylesheet">
|
||||
<div id="results-list" class="list"></div>
|
||||
|
||||
<style>
|
||||
.container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
height: 100%;
|
||||
}
|
||||
<section id="compare-panel" class="hidden">
|
||||
<h3>Confronto</h3>
|
||||
<div class="charts">
|
||||
<canvas id="cmp-loss"></canvas>
|
||||
</div>
|
||||
<table id="cmp-table"></table>
|
||||
<div id="cmp-plots"></div>
|
||||
</section>
|
||||
|
||||
.picker {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
.picker .header {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="header">
|
||||
<h1>Risultati</h1>
|
||||
<div class="profile">
|
||||
<p>Utente</p>
|
||||
<button>Logout</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="container">
|
||||
|
||||
<div class="picker">
|
||||
|
||||
<div class="header">
|
||||
<h2>
|
||||
Seleziona
|
||||
</h2>
|
||||
|
||||
<p>
|
||||
una sessione di training eseguita per visualizzarne i risultati
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="grid">
|
||||
|
||||
<div class="card">
|
||||
<h3>sessione 1</h3>
|
||||
<div class="train-info">
|
||||
<p>24/03/2026</p>
|
||||
<p>12:00</p>
|
||||
<p>dataset: d-1</p>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h3>sessione 2</h3>
|
||||
<p>24/03/2026</p>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
<script>
|
||||
|
||||
</script>
|
||||
</html>
|
||||
<section id="detail-panel" class="hidden">
|
||||
<h3>Dettaglio training <code id="dt-id"></code></h3>
|
||||
<div id="dt-meta"></div>
|
||||
<div class="charts">
|
||||
<canvas id="dt-loss"></canvas>
|
||||
<canvas id="dt-res"></canvas>
|
||||
</div>
|
||||
<div id="dt-plots"></div>
|
||||
</section>
|
||||
{% endblock %}
|
||||
{% block scripts %}
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script src="/static/js/results.js"></script>
|
||||
{% endblock %}
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
{% extends "_layout.html" %}
|
||||
{% block title %}Test{% endblock %}
|
||||
{% block content %}
|
||||
<div class="page-head">
|
||||
<h2>Test modello</h2>
|
||||
<div id="slot-info" class="queue-info">Slot: <span id="slot-count">–</span>/2</div>
|
||||
</div>
|
||||
|
||||
<div id="slot-full" class="info-panel hidden">
|
||||
<div class="icon">🚧</div>
|
||||
<h3>Slot test pieni</h3>
|
||||
<p>Massimo 2 utenti possono eseguire test contemporaneamente. Riprova tra qualche minuto.</p>
|
||||
</div>
|
||||
|
||||
<form id="test-start" class="form-row">
|
||||
<label>Modello<select id="t-model"></select></label>
|
||||
<label>Training<select id="t-training"></select></label>
|
||||
<button type="submit" class="prominent">Avvia sessione</button>
|
||||
</form>
|
||||
|
||||
<section id="test-session" class="hidden">
|
||||
<h3>Sessione <code id="ts-id"></code></h3>
|
||||
<form id="inputs-form"></form>
|
||||
<button id="btn-run" class="prominent">Esegui test</button>
|
||||
<button id="btn-end">Chiudi sessione</button>
|
||||
|
||||
<h4>Risultati</h4>
|
||||
<div id="runs-list"></div>
|
||||
</section>
|
||||
{% endblock %}
|
||||
{% block scripts %}
|
||||
<script src="/static/js/test.js"></script>
|
||||
{% endblock %}
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
{% extends "_layout.html" %}
|
||||
{% block title %}Train{% endblock %}
|
||||
{% block content %}
|
||||
<div class="page-head">
|
||||
<h2>Avvia training</h2>
|
||||
<div class="queue-info">Coda: <span id="queue-count">–</span></div>
|
||||
</div>
|
||||
|
||||
<form id="train-form" class="form-row">
|
||||
<label>Modello<select name="model_id" id="f-model"></select></label>
|
||||
<label>Branch<select name="branch" id="f-branch"></select></label>
|
||||
<label>Commit<select name="patch" id="f-patch"></select></label>
|
||||
<label>Versione<input type="text" name="version" placeholder="1.0.0" required></label>
|
||||
<label>Dataset<select name="dataset_id" id="f-dataset"></select></label>
|
||||
<button type="submit" class="prominent">Avvia</button>
|
||||
</form>
|
||||
|
||||
<section id="live-panel" class="hidden">
|
||||
<h3>Training <code id="live-id"></code> — <span id="live-status">queued</span></h3>
|
||||
<div class="charts">
|
||||
<canvas id="chart-loss"></canvas>
|
||||
<canvas id="chart-cpu"></canvas>
|
||||
</div>
|
||||
<pre id="live-logs" class="logs"></pre>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h3>Recenti</h3>
|
||||
<div id="recent-trainings" class="list"></div>
|
||||
</section>
|
||||
{% endblock %}
|
||||
{% block scripts %}
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script src="/static/js/train.js"></script>
|
||||
{% endblock %}
|
||||
|
||||
Reference in New Issue
Block a user