feat: Add new API endpoints and HTML pages for ML model management

- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models
- Added Gitea proxy API for repository interactions at /api/repos
- Created results API for listing and comparing training results at /api/results
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings
- Introduced SSE endpoint for live training progress updates
- Added HTML pages for models, datasets, and training management
- Created a Dockerfile for the ML runner with necessary dependencies
- Developed SDK for user code execution within the runner container
- Enhanced CSS styles for improved UI/UX
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
This commit is contained in:
Giuseppe Raffa
2026-04-28 09:24:38 +02:00
parent ee478e52ef
commit 0ce879aa44
81 changed files with 7491 additions and 746 deletions

View File

@@ -1,125 +1,292 @@
"""
Redis Keys:
- marine:catalog:full → lista dei dataset completo (TTL 1h)
- marine:catalog:search:{hash} → risultati ricerca (TTL 30min)
- marine:job:{session_id} → stato job download (TTL 48h)
Cache two-tier per il servizio Marine.
L1 = Redis (RAM): scadenza 2 ore, velocissima, condivisa tra processi.
L2 = SQLite+disco: persistente (200GB), fallback quando Redis non c'è
o quando L1 è scaduta. Scadenza configurabile (default 30 giorni).
Flusso lettura:
1. Prova L1 (Redis). Se hit → ritorna.
2. Prova L2 (SQLite). Se hit non scaduta → ritorna E ripopola L1 (re-warm).
3. Miss totale → None.
Flusso scrittura:
Scrive in entrambi i tier contemporaneamente.
Chiavi standard:
- marine:catalog:full → lista completa dataset Copernicus
- marine:catalog:search:{hash} → risultati ricerca utente
- marine:job:{session_id} → stato job download (solo Redis, effimeri)
"""
import gzip
import json
import os
import logging
import os
import sqlite3
import threading
import time
from pathlib import Path
from typing import Any, Optional
import redis
logger = logging.getLogger(__name__)
# Configurazione Redis da variabili ambiente
# ── Config ───────────────────────────────────────────────────────────────
REDIS_HOST = os.getenv("REDIS_HOST", "meb-redis")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
# Pool di connessioni condiviso (thread-safe, riutilizzabile)
# Il volume persistente è montato dal container, default /app/cache
CACHE_DIR = Path(os.getenv("CACHE_DIR", "/app/cache"))
CACHE_DB = CACHE_DIR / "catalog.sqlite"
BLOB_DIR = CACHE_DIR / "blobs"
# TTL default
DEFAULT_REDIS_TTL = 2 * 3600 # 2 ore (L1)
DEFAULT_DISK_TTL = 30 * 24 * 3600 # 30 giorni (L2)
# Soglia sopra la quale il valore va in un file su disco invece che in sqlite
BLOB_THRESHOLD_BYTES = 64 * 1024 # 64 KB
# ── Stato globale ────────────────────────────────────────────────────────
_pool: Optional[redis.ConnectionPool] = None
_client: Optional[redis.Redis] = None
_redis_disabled = False
_sqlite_lock = threading.Lock()
_sqlite_initialized = False
def _get_client() -> Optional[redis.Redis]:
"""Restituisce il client Redis singleton con connection pool.
Ritorna None se Redis non è raggiungibile."""
global _pool, _client
# ── Redis (L1) ───────────────────────────────────────────────────────────
def _get_redis() -> Optional[redis.Redis]:
global _pool, _client, _redis_disabled
if _redis_disabled:
return None
if _client is not None:
return _client
try:
_pool = redis.ConnectionPool(
host=REDIS_HOST,
port=REDIS_PORT,
# Decodifica automatica delle risposte in stringhe UTF-8
decode_responses=True,
# Massimo 5 connessioni nel pool (VPS 1-core, non serve di più)
decode_responses=False, # tratto blob binari (gzip)
max_connections=5,
# Timeout connessione e socket per evitare blocchi
socket_connect_timeout=3,
socket_timeout=3,
# Riprova automaticamente se la connessione viene interrotta
retry_on_timeout=True,
)
_client = redis.Redis(connection_pool=_pool)
# Test connessione
_client.ping()
logger.info("[Redis] Connessione stabilita per il servizio Marine")
logger.info("[Cache] Redis L1 connesso")
return _client
except Exception as e:
logger.warning(f"[Redis] Non disponibile, la cache è disabilitata: {e}")
logger.warning(f"[Cache] Redis non disponibile, uso solo disco: {e}")
_redis_disabled = True
_client = None
return None
def cache_get(key: str) -> Optional[Any]:
"""Legge un valore dalla cache Redis.
# ── SQLite (L2) ──────────────────────────────────────────────────────────
def _ensure_sqlite() -> sqlite3.Connection:
    """Open (creating it if needed) the on-disk SQLite cache database.

    Creates the cache directory and the blob directory when missing, opens a
    new connection in autocommit mode (``isolation_level=None``), applies the
    WAL / synchronous pragmas and, on the first successful call in this
    process, creates the ``cache`` table and its expiry index.

    Returns:
        A newly opened connection. Every call opens a new one, so the caller
        is responsible for closing it.

    Raises:
        OSError: if the cache directories cannot be created.
        sqlite3.Error: if the database cannot be opened or initialised.
    """
    global _sqlite_initialized
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    BLOB_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(str(CACHE_DB), timeout=5.0, isolation_level=None)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        if not _sqlite_initialized:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS cache (
                    key TEXT PRIMARY KEY,
                    expires_at INTEGER NOT NULL,
                    is_blob INTEGER NOT NULL DEFAULT 0,
                    value BLOB,
                    blob_path TEXT,
                    size_bytes INTEGER NOT NULL,
                    updated_at INTEGER NOT NULL
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_expires ON cache(expires_at)")
            # Set only after the schema statements succeeded, so a failed
            # first attempt is retried on the next call.
            _sqlite_initialized = True
    except Exception:
        # Do not leak the just-opened connection when setup fails.
        conn.close()
        raise
    return conn
Args:
key: Chiave Redis (es. 'marine:catalog:full')
Returns:
Il valore deserializzato da JSON, oppure None se non trovato o errore
"""
def _blob_path(key: str) -> Path:
    """Return the file path under ``BLOB_DIR`` used to store *key*'s payload.

    The file name is a sanitised, length-bounded copy of the key followed by a
    short SHA-256 digest of the original key. Sanitising alone is lossy
    (``"a:b"`` and ``"a_b"`` would both become ``a_b``), so the digest is what
    keeps distinct keys on distinct files.

    Args:
        key: Cache key, e.g. ``"marine:catalog:full"``.

    Returns:
        Path of the ``.json.gz`` blob file for this key.
    """
    import hashlib  # function-scope import: keeps this block self-contained

    # Keep only ASCII letters/digits plus "-" and "_" so the name is safe on
    # any filesystem; everything else is replaced by "_".
    safe = "".join(
        c if (c.isascii() and c.isalnum()) or c in ("-", "_") else "_"
        for c in key
    )
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
    # Truncate the readable part so very long keys cannot exceed file-name limits.
    return BLOB_DIR / f"{safe[:100]}-{digest}.json.gz"
def _disk_get(key: str) -> Optional[Any]:
try:
client = _get_client()
if client is None:
with _sqlite_lock:
conn = _ensure_sqlite()
row = conn.execute(
"SELECT expires_at, is_blob, value, blob_path FROM cache WHERE key = ?",
(key,)
).fetchone()
if row is None:
return None
data = client.get(key)
if data is None:
expires_at, is_blob, value, blob_path = row
if expires_at < int(time.time()):
# Scaduta: la elimino in lazy
_disk_delete(key)
return None
return json.loads(data)
if is_blob:
data = Path(blob_path).read_bytes()
else:
data = value
return json.loads(gzip.decompress(data).decode("utf-8"))
except Exception as e:
logger.warning(f"[Redis] Errore lettura chiave '{key}': {e}")
logger.warning(f"[Cache] Errore lettura disco '{key}': {e}")
return None
def cache_set(key: str, value: Any, ttl: int = 3600) -> bool:
"""Scrive un valore nella cache Redis con TTL.
Args:
key: Chiave Redis
value: Valore da serializzare in JSON
ttl: Tempo di vita in secondi (default: 1 ora)
Returns:
True se scritto con successo, False altrimenti
"""
def _disk_set(key: str, raw_gz: bytes, ttl: int) -> None:
try:
client = _get_client()
if client is None:
return False
serialized = json.dumps(value)
client.setex(key, ttl, serialized)
return True
expires_at = int(time.time()) + ttl
updated_at = int(time.time())
size = len(raw_gz)
if size > BLOB_THRESHOLD_BYTES:
path = _blob_path(key)
path.write_bytes(raw_gz)
with _sqlite_lock:
conn = _ensure_sqlite()
conn.execute(
"INSERT OR REPLACE INTO cache(key, expires_at, is_blob, value, blob_path, size_bytes, updated_at) "
"VALUES(?,?,?,?,?,?,?)",
(key, expires_at, 1, None, str(path), size, updated_at)
)
else:
with _sqlite_lock:
conn = _ensure_sqlite()
conn.execute(
"INSERT OR REPLACE INTO cache(key, expires_at, is_blob, value, blob_path, size_bytes, updated_at) "
"VALUES(?,?,?,?,?,?,?)",
(key, expires_at, 0, raw_gz, None, size, updated_at)
)
except Exception as e:
logger.warning(f"[Redis] Errore scrittura chiave '{key}': {e}")
logger.warning(f"[Cache] Errore scrittura disco '{key}': {e}")
def _disk_delete(key: str) -> None:
    """Remove *key* from the on-disk cache, including its blob file if any.

    Best effort: failures are logged and never propagated to the caller.

    Args:
        key: Cache key whose row (and blob file) should be removed.
    """
    try:
        with _sqlite_lock:
            conn = _ensure_sqlite()
            try:
                row = conn.execute(
                    "SELECT blob_path FROM cache WHERE key = ?", (key,)
                ).fetchone()
                conn.execute("DELETE FROM cache WHERE key = ?", (key,))
            finally:
                # _ensure_sqlite() opens a new connection on every call;
                # release it explicitly instead of relying on garbage collection.
                conn.close()
        if row and row[0]:
            try:
                Path(row[0]).unlink(missing_ok=True)
            except OSError as e:
                # The row is already gone; a leftover file is only wasted space.
                logger.debug(f"[Cache] Impossibile rimuovere blob '{row[0]}': {e}")
    except Exception as e:
        logger.warning(f"[Cache] Errore delete disco '{key}': {e}")
# ── API pubblica ─────────────────────────────────────────────────────────
def cache_get(key: str) -> Optional[Any]:
    """Read *key* from the cache: Redis (L1) first, then disk (L2).

    On an L2 hit the value is written back to Redis with the default L1 TTL
    (re-warm) when a Redis client is available.

    Args:
        key: Cache key, e.g. ``"marine:catalog:full"``.

    Returns:
        The deserialised value, or ``None`` when neither tier has it.
    """
    # L1
    client = _get_redis()
    if client is not None:
        try:
            raw = client.get(key)
            if raw is not None:
                # Payloads are gzip-compressed JSON. Entries written by older
                # versions of this module are plain JSON, so only decompress
                # when the gzip magic number is present; otherwise those
                # entries would be reported as misses until they expire.
                if raw[:2] == b"\x1f\x8b":
                    raw = gzip.decompress(raw)
                return json.loads(raw.decode("utf-8"))
        except Exception as e:
            logger.warning(f"[Cache] Errore Redis '{key}': {e}")
    # L2
    value = _disk_get(key)
    if value is not None and client is not None:
        # Re-warm L1 with the standard TTL; failure here must not hide the hit.
        try:
            raw_gz = gzip.compress(json.dumps(value).encode("utf-8"))
            client.setex(key, DEFAULT_REDIS_TTL, raw_gz)
        except Exception as e:
            logger.debug(f"[Cache] Re-warm Redis fallito '{key}': {e}")
    return value
def cache_set(key: str, value: Any, ttl: int = DEFAULT_REDIS_TTL, disk_ttl: Optional[int] = None) -> bool:
    """Store *value* in Redis (L1) and, unless disabled, on disk (L2).

    Args:
        key: Cache key.
        value: JSON-serialisable value; it is stored gzip-compressed.
        ttl: L1 (Redis) time-to-live in seconds.
        disk_ttl: L2 (disk) time-to-live in seconds. ``None`` selects
            ``DEFAULT_DISK_TTL``; a value of 0 or less skips the disk tier
            (intended for ephemeral keys such as job state).

    Returns:
        ``False`` when the value cannot be serialised, or when Redis did not
        accept it and the disk tier was skipped; ``True`` otherwise. The
        result of the disk write itself is not reflected in the return value.
    """
    try:
        payload = gzip.compress(json.dumps(value).encode("utf-8"))
    except Exception as e:
        logger.warning(f"[Cache] Errore serializzazione '{key}': {e}")
        return False

    effective_disk_ttl = DEFAULT_DISK_TTL if disk_ttl is None else disk_ttl

    # L1
    stored_in_redis = False
    redis_client = _get_redis()
    if redis_client is not None:
        try:
            redis_client.setex(key, ttl, payload)
        except Exception as e:
            logger.warning(f"[Cache] Errore scrittura Redis '{key}': {e}")
        else:
            stored_in_redis = True

    # L2
    use_disk = effective_disk_ttl > 0
    if use_disk:
        _disk_set(key, payload, effective_disk_ttl)

    return stored_in_redis or use_disk
def cache_delete(key: str) -> bool:
"""Elimina una chiave dalla cache Redis.
client = _get_redis()
if client is not None:
try:
client.delete(key)
except Exception:
pass
_disk_delete(key)
return True
Args:
key: Chiave Redis da eliminare
Returns:
True se eliminata, False altrimenti
"""
def cache_stats() -> dict:
"""Ritorna statistiche della cache: utile per /health e debug."""
stats = {"redis": False, "disk": {"entries": 0, "bytes": 0, "blobs": 0}}
if _get_redis() is not None:
stats["redis"] = True
try:
client = _get_client()
if client is None:
return False
with _sqlite_lock:
conn = _ensure_sqlite()
row = conn.execute(
"SELECT COUNT(*), COALESCE(SUM(size_bytes),0), COALESCE(SUM(is_blob),0) FROM cache"
).fetchone()
stats["disk"]["entries"] = row[0]
stats["disk"]["bytes"] = row[1]
stats["disk"]["blobs"] = row[2]
except Exception:
pass
return stats
client.delete(key)
return True
def cache_sweep() -> int:
"""Rimuove voci scadute su disco (da chiamare periodicamente). Ritorna numero eliminate."""
try:
now = int(time.time())
with _sqlite_lock:
conn = _ensure_sqlite()
rows = conn.execute(
"SELECT key, blob_path FROM cache WHERE expires_at < ?", (now,)
).fetchall()
conn.execute("DELETE FROM cache WHERE expires_at < ?", (now,))
for _, path in rows:
if path:
try:
Path(path).unlink(missing_ok=True)
except Exception:
pass
return len(rows)
except Exception as e:
logger.warning(f"[Redis] Errore eliminazione chiave '{key}': {e}")
return False
logger.warning(f"[Cache] Errore sweep: {e}")
return 0

View File

@@ -2,6 +2,7 @@ import hashlib
import io
import logging
import os
import threading
from datetime import datetime, timezone
from typing import Callable, List, Optional
@@ -11,13 +12,20 @@ from core.cache import cache_get, cache_set
logger = logging.getLogger(__name__)
# ── Chiavi Redis e TTL ────────────────────────────────────────────────
# Lock di "single-flight" per il fetch del catalogo Copernicus.
# Senza questo, N richieste concorrenti con cache miss farebbero N chiamate
# all'SDK (10-30s ciascuna, ~200MB di response). Con il lock, solo la prima
# scarica e popola la cache; le altre attendono e leggono da cache.
_catalog_fetch_lock = threading.Lock()
# ── Chiavi cache e TTL ────────────────────────────────────────────────
# Chiave per il catalogo completo Copernicus
_CATALOG_KEY = "marine:catalog:full"
# TTL del catalogo: 1 ora (il catalogo Copernicus cambia raramente)
_CATALOG_TTL = 3600
# TTL per i risultati di ricerca: 30 minuti
_SEARCH_TTL = 1800
# TTL L1 (Redis): 2 ore. L2 (disco) usa il default 30 giorni.
# Il catalogo Copernicus cambia raramente, ha senso tenerlo a lungo su disco.
_CATALOG_TTL = 2 * 3600
# TTL L1 per le ricerche utente: 2 ore. Su disco 30 giorni.
_SEARCH_TTL = 2 * 3600
def _fmt_description(name: Optional[str]) -> Optional[str]:
@@ -44,10 +52,17 @@ def _get_raw_catalog() -> dict:
logger.debug("[Catalogo] Servito da cache Redis")
return cached
# Cache miss: interroga Copernicus SDK (operazione lenta, ~10-30s)
logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
import copernicusmarine
catalog = copernicusmarine.describe(disable_progress_bar=True)
# Single-flight: solo un thread alla volta scarica il catalogo. Gli altri
# attendono il lock e poi leggono il valore appena messo in cache.
with _catalog_fetch_lock:
cached = cache_get(_CATALOG_KEY)
if cached is not None:
return cached
# Cache miss: interroga Copernicus SDK (operazione lenta, ~10-30s)
logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
import copernicusmarine
catalog = copernicusmarine.describe(disable_progress_bar=True)
# Serializza la risposta SDK in un dizionario standard
if hasattr(catalog, "model_dump"):
@@ -57,11 +72,11 @@ def _get_raw_catalog() -> dict:
else:
result = catalog
# Salva in Redis per le prossime richieste (TTL 1 ora)
cache_set(_CATALOG_KEY, result, _CATALOG_TTL)
logger.info("[Catalogo] Salvato in cache Redis")
# Salva in cache per le prossime richieste (L1 Redis: TTL 2 ore; L2 disco: default 30 giorni)
cache_set(_CATALOG_KEY, result, _CATALOG_TTL)
logger.info("[Catalogo] Salvato in cache Redis")
return result
return result
def _get_dataset_reqs(ds: dict) -> tuple:

View File

@@ -12,11 +12,16 @@ from fastapi.middleware.cors import CORSMiddleware
load_dotenv()
from routers import catalog, datasets, jobs
from core.cache import cache_stats, cache_sweep
@asynccontextmanager
async def lifespan(app: FastAPI):
api_url = os.getenv("API_SERVICE_URL", "http://api:3003")
# Pulizia voci scadute della cache su disco all'avvio
removed = cache_sweep()
if removed:
print(f"[Cache] Rimosse {removed} voci scadute dal disco")
yield
@@ -50,4 +55,9 @@ async def root():
@app.get("/health", tags=["health"])
async def health():
return {"status": "healthy"}
return {"status": "healthy", "cache": cache_stats()}
@app.post("/cache/sweep", tags=["health"])
async def sweep():
    """Remove expired entries from the on-disk cache and report how many.

    ``cache_sweep`` performs blocking SQLite and filesystem work, so it is run
    in the worker thread pool rather than directly on the event loop, where it
    would stall every other request for its duration.

    Returns:
        ``{"removed": <count>}`` with the number returned by ``cache_sweep``.
    """
    from fastapi.concurrency import run_in_threadpool

    return {"removed": await run_in_threadpool(cache_sweep)}

View File

@@ -7,6 +7,7 @@ Flusso:
import json
import os
import threading
import uuid
from typing import Any, Dict
@@ -24,6 +25,13 @@ API_URL = os.getenv("API_SERVICE_URL", "http://api:3003")
# TTL per lo stato dei job: 48 ore (i job completati vengono puliti automaticamente)
_JOB_TTL = 48 * 3600
# Limite di download Copernicus concorrenti. Le subset() dell'SDK sono
# CPU + memoria intensive (xarray + netCDF + pandas conversion) e sul server
# le risorse sono limitate. Senza semaforo, N utenti che cliccano insieme
# saturano la RAM e fanno OOM-kill del processo.
_DOWNLOAD_CONCURRENCY = int(os.getenv("MARINE_DOWNLOAD_CONCURRENCY", "2"))
_download_semaphore = threading.BoundedSemaphore(_DOWNLOAD_CONCURRENCY)
def _job_key(session_id: str) -> str:
"""Genera la chiave Redis per un job."""
@@ -42,7 +50,7 @@ def _set_job(session_id: str, **kwargs):
if job is None:
return
job.update(kwargs)
cache_set(_job_key(session_id), job, _JOB_TTL)
cache_set(_job_key(session_id), job, _JOB_TTL, disk_ttl=0)
def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_token: str):
@@ -55,20 +63,26 @@ def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_
_set_job(session_id, progress=pct, message=msg)
try:
_set_job(session_id, status="downloading", progress=5, message="Scarico da Copernicus Marine...")
_set_job(session_id, status="queued", progress=2, message="In coda (max concorrenti raggiunto)...")
# Scarica dati dal catalogo Copernicus
df = copernicus.download_dataset(
dataset_id=req.dataset_id,
variables=req.variables,
min_longitude=req.min_longitude,
max_longitude=req.max_longitude,
min_latitude=req.min_latitude,
max_latitude=req.max_latitude,
start_datetime=req.start_date,
end_datetime=req.end_date,
progress_callback=progress,
)
# Acquisisce uno slot di download (blocca se già al limite). Garantisce
# che il numero di chiamate Copernicus simultanee non superi
# MARINE_DOWNLOAD_CONCURRENCY, proteggendo CPU/RAM del server.
with _download_semaphore:
_set_job(session_id, status="downloading", progress=5, message="Scarico da Copernicus Marine...")
# Scarica dati dal catalogo Copernicus
df = copernicus.download_dataset(
dataset_id=req.dataset_id,
variables=req.variables,
min_longitude=req.min_longitude,
max_longitude=req.max_longitude,
min_latitude=req.min_latitude,
max_latitude=req.max_latitude,
start_datetime=req.start_date,
end_datetime=req.end_date,
progress_callback=progress,
)
_set_job(session_id, status="converting", progress=80, message="Creo il file...")
@@ -85,7 +99,7 @@ def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_
"created_by": username,
"type": req.format,
"notes": req.notes,
"copernicus_dataset_id": req.dataset_id,
"copernicus_id": req.dataset_id,
"variables": req.variables,
"variable_renames": req.variable_renames,
"bbox": [req.min_longitude, req.min_latitude, req.max_longitude, req.max_latitude],
@@ -129,7 +143,7 @@ async def new_download_session(
"message": "In coda",
"dataset_id": None,
}
cache_set(_job_key(session_id), initial_state, _JOB_TTL)
cache_set(_job_key(session_id), initial_state, _JOB_TTL, disk_ttl=0)
# Avvia il download in background
background_tasks.add_task(_run_download, session_id, req, user["username"], user["token"])

View File

@@ -65,7 +65,7 @@ class DatasetMeta(BaseModel):
notes: str = ""
version: int = 1
filename: str
copernicus_dataset_id: str
copernicus_id: str
variables: List[str] = []
bbox: List[float] = [] # [min_lon, min_lat, max_lon, max_lat]
start_date: str