feat: Add new API endpoints and HTML pages for ML model management

- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models.
- Added Gitea proxy API for repository interactions at /api/repos.
- Created results API for listing and comparing training results at /api/results.
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings.
- Introduced SSE endpoint for live training progress updates.
- Added HTML pages for models, datasets, and training management.
- Created a Dockerfile for the ML runner with necessary dependencies.
- Developed SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI/UX.
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system.
2026-04-28 09:24:38 +02:00
parent ee478e52ef
commit 0ce879aa44
81 changed files with 7491 additions and 746 deletions
--- a/copernicus/core/cache.py
+++ b/copernicus/core/cache.py
@@ -1,125 +1,292 @@
 """
-Redis Keys:
- marine:catalog:full → lista dei dataset completo (TTL 1h)
- marine:catalog:search:{hash} → risultati ricerca (TTL 30min)
- marine:job:{session_id} → stato job download (TTL 48h)
+Cache two-tier per il servizio Marine.
+
+L1 = Redis (RAM):   scadenza 2 ore, velocissima, condivisa tra processi.
+L2 = SQLite+disco:  persistente (200GB), fallback quando Redis non c'è
+                    o quando L1 è scaduta. Scadenza configurabile (default 30 giorni).
+
+Flusso lettura:
+    1. Prova L1 (Redis). Se hit → ritorna.
+    2. Prova L2 (SQLite). Se hit non scaduta → ritorna E ripopola L1 (re-warm).
+    3. Miss totale → None.
+
+Flusso scrittura:
+    Scrive in entrambi i tier contemporaneamente.
+
+Chiavi standard:
+- marine:catalog:full            → lista completa dataset Copernicus
+- marine:catalog:search:{hash}   → risultati ricerca utente
+- marine:job:{session_id}        → stato job download (solo Redis, ephemeri)
 """

+import gzip
 import json
-import os
 import logging
+import os
+import sqlite3
+import threading
+import time
+from pathlib import Path
 from typing import Any, Optional

 import redis

 logger = logging.getLogger(__name__)

-# Configurazione Redis da variabili ambiente
+# ── Config ───────────────────────────────────────────────────────────────
 REDIS_HOST = os.getenv("REDIS_HOST", "meb-redis")
 REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))

-# Pool di connessioni condiviso (thread-safe, riutilizzabile)
+# Il volume persistente è montato dal container, default /app/cache
+CACHE_DIR = Path(os.getenv("CACHE_DIR", "/app/cache"))
+CACHE_DB = CACHE_DIR / "catalog.sqlite"
+BLOB_DIR = CACHE_DIR / "blobs"
+
+# TTL default
+DEFAULT_REDIS_TTL = 2 * 3600       # 2 ore (L1)
+DEFAULT_DISK_TTL = 30 * 24 * 3600  # 30 giorni (L2)
+
+# Soglia sopra la quale il valore va in un file su disco invece che in sqlite
+BLOB_THRESHOLD_BYTES = 64 * 1024   # 64 KB
+
+# ── Stato globale ────────────────────────────────────────────────────────
 _pool: Optional[redis.ConnectionPool] = None
 _client: Optional[redis.Redis] = None
+_redis_disabled = False
+
+_sqlite_lock = threading.Lock()
+_sqlite_initialized = False


-def _get_client() -> Optional[redis.Redis]:
-    """Restituisce il client Redis singleton con connection pool.
-    Ritorna None se Redis non è raggiungibile."""
-    global _pool, _client
-
+# ── Redis (L1) ───────────────────────────────────────────────────────────
+def _get_redis() -> Optional[redis.Redis]:
+    global _pool, _client, _redis_disabled
+    if _redis_disabled:
+        return None
    if _client is not None:
        return _client
-
    try:
        _pool = redis.ConnectionPool(
            host=REDIS_HOST,
            port=REDIS_PORT,
-            # Decodifica automatica delle risposte in stringhe UTF-8
-            decode_responses=True,
-            # Massimo 5 connessioni nel pool (VPS 1-core, non serve di più)
+            decode_responses=False,   # tratto blob binari (gzip)
            max_connections=5,
-            # Timeout connessione e socket per evitare blocchi
            socket_connect_timeout=3,
            socket_timeout=3,
-            # Riprova automaticamente se la connessione viene interrotta
            retry_on_timeout=True,
        )
        _client = redis.Redis(connection_pool=_pool)
-        # Test connessione
        _client.ping()
-        logger.info("[Redis] Connessione stabilita per il servizio Marine")
+        logger.info("[Cache] Redis L1 connesso")
        return _client
    except Exception as e:
-        logger.warning(f"[Redis] Non disponibile, la cache è disabilitata: {e}")
+        logger.warning(f"[Cache] Redis non disponibile, uso solo disco: {e}")
+        _redis_disabled = True
        _client = None
        return None


-def cache_get(key: str) -> Optional[Any]:
-    """Legge un valore dalla cache Redis.
+# ── SQLite (L2) ──────────────────────────────────────────────────────────
+# Process-wide SQLite connection, created lazily by _ensure_sqlite().
+_sqlite_conn: Optional[sqlite3.Connection] = None
+
+
+def _ensure_sqlite() -> sqlite3.Connection:
+    """Return the shared SQLite connection for the on-disk (L2) cache.
+
+    Every call makes sure the cache and blob directories exist. The first
+    call also opens the database, switches it to WAL journaling and creates
+    the ``cache`` table with its expiry index. Later calls reuse that same
+    connection instead of opening a new one (that was never closed) for
+    every cache operation.
+
+    The connection is opened with ``check_same_thread=False`` so it can be
+    used from any thread. Callers must hold ``_sqlite_lock`` while calling
+    this function and while using the returned connection.
+
+    Returns:
+        The open ``sqlite3.Connection`` in autocommit mode.
+
+    Raises:
+        OSError: if the cache directories cannot be created.
+        sqlite3.Error: if the database cannot be opened or initialised.
+    """
+    global _sqlite_conn, _sqlite_initialized
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    BLOB_DIR.mkdir(parents=True, exist_ok=True)
+    if _sqlite_conn is not None:
+        return _sqlite_conn
+
+    conn = sqlite3.connect(
+        str(CACHE_DB),
+        timeout=5.0,
+        isolation_level=None,      # autocommit: each statement is its own transaction
+        check_same_thread=False,   # access is serialised by _sqlite_lock
+    )
+    try:
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA synchronous=NORMAL")
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS cache (
+                key        TEXT PRIMARY KEY,
+                expires_at INTEGER NOT NULL,
+                is_blob    INTEGER NOT NULL DEFAULT 0,
+                value      BLOB,
+                blob_path  TEXT,
+                size_bytes INTEGER NOT NULL,
+                updated_at INTEGER NOT NULL
+            )
+        """)
+        conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_expires ON cache(expires_at)")
+    except Exception:
+        # Do not keep (or leak) a half-initialised connection.
+        conn.close()
+        raise
+    _sqlite_conn = conn
+    _sqlite_initialized = True
+    return conn

-    Args:
-        key: Chiave Redis (es. 'marine:catalog:full')

-    Returns:
-        Il valore deserializzato da JSON, oppure None se non trovato o errore
-    """
+def _blob_path(key: str) -> Path:
+    """Return the file path under ``BLOB_DIR`` used to store ``key``'s blob.
+
+    The name is a sanitised, length-capped copy of the key followed by a
+    short SHA-256 digest of the full key. The digest keeps keys that differ
+    only in replaced characters (e.g. ``a:b`` and ``a/b``) from sharing one
+    file, and the cap keeps long keys within filesystem name limits.
+
+    Args:
+        key: Cache key; may contain any characters.
+
+    Returns:
+        Path of the form ``BLOB_DIR/<safe-prefix>-<digest>.json.gz``.
+    """
+    import hashlib  # local import: only needed by this helper
+
+    # Keep only ASCII alphanumerics, "-" and "_" so the name is portable.
+    safe = "".join(
+        c if (c.isascii() and c.isalnum()) or c in ("-", "_") else "_"
+        for c in key
+    )
+    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
+    return BLOB_DIR / f"{safe[:100]}-{digest}.json.gz"
+
+
+def _disk_get(key: str) -> Optional[Any]:
    try:
-        client = _get_client()
-        if client is None:
+        with _sqlite_lock:
+            conn = _ensure_sqlite()
+            row = conn.execute(
+                "SELECT expires_at, is_blob, value, blob_path FROM cache WHERE key = ?",
+                (key,)
+            ).fetchone()
+        if row is None:
            return None
-
-        data = client.get(key)
-        if data is None:
+        expires_at, is_blob, value, blob_path = row
+        if expires_at < int(time.time()):
+            # Scaduta: la elimino in lazy
+            _disk_delete(key)
            return None
-
-        return json.loads(data)
+        if is_blob:
+            data = Path(blob_path).read_bytes()
+        else:
+            data = value
+        return json.loads(gzip.decompress(data).decode("utf-8"))
    except Exception as e:
-        logger.warning(f"[Redis] Errore lettura chiave '{key}': {e}")
+        logger.warning(f"[Cache] Errore lettura disco '{key}': {e}")
        return None


-def cache_set(key: str, value: Any, ttl: int = 3600) -> bool:
-    """Scrive un valore nella cache Redis con TTL.
-
-    Args:
-        key: Chiave Redis
-        value: Valore da serializzare in JSON
-        ttl: Tempo di vita in secondi (default: 1 ora)
-
-    Returns:
-        True se scritto con successo, False altrimenti
-    """
+def _disk_set(key: str, raw_gz: bytes, ttl: int) -> None:
    try:
-        client = _get_client()
-        if client is None:
-            return False
-
-        serialized = json.dumps(value)
-        client.setex(key, ttl, serialized)
-        return True
+        expires_at = int(time.time()) + ttl
+        updated_at = int(time.time())
+        size = len(raw_gz)
+        if size > BLOB_THRESHOLD_BYTES:
+            path = _blob_path(key)
+            path.write_bytes(raw_gz)
+            with _sqlite_lock:
+                conn = _ensure_sqlite()
+                conn.execute(
+                    "INSERT OR REPLACE INTO cache(key, expires_at, is_blob, value, blob_path, size_bytes, updated_at) "
+                    "VALUES(?,?,?,?,?,?,?)",
+                    (key, expires_at, 1, None, str(path), size, updated_at)
+                )
+        else:
+            with _sqlite_lock:
+                conn = _ensure_sqlite()
+                conn.execute(
+                    "INSERT OR REPLACE INTO cache(key, expires_at, is_blob, value, blob_path, size_bytes, updated_at) "
+                    "VALUES(?,?,?,?,?,?,?)",
+                    (key, expires_at, 0, raw_gz, None, size, updated_at)
+                )
    except Exception as e:
-        logger.warning(f"[Redis] Errore scrittura chiave '{key}': {e}")
+        logger.warning(f"[Cache] Errore scrittura disco '{key}': {e}")
+
+
+def _disk_delete(key: str) -> None:
+    """Remove ``key`` from the on-disk (L2) cache, including its blob file.
+
+    Best-effort: database and file-system errors are logged, never raised.
+
+    Args:
+        key: Cache key to remove. A key that is not present is a no-op.
+    """
+    try:
+        with _sqlite_lock:
+            conn = _ensure_sqlite()
+            # Read the blob path before deleting the row that holds it.
+            row = conn.execute("SELECT blob_path FROM cache WHERE key = ?", (key,)).fetchone()
+            conn.execute("DELETE FROM cache WHERE key = ?", (key,))
+    except Exception as e:
+        logger.warning(f"[Cache] Errore delete disco '{key}': {e}")
+        return
+
+    blob_path = row[0] if row else None
+    if blob_path:
+        try:
+            Path(blob_path).unlink(missing_ok=True)
+        except Exception as e:
+            # The row is already gone; a leftover file only wastes disk space,
+            # so report it instead of failing silently.
+            logger.warning(f"[Cache] Impossibile rimuovere il blob '{blob_path}': {e}")
+
+
+# ── API pubblica ─────────────────────────────────────────────────────────
+def cache_get(key: str) -> Optional[Any]:
+    """Read ``key`` from L1 (Redis), falling back to L2 (disk).
+
+    On an L2 hit the value is written back to L1 with ``DEFAULT_REDIS_TTL``
+    (re-warm), unless the L1 read itself just failed.
+
+    Args:
+        key: Cache key.
+
+    Returns:
+        The deserialised JSON value, or ``None`` on a miss in both tiers.
+        Errors are logged and treated as a miss; nothing is raised.
+    """
+    client = _get_redis()
+    l1_usable = client is not None
+
+    # L1
+    if client is not None:
+        try:
+            raw = client.get(key)
+        except Exception as e:
+            logger.warning(f"[Cache] Errore Redis '{key}': {e}")
+            # Redis just failed: skip the re-warm below rather than waiting
+            # on a second call to the same server.
+            l1_usable = False
+        else:
+            if raw is not None:
+                try:
+                    try:
+                        payload = gzip.decompress(raw)
+                    except OSError:
+                        # Not gzip data: values written before compression
+                        # was introduced are plain JSON.
+                        payload = raw
+                    return json.loads(payload.decode("utf-8"))
+                except Exception as e:
+                    # Unreadable L1 payload: fall through to L2.
+                    logger.warning(f"[Cache] Errore Redis '{key}': {e}")
+
+    # L2
+    value = _disk_get(key)
+    if value is not None and l1_usable:
+        # Re-warm L1 with the standard TTL.
+        try:
+            raw_gz = gzip.compress(json.dumps(value).encode("utf-8"))
+            client.setex(key, DEFAULT_REDIS_TTL, raw_gz)
+        except Exception as e:
+            logger.debug(f"[Cache] Re-warm L1 fallito per '{key}': {e}")
+    return value
+
+
+def cache_set(key: str, value: Any, ttl: int = DEFAULT_REDIS_TTL, disk_ttl: Optional[int] = None) -> bool:
+    """Write ``value`` to L1 (Redis) and, unless disabled, to L2 (disk).
+
+    The value is serialised to JSON and gzip-compressed once; the same
+    bytes are stored in both tiers.
+
+    Args:
+        key: Cache key.
+        value: JSON-serialisable value.
+        ttl: L1 (Redis) time-to-live in seconds.
+        disk_ttl: L2 (disk) time-to-live in seconds. ``None`` selects
+            ``DEFAULT_DISK_TTL``; a value ``<= 0`` skips the disk tier
+            (used for ephemeral keys such as job state).
+
+    Returns:
+        ``False`` if serialisation fails, or if the Redis write did not
+        succeed and the disk tier was skipped; ``True`` otherwise.
+    """
+    if disk_ttl is None:
+        disk_ttl = DEFAULT_DISK_TTL
+    try:
+        serialized = json.dumps(value).encode("utf-8")
+        raw_gz = gzip.compress(serialized)
+    except Exception as e:
+        logger.warning(f"[Cache] Errore serializzazione '{key}': {e}")
        return False

+    ok = False
+    # L1
+    client = _get_redis()
+    if client is not None:
+        try:
+            client.setex(key, ttl, raw_gz)
+            ok = True
+        except Exception as e:
+            logger.warning(f"[Cache] Errore scrittura Redis '{key}': {e}")
+
+    # L2
+    if disk_ttl > 0:
+        # NOTE(review): _disk_set returns None and logs its own errors, so
+        # ok becomes True here even when the disk write failed.
+        _disk_set(key, raw_gz, disk_ttl)
+        ok = True
+
+    return ok
+

 def cache_delete(key: str) -> bool:
-    """Elimina una chiave dalla cache Redis.
+    client = _get_redis()
+    if client is not None:
+        try:
+            client.delete(key)
+        except Exception:
+            pass
+    _disk_delete(key)
+    return True

-    Args:
-        key: Chiave Redis da eliminare

-    Returns:
-        True se eliminata, False altrimenti
-    """
+def cache_stats() -> dict:
+    """Ritorna statistiche della cache: utile per /health e debug."""
+    stats = {"redis": False, "disk": {"entries": 0, "bytes": 0, "blobs": 0}}
+    if _get_redis() is not None:
+        stats["redis"] = True
    try:
-        client = _get_client()
-        if client is None:
-            return False
+        with _sqlite_lock:
+            conn = _ensure_sqlite()
+            row = conn.execute(
+                "SELECT COUNT(*), COALESCE(SUM(size_bytes),0), COALESCE(SUM(is_blob),0) FROM cache"
+            ).fetchone()
+        stats["disk"]["entries"] = row[0]
+        stats["disk"]["bytes"] = row[1]
+        stats["disk"]["blobs"] = row[2]
+    except Exception:
+        pass
+    return stats

-        client.delete(key)
-        return True
+
+def cache_sweep() -> int:
+    """Rimuove voci scadute su disco (da chiamare periodicamente). Ritorna numero eliminate."""
+    try:
+        now = int(time.time())
+        with _sqlite_lock:
+            conn = _ensure_sqlite()
+            rows = conn.execute(
+                "SELECT key, blob_path FROM cache WHERE expires_at < ?", (now,)
+            ).fetchall()
+            conn.execute("DELETE FROM cache WHERE expires_at < ?", (now,))
+        for _, path in rows:
+            if path:
+                try:
+                    Path(path).unlink(missing_ok=True)
+                except Exception:
+                    pass
+        return len(rows)
    except Exception as e:
-        logger.warning(f"[Redis] Errore eliminazione chiave '{key}': {e}")
-        return False
+        logger.warning(f"[Cache] Errore sweep: {e}")
+        return 0
--- a/copernicus/core/copernicus.py
+++ b/copernicus/core/copernicus.py
@@ -2,6 +2,7 @@ import hashlib
 import io
 import logging
 import os
+import threading
 from datetime import datetime, timezone
 from typing import Callable, List, Optional

@@ -11,13 +12,20 @@ from core.cache import cache_get, cache_set

 logger = logging.getLogger(__name__)

-# ── Chiavi Redis e TTL ────────────────────────────────────────────────
+# Lock di "single-flight" per il fetch del catalogo Copernicus.
+# Senza questo, N richieste concorrenti con cache miss farebbero N chiamate
+# all'SDK (10-30s ciascuna, ~200MB di response). Con il lock, solo la prima
+# scarica e popola la cache; le altre attendono e leggono da cache.
+_catalog_fetch_lock = threading.Lock()
+
+# ── Chiavi cache e TTL ────────────────────────────────────────────────
 # Chiave per il catalogo completo Copernicus
 _CATALOG_KEY = "marine:catalog:full"
-# TTL del catalogo: 1 ora (il catalogo Copernicus cambia raramente)
-_CATALOG_TTL = 3600
-# TTL per i risultati di ricerca: 30 minuti
-_SEARCH_TTL = 1800
+# TTL L1 (Redis): 2 ore. L2 (disco) usa il default 30 giorni.
+# Il catalogo Copernicus cambia raramente, ha senso tenerlo a lungo su disco.
+_CATALOG_TTL = 2 * 3600
+# TTL L1 per le ricerche utente: 2 ore. Su disco 30 giorni.
+_SEARCH_TTL = 2 * 3600


 def _fmt_description(name: Optional[str]) -> Optional[str]:
@@ -44,10 +52,17 @@ def _get_raw_catalog() -> dict:
        logger.debug("[Catalogo] Servito da cache Redis")
        return cached

-    # Cache miss: interroga Copernicus SDK (operazione lenta, ~10-30s)
-    logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
-    import copernicusmarine
-    catalog = copernicusmarine.describe(disable_progress_bar=True)
+    # Single-flight: solo un thread alla volta scarica il catalogo. Gli altri
+    # attendono il lock e poi leggono il valore appena messo in cache.
+    with _catalog_fetch_lock:
+        cached = cache_get(_CATALOG_KEY)
+        if cached is not None:
+            return cached
+
+        # Cache miss: interroga Copernicus SDK (operazione lenta, ~10-30s)
+        logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
+        import copernicusmarine
+        catalog = copernicusmarine.describe(disable_progress_bar=True)

    # Serializza la risposta SDK in un dizionario standard
    if hasattr(catalog, "model_dump"):
@@ -57,11 +72,11 @@ def _get_raw_catalog() -> dict:
    else:
        result = catalog

-    # Salva in Redis per le prossime richieste (TTL 1 ora)
-    cache_set(_CATALOG_KEY, result, _CATALOG_TTL)
-    logger.info("[Catalogo] Salvato in cache Redis")
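+        # Store the catalog in the two-tier cache for later requests
+        # (L1 Redis TTL 2 hours via _CATALOG_TTL, L2 disk with its default TTL).
+        # NOTE(review): these lines are indented under the `else:` branch above
+        # and outside `_catalog_fetch_lock` — confirm that the `model_dump` path
+        # also caches and returns the result.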
+        cache_set(_CATALOG_KEY, result, _CATALOG_TTL)
+        logger.info("[Catalogo] Salvato in cache Redis")

-    return result
+        return result


 def _get_dataset_reqs(ds: dict) -> tuple:
--- a/copernicus/main.py
+++ b/copernicus/main.py
@@ -12,11 +12,16 @@ from fastapi.middleware.cors import CORSMiddleware
 load_dotenv()

 from routers import catalog, datasets, jobs
+from core.cache import cache_stats, cache_sweep


@asynccontextmanager
 async def lifespan(app: FastAPI):
    api_url = os.getenv("API_SERVICE_URL", "http://api:3003")
+    # Pulizia voci scadute della cache su disco all'avvio
+    removed = cache_sweep()
+    if removed:
+        print(f"[Cache] Rimosse {removed} voci scadute dal disco")
    yield


@@ -50,4 +55,9 @@ async def root():

@app.get("/health", tags=["health"])
 async def health():
-    return {"status": "healthy"}
+    return {"status": "healthy", "cache": cache_stats()}
+
+
+@app.post("/cache/sweep", tags=["health"])
+def sweep():
+    """Remove expired entries from the on-disk cache.
+
+    Declared as a plain ``def`` on purpose: ``cache_sweep()`` takes a
+    threading lock and does blocking SQLite and file I/O, so FastAPI should
+    run it in its threadpool rather than on the event loop.
+
+    Returns:
+        ``{"removed": <number of expired entries deleted>}``.
+    """
+    return {"removed": cache_sweep()}
--- a/copernicus/routers/jobs.py
+++ b/copernicus/routers/jobs.py
@@ -7,6 +7,7 @@ Flusso:

 import json
 import os
+import threading
 import uuid
 from typing import Any, Dict

@@ -24,6 +25,13 @@ API_URL = os.getenv("API_SERVICE_URL", "http://api:3003")
 # TTL per lo stato dei job: 48 ore (i job completati vengono puliti automaticamente)
 _JOB_TTL = 48 * 3600

+# Limite di download Copernicus concorrenti. Le subset() dell'SDK sono
+# CPU + memoria intensive (xarray + netCDF + pandas conversion) e sul server
+# le risorse sono limitate. Senza semaforo, N utenti che cliccano insieme
+# saturano la RAM e fanno OOM-kill del processo.
+_DOWNLOAD_CONCURRENCY = int(os.getenv("MARINE_DOWNLOAD_CONCURRENCY", "2"))
+_download_semaphore = threading.BoundedSemaphore(_DOWNLOAD_CONCURRENCY)
+

 def _job_key(session_id: str) -> str:
    """Genera la chiave Redis per un job."""
@@ -42,7 +50,7 @@ def _set_job(session_id: str, **kwargs):
    if job is None:
        return
    job.update(kwargs)
-    cache_set(_job_key(session_id), job, _JOB_TTL)
+    cache_set(_job_key(session_id), job, _JOB_TTL, disk_ttl=0)


 def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_token: str):
@@ -55,20 +63,26 @@ def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_
        _set_job(session_id, progress=pct, message=msg)

    try:
-        _set_job(session_id, status="downloading", progress=5, message="Scarico da Copernicus Marine...")
+        _set_job(session_id, status="queued", progress=2, message="In coda (max concorrenti raggiunto)...")

-        # Scarica dati dal catalogo Copernicus
-        df = copernicus.download_dataset(
-            dataset_id=req.dataset_id,
-            variables=req.variables,
-            min_longitude=req.min_longitude,
-            max_longitude=req.max_longitude,
-            min_latitude=req.min_latitude,
-            max_latitude=req.max_latitude,
-            start_datetime=req.start_date,
-            end_datetime=req.end_date,
-            progress_callback=progress,
-        )
+        # Acquisisce uno slot di download (blocca se già al limite). Garantisce
+        # che il numero di chiamate Copernicus simultanee non superi
+        # MARINE_DOWNLOAD_CONCURRENCY, proteggendo CPU/RAM del server.
+        with _download_semaphore:
+            _set_job(session_id, status="downloading", progress=5, message="Scarico da Copernicus Marine...")
+
+            # Scarica dati dal catalogo Copernicus
+            df = copernicus.download_dataset(
+                dataset_id=req.dataset_id,
+                variables=req.variables,
+                min_longitude=req.min_longitude,
+                max_longitude=req.max_longitude,
+                min_latitude=req.min_latitude,
+                max_latitude=req.max_latitude,
+                start_datetime=req.start_date,
+                end_datetime=req.end_date,
+                progress_callback=progress,
+            )

        _set_job(session_id, status="converting", progress=80, message="Creo il file...")

@@ -85,7 +99,7 @@ def _run_download(session_id: str, req: DownloadJobRequest, username: str, user_
            "created_by":            username,
            "type":                  req.format,
            "notes":                 req.notes,
-            "copernicus_dataset_id": req.dataset_id,
+            "copernicus_id":         req.dataset_id,
            "variables":             req.variables,
            "variable_renames":      req.variable_renames,
            "bbox":                  [req.min_longitude, req.min_latitude, req.max_longitude, req.max_latitude],
@@ -129,7 +143,7 @@ async def new_download_session(
        "message": "In coda",
        "dataset_id": None,
    }
-    cache_set(_job_key(session_id), initial_state, _JOB_TTL)
+    cache_set(_job_key(session_id), initial_state, _JOB_TTL, disk_ttl=0)

    # Avvia il download in background
    background_tasks.add_task(_run_download, session_id, req, user["username"], user["token"])
--- a/copernicus/schemas.py
+++ b/copernicus/schemas.py
@@ -65,7 +65,7 @@ class DatasetMeta(BaseModel):
    notes: str = ""
    version: int = 1
    filename: str
-    copernicus_dataset_id: str
+    copernicus_id: str
    variables: List[str] = []
    bbox: List[float] = []  # [min_lon, min_lat, max_lon, max_lat]
    start_date: str