Files
OLD-server-architecture/copernicus/core/copernicus.py
Giuseppe Raffa 0ce879aa44 feat: Add new API endpoints and HTML pages for ML model management
- Implemented HTML pages for datasets, models, training, testing, and results.
- Created API endpoints for managing repositories, results, tests, and training sessions.
- Added functionality for streaming training progress via Server-Sent Events (SSE).
- Introduced a Dockerfile for the ML runner with necessary dependencies.
- Developed an SDK for user code execution within the runner container.
- Enhanced CSS styles for improved UI layout and navigation.
- Established a layout template for consistent HTML structure across pages.
- Added JavaScript for dynamic interactions on the models page.
- Implemented WebSocket handling for real-time communication with kiosk devices and controllers.
- Implemented model registration and management API at /api/models
- Added Gitea proxy API for repository interactions at /api/repos
- Created results API for listing and comparing training results at /api/results
- Developed training management API for enqueueing and retrieving training jobs at /api/trainings
- Introduced SSE endpoint for live training progress updates
- Added HTML pages for models, datasets, and training management
- Created a Dockerfile for the ML runner with necessary dependencies
- Developed SDK for user code execution within the runner container
- Enhanced CSS styles for improved UI/UX
- Implemented WebSocket communication for real-time device and controller interactions in the kiosk system
2026-04-28 09:24:38 +02:00

326 lines
12 KiB
Python

import hashlib
import io
import logging
import os
import threading
from datetime import datetime, timezone
from typing import Callable, List, Optional
import pandas as pd
from core.cache import cache_get, cache_set
logger = logging.getLogger(__name__)
# Lock di "single-flight" per il fetch del catalogo Copernicus.
# Senza questo, N richieste concorrenti con cache miss farebbero N chiamate
# all'SDK (10-30s ciascuna, ~200MB di response). Con il lock, solo la prima
# scarica e popola la cache; le altre attendono e leggono da cache.
_catalog_fetch_lock = threading.Lock()
# ── Chiavi cache e TTL ────────────────────────────────────────────────
# Chiave per il catalogo completo Copernicus
_CATALOG_KEY = "marine:catalog:full"
# TTL L1 (Redis): 2 ore. L2 (disco) usa il default 30 giorni.
# Il catalogo Copernicus cambia raramente, ha senso tenerlo a lungo su disco.
_CATALOG_TTL = 2 * 3600
# TTL L1 per le ricerche utente: 2 ore. Su disco 30 giorni.
_SEARCH_TTL = 2 * 3600
def _fmt_description(name: Optional[str]) -> Optional[str]:
"""Formatta meglio il titolo del dataset"""
if not name:
return None
return name.replace("_", " ").title()
def _get_raw_catalog() -> dict:
    """Return the full Copernicus Marine catalogue, preferring the cache.

    Steps:
      1. Look up ``_CATALOG_KEY`` in the cache and return it on a hit.
      2. On a miss, take ``_catalog_fetch_lock`` and look again, so that
         callers who waited on the lock reuse what the first caller stored.
      3. If it is still missing, call ``copernicusmarine.describe`` (slow),
         convert the answer to a plain dict when possible, store it with
         ``_CATALOG_TTL`` and return it.

    NOTE(review): the original notes say the cached catalogue survives a
    service restart thanks to the Redis persistence configuration, and that
    the SDK is always called when Redis is unavailable. Neither is visible
    in this function; both depend on ``cache_get`` / ``cache_set``.
    """
    hit = cache_get(_CATALOG_KEY)
    if hit is not None:
        logger.debug("[Catalogo] Servito da cache Redis")
        return hit

    # Single-flight: one thread at a time downloads the catalogue.
    with _catalog_fetch_lock:
        hit = cache_get(_CATALOG_KEY)
        if hit is not None:
            return hit

        # Still a miss: query the Copernicus SDK (slow operation).
        logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
        import copernicusmarine

        described = copernicusmarine.describe(disable_progress_bar=True)

        # Reduce the SDK response to a plain dictionary when it offers a way.
        if hasattr(described, "model_dump"):
            payload = described.model_dump()
        else:
            payload = described.__dict__ if hasattr(described, "__dict__") else described

        # Keep it for subsequent requests.
        cache_set(_CATALOG_KEY, payload, _CATALOG_TTL)
        logger.info("[Catalogo] Salvato in cache Redis")
        return payload
def _get_dataset_reqs(ds: dict) -> tuple:
"""
Ottieni dalla risposta del dataset le variabili disponibili e le coordinate dell'area disponibile.
Attualmente è implementato Copernicus SDK v2, le variabili sono in::
dataset -> versions[-1] -> parts[] -> services[] -> variables[]
Le coordinate sono disponibili in variable.bbox = [min_lon, min_lat, max_lon, max_lat].
La finestra temporale disponibile è nel servizio "arco-time-series"
dove coordinate_id == 'time' (i valori sono in millisecondi, usando Unix epoch).
"""
variables = []
seen: set = set()
bounds = {
"min_longitude": None, "max_longitude": None,
"min_latitude": None, "max_latitude": None,
"start_datetime": None, "end_datetime": None,
}
versions = ds.get("versions", [])
if not versions:
return variables, bounds
for part in versions[-1].get("parts", []):
for service in part.get("services", []):
service_name = service.get("service_name", "")
for var in service.get("variables", []):
short_name = var.get("short_name", "")
if not short_name or short_name in seen:
continue
seen.add(short_name)
std = var.get("standard_name")
variables.append({
"short_name": short_name,
"standard_name": std,
"units": var.get("units"),
"description": _fmt_description(std),
})
# Ottieni la box delle coordinate
if bounds["min_longitude"] is None:
bbox = var.get("bbox")
if bbox and len(bbox) >= 4:
# [min_lon, min_lat, max_lon, max_lat]
bounds["min_longitude"] = bbox[0]
bounds["min_latitude"] = bbox[1]
bounds["max_longitude"] = bbox[2]
bounds["max_latitude"] = bbox[3]
# Ottieni la finestra temporale del dataset dal servizio "arco-time-series"
if bounds["start_datetime"] is None and "arco-time" in service_name:
for coord in var.get("coordinates", []):
if coord.get("coordinate_id") == "time":
min_ms = coord.get("minimum_value")
max_ms = coord.get("maximum_value")
if min_ms is not None:
bounds["start_datetime"] = datetime.fromtimestamp(
min_ms / 1000, tz=timezone.utc
).strftime("%Y-%m-%d")
if max_ms is not None:
bounds["end_datetime"] = datetime.fromtimestamp(
max_ms / 1000, tz=timezone.utc
).strftime("%Y-%m-%d")
break
return variables, bounds
def get_catalog(search: Optional[str] = None, limit: int = 50, offset: int = 0) -> dict:
    """Return one page of Copernicus Marine datasets, optionally filtered.

    A dataset matches when *search* (case-insensitive) is a substring of its
    ``dataset_id`` or of its product's title. Without *search*, every dataset
    matches.

    Caching: only filtered queries are cached, under the key
    ``marine:catalog:search:{first 12 hex chars of md5("search|limit|offset")}``
    with a TTL of ``_SEARCH_TTL``. Nothing in this function invalidates those
    entries earlier than their TTL.

    Args:
        search: Optional substring to look for; falsy means "no filter".
        limit: Maximum number of datasets in the page; negatives are treated as 0.
        offset: Number of matching datasets to skip; negatives are treated as 0.

    Returns:
        ``{"total": <matches>, "offset": ..., "limit": ..., "datasets": [...]}``
        where each dataset dict carries ``dataset_id``, ``title``,
        ``description`` (truncated to 200 characters), ``variables`` and the
        bounds keys produced by ``_get_dataset_reqs``.
    """
    # A negative value would make the slice below count from the end of the
    # result list, returning an unrelated page.
    offset = max(offset, 0)
    limit = max(limit, 0)

    # Build a cache key unique to this combination of parameters.
    cache_key = None
    if search:
        query_hash = hashlib.md5(f"{search}|{limit}|{offset}".encode()).hexdigest()[:12]
        cache_key = f"marine:catalog:search:{query_hash}"
        cached_result = cache_get(cache_key)
        if cached_result is not None:
            logger.debug("[Catalogo] Ricerca '%s' servita da cache Redis", search)
            return cached_result

    raw = _get_raw_catalog()
    # The SDK response may be either a list of products or a dict holding one.
    products = raw if isinstance(raw, list) else raw.get("products", [])

    # Lower-case the needle once instead of once per dataset.
    needle = search.lower() if search else None

    # First pass: cheap filtering only, so that the expensive per-dataset
    # extraction is done just for the datasets that end up in the page.
    matches = []
    for product in products:
        title_lc = (product.get("title") or "").lower()
        for ds in product.get("datasets") or []:
            if needle is not None:
                dataset_id_lc = (ds.get("dataset_id") or "").lower()
                if needle not in dataset_id_lc and needle not in title_lc:
                    continue
            matches.append((product, ds))

    # Second pass: build the full entries for the requested page.
    page = []
    for product, ds in matches[offset: offset + limit]:
        description = product.get("description", "")
        variables, bounds = _get_dataset_reqs(ds)
        page.append({
            "dataset_id": ds.get("dataset_id", ""),
            "title": product.get("title", ""),
            "description": description[:200] if description else "",
            "variables": variables,
            **bounds,
        })

    response = {"total": len(matches), "offset": offset, "limit": limit, "datasets": page}

    # Store the result only when a search filter was given.
    if cache_key:
        cache_set(cache_key, response, _SEARCH_TTL)
    return response
def get_dataset_info(dataset_id: str) -> Optional[dict]:
    """Look up one dataset in the catalogue and describe it.

    Returns a dict with ``dataset_id``, the owning product's ``title`` and
    ``description``, the dataset's ``variables`` and its bounds keys, or
    ``None`` when no dataset has the given id. The first match wins.
    """
    catalog = _get_raw_catalog()
    product_list = catalog if isinstance(catalog, list) else catalog.get("products", [])

    # Lazily walk every (product, dataset) pair and stop at the first match.
    candidates = (
        (product, entry)
        for product in product_list
        for entry in product.get("datasets", [])
        if entry.get("dataset_id") == dataset_id
    )
    found = next(candidates, None)
    if found is None:
        return None

    product, entry = found
    variables, bounds = _get_dataset_reqs(entry)
    info = {
        "dataset_id": dataset_id,
        "title": product.get("title", ""),
        "description": product.get("description", ""),
        "variables": variables,
    }
    info.update(bounds)
    return info
def download_dataset(
    dataset_id: str,
    variables: List[str],
    min_longitude: float,
    max_longitude: float,
    min_latitude: float,
    max_latitude: float,
    start_datetime: str,
    end_datetime: str,
    progress_callback: Optional[Callable[[int, str], None]] = None
) -> pd.DataFrame:
    """Download a subset of a Copernicus Marine dataset as a DataFrame.

    The subset is written by ``copernicusmarine.subset`` to a NetCDF file in
    a temporary directory, opened with xarray, and flattened with
    ``to_dataframe().reset_index()``.

    Args:
        dataset_id: Identifier of the dataset to download.
        variables: Names of the variables to include.
        min_longitude, max_longitude, min_latitude, max_latitude: Area to cut.
        start_datetime, end_datetime: Time window, passed to the SDK as given.
        progress_callback: Optional ``callback(percent, message)`` invoked at
            5, 50 and 75 percent.

    Returns:
        The downloaded data as a pandas DataFrame.

    Raises:
        ValueError: If the ``COPERNICUS_USERNAME`` / ``COPERNICUS_PASSWORD``
            environment variables are not both set, or if the download
            produced no rows.
    """
    if progress_callback:
        progress_callback(5, "Avvio dowload...")

    # The Copernicus SDK requires an authenticated user. Read the credentials
    # once and fail before importing the (heavy) SDK or touching the disk.
    username = os.getenv("COPERNICUS_USERNAME")
    password = os.getenv("COPERNICUS_PASSWORD")
    if not username or not password:
        raise ValueError("non sono presenti username e password per copernicus.")

    import tempfile

    import copernicusmarine
    import xarray as xr

    # Arguments shared by both call variants below.
    subset_args = {
        "dataset_id": dataset_id,
        "variables": variables,
        "minimum_longitude": min_longitude,
        "maximum_longitude": max_longitude,
        "minimum_latitude": min_latitude,
        "maximum_latitude": max_latitude,
        "start_datetime": start_datetime,
        "end_datetime": end_datetime,
        "username": username,
        "password": password,
        "output_filename": "data.nc",
        "disable_progress_bar": True,
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        subset_args["output_directory"] = tmpdir
        try:
            copernicusmarine.subset(
                **subset_args,
                force_download=True,
                overwrite_output_data=True,
            )
        except TypeError:
            # Fallback for copernicusmarine releases whose ``subset`` takes
            # ``overwrite`` instead of ``force_download`` /
            # ``overwrite_output_data``.
            # NOTE(review): a TypeError raised inside the SDK for another
            # reason also lands here and triggers a second attempt.
            copernicusmarine.subset(**subset_args, overwrite=True)

        if progress_callback:
            progress_callback(50, "Download completato, elaboro i dati...")

        # The context manager closes the file even if the conversion fails,
        # so the temporary directory can always be removed afterwards.
        with xr.open_dataset(os.path.join(tmpdir, "data.nc")) as ds:
            df = ds.to_dataframe().reset_index()

    if df is None or df.empty:
        raise ValueError("Nessun dato disponibile. errore nel download")

    if progress_callback:
        progress_callback(75, "Elaborazione completata, formatto i dati...")
    return df
def dataframe_to_bytes(df: pd.DataFrame, fmt: str, variable_renames: Optional[dict] = None) -> tuple:
    """Serialise a DataFrame to UTF-8 encoded CSV or JSON bytes.

    Args:
        df: The data to serialise; it is not modified.
        fmt: ``"csv"`` (matched case-insensitively, ignoring surrounding
            whitespace) selects CSV; any other value selects JSON.
        variable_renames: Optional ``{old_column: new_column}`` mapping
            applied to a renamed copy before serialising.

    Returns:
        ``(payload, mime_type)`` where ``payload`` is ``bytes`` and
        ``mime_type`` is ``"text/csv"`` or ``"application/json"``.
    """
    if variable_renames:
        df = df.rename(columns=variable_renames)

    # Normalise the format so that "CSV" or " csv " do not silently fall
    # through to JSON; non-string values keep falling back to JSON.
    wants_csv = isinstance(fmt, str) and fmt.strip().lower() == "csv"

    buf = io.StringIO()
    if wants_csv:
        df.to_csv(buf, index=True)
        mime_type = "text/csv"
    else:
        df.to_json(buf, orient="records", date_format="iso", indent=2)
        mime_type = "application/json"
    return buf.getvalue().encode("utf-8"), mime_type