feat: initialize microservice architecture with auth, api, realtime, copernicus, ml, and console modules

2026-03-28 15:29:34 +01:00
commit bcfce32adb
89 changed files with 12025 additions and 0 deletions
--- a/copernicus/core/cache.py
+++ b/copernicus/core/cache.py
@@ -0,0 +1,125 @@
+"""
+Redis keys:
+- marine:catalog:full → full dataset list (TTL 1h)
+- marine:catalog:search:{hash} → search results (TTL 30min)
+- marine:job:{session_id} → download job status (TTL 48h)
+"""
+
+import json
+import os
+import logging
+from typing import Any, Optional
+
+import redis
+
+logger = logging.getLogger(__name__)
+
+# Redis settings read from environment variables
+REDIS_HOST = os.getenv("REDIS_HOST", "meb-redis")
+REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
+
+# Shared connection pool and client (thread-safe, reusable); both start as
+# None and are filled in by _get_client()
+_pool: Optional[redis.ConnectionPool] = None
+_client: Optional[redis.Redis] = None
+
+
+def _get_client() -> Optional[redis.Redis]:
+    """Return the shared Redis client, creating it on first use.
+
+    The pool and client are published in the module globals only after a
+    successful ``PING``. When setup or the ping fails, the half-built pool
+    is disconnected, the globals are left as ``None`` and the next call
+    tries again from scratch.
+
+    Returns:
+        The singleton ``redis.Redis`` client, or ``None`` when Redis cannot
+        be reached (callers treat that as "cache disabled").
+    """
+    global _pool, _client
+
+    if _client is not None:
+        return _client
+
+    pool: Optional[redis.ConnectionPool] = None
+    try:
+        pool = redis.ConnectionPool(
+            host=REDIS_HOST,
+            port=REDIS_PORT,
+            # Decode replies into UTF-8 strings automatically
+            decode_responses=True,
+            # At most 5 pooled connections (1-core VPS, more is not needed)
+            max_connections=5,
+            # Connection and socket timeouts to avoid blocking callers
+            socket_connect_timeout=3,
+            socket_timeout=3,
+            # Retry automatically when a command times out
+            retry_on_timeout=True,
+        )
+        client = redis.Redis(connection_pool=pool)
+        # Connection test: only a reachable server gets published below
+        client.ping()
+    except Exception as e:
+        logger.warning(f"[Redis] Non disponibile, la cache è disabilitata: {e}")
+        if pool is not None:
+            try:
+                # Release any socket the failed attempt may have opened
+                pool.disconnect()
+            except Exception:
+                # Cleanup is best-effort; the real failure is already logged
+                pass
+        _pool = None
+        _client = None
+        return None
+
+    _pool, _client = pool, client
+    logger.info("[Redis] Connessione stabilita per il servizio Marine")
+    return _client
+
+
+def cache_get(key: str) -> Optional[Any]:
+    """Fetch a value from the Redis cache and decode it from JSON.
+
+    Args:
+        key: Redis key (e.g. 'marine:catalog:full')
+
+    Returns:
+        The JSON-decoded value, or None when Redis is unavailable, the key
+        is missing, or any error occurs (errors are logged as warnings).
+    """
+    try:
+        redis_client = _get_client()
+        raw = redis_client.get(key) if redis_client is not None else None
+        return None if raw is None else json.loads(raw)
+    except Exception as e:
+        logger.warning(f"[Redis] Errore lettura chiave '{key}': {e}")
+        return None
+
+
+def cache_set(key: str, value: Any, ttl: int = 3600) -> bool:
+    """Store a JSON-encoded value in the Redis cache with an expiry.
+
+    Args:
+        key: Redis key
+        value: Value to serialize with ``json.dumps``
+        ttl: Time to live in seconds (default: 1 hour)
+
+    Returns:
+        True when the value was written; False when Redis is unavailable
+        or serialization/writing failed (failures are logged as warnings).
+    """
+    try:
+        redis_client = _get_client()
+        if redis_client is not None:
+            redis_client.setex(key, ttl, json.dumps(value))
+            return True
+        return False
+    except Exception as e:
+        logger.warning(f"[Redis] Errore scrittura chiave '{key}': {e}")
+        return False
+
+
+def cache_delete(key: str) -> bool:
+    """Issue a DELETE for a key in the Redis cache.
+
+    Args:
+        key: Redis key to remove
+
+    Returns:
+        True when the delete command completed (whether or not the key
+        existed); False when Redis is unavailable or the command failed.
+    """
+    try:
+        redis_client = _get_client()
+        if redis_client is not None:
+            redis_client.delete(key)
+            return True
+        return False
+    except Exception as e:
+        logger.warning(f"[Redis] Errore eliminazione chiave '{key}': {e}")
+        return False
--- a/copernicus/core/copernicus.py
+++ b/copernicus/core/copernicus.py
@@ -0,0 +1,310 @@
+import hashlib
+import io
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Callable, List, Optional
+
+import pandas as pd
+
+from core.cache import cache_get, cache_set
+
+logger = logging.getLogger(__name__)
+
+# ── Redis keys and TTLs ───────────────────────────────────────────────
+# Key for the full Copernicus catalog
+_CATALOG_KEY = "marine:catalog:full"
+# Catalog TTL: 1 hour (the Copernicus catalog rarely changes)
+_CATALOG_TTL = 3600
+# TTL for search results: 30 minutes
+_SEARCH_TTL = 1800
+
+
+def _fmt_description(name: Optional[str]) -> Optional[str]:
+    """Turn an underscore-separated name into a title-cased label.
+
+    Returns None when ``name`` is None or empty.
+    """
+    return name.replace("_", " ").title() if name else None
+
+
+def _get_raw_catalog() -> dict:
+    """Return the full Copernicus dataset catalog, using Redis as a cache.
+
+    Lookup order:
+    1. Read the ``marine:catalog:full`` key from Redis.
+    2. On a miss, call ``copernicusmarine.describe`` and store the result
+       under that key with a 1 hour TTL.
+    3. When Redis is unavailable, ``cache_get`` returns None and
+       ``cache_set`` returns False, so the SDK is called every time.
+
+    NOTE(review): the original note says the cached catalog survives a
+    service restart thanks to RDB+AOF persistence configured in
+    redis.conf; that cannot be verified from this module.
+
+    Returns:
+        The cached value on a hit; otherwise the SDK response converted
+        with ``model_dump()`` or ``__dict__`` when available, or the
+        response object itself.
+    """
+    # Look in Redis before calling the Copernicus SDK
+    cached = cache_get(_CATALOG_KEY)
+    if cached is not None:
+        logger.debug("[Catalogo] Servito da cache Redis")
+        return cached
+
+    # Cache miss: query the SDK (described as a slow call in the original)
+    logger.info("[Catalogo] Cache miss, scaricamento da Copernicus SDK...")
+    import copernicusmarine
+    catalog = copernicusmarine.describe(disable_progress_bar=True)
+
+    # Convert the SDK response into a plain dictionary when possible
+    if hasattr(catalog, "model_dump"):
+        result = catalog.model_dump()
+    elif hasattr(catalog, "__dict__"):
+        result = catalog.__dict__
+    else:
+        result = catalog
+
+    # Only claim the catalog was cached when the write actually succeeded
+    if cache_set(_CATALOG_KEY, result, _CATALOG_TTL):
+        logger.info("[Catalogo] Salvato in cache Redis")
+    else:
+        logger.warning("[Catalogo] Salvataggio in cache Redis non riuscito")
+
+    return result
+
+
+def _epoch_ms_to_date(value) -> Optional[str]:
+    """Convert Unix-epoch milliseconds to a ``YYYY-MM-DD`` UTC date, or None if not convertible."""
+    try:
+        moment = datetime.fromtimestamp(value / 1000, tz=timezone.utc)
+    except (TypeError, ValueError, OverflowError, OSError):
+        # None, non-numeric or out-of-range values: leave the bound unset
+        return None
+    return moment.strftime("%Y-%m-%d")
+
+
+def _get_dataset_reqs(ds: dict) -> tuple:
+    """Extract the available variables and the spatial/temporal bounds of a dataset.
+
+    The layout read here is::
+      dataset -> versions[-1] -> parts[] -> services[] -> variables[]
+
+    Spatial bounds come from the first ``variable.bbox`` with at least four
+    entries, read as [min_lon, min_lat, max_lon, max_lat]. The time window
+    comes from a ``coordinate_id == 'time'`` coordinate of a service whose
+    name contains "arco-time"; its values are treated as Unix-epoch
+    milliseconds.
+
+    Variables are de-duplicated by ``short_name``, but bbox and time
+    coordinates are still inspected on repeated variables. Otherwise a
+    time-series service listed after another service with the same
+    variables would never contribute its time window.
+
+    Args:
+        ds: Dataset dictionary taken from the catalog.
+
+    Returns:
+        ``(variables, bounds)``: a list of variable dicts (short_name,
+        standard_name, units, description) and a dict with
+        min/max longitude/latitude and start/end_datetime; bounds that
+        could not be determined stay None.
+    """
+    variables = []
+    seen: set = set()
+    bounds = {
+        "min_longitude": None, "max_longitude": None,
+        "min_latitude": None, "max_latitude": None,
+        "start_datetime": None, "end_datetime": None,
+    }
+
+    # "or []" also covers keys that are present but hold None
+    versions = ds.get("versions") or []
+    if not versions:
+        return variables, bounds
+
+    for part in versions[-1].get("parts") or []:
+        for service in part.get("services") or []:
+            service_name = service.get("service_name") or ""
+            for var in service.get("variables") or []:
+                short_name = var.get("short_name", "")
+                if not short_name:
+                    continue
+
+                # List each variable once, keyed by short_name
+                if short_name not in seen:
+                    seen.add(short_name)
+                    std = var.get("standard_name")
+                    variables.append({
+                        "short_name": short_name,
+                        "standard_name": std,
+                        "units": var.get("units"),
+                        "description": _fmt_description(std),
+                    })
+
+                # Take the bounding box from the first variable that has one
+                if bounds["min_longitude"] is None:
+                    bbox = var.get("bbox")
+                    if bbox and len(bbox) >= 4:
+                        # [min_lon, min_lat, max_lon, max_lat]
+                        bounds["min_longitude"] = bbox[0]
+                        bounds["min_latitude"] = bbox[1]
+                        bounds["max_longitude"] = bbox[2]
+                        bounds["max_latitude"] = bbox[3]
+
+                # Take the time window from the "arco-time..." service
+                if bounds["start_datetime"] is None and "arco-time" in service_name:
+                    for coord in var.get("coordinates") or []:
+                        if coord.get("coordinate_id") == "time":
+                            start = _epoch_ms_to_date(coord.get("minimum_value"))
+                            end = _epoch_ms_to_date(coord.get("maximum_value"))
+                            if start is not None:
+                                bounds["start_datetime"] = start
+                            if end is not None:
+                                bounds["end_datetime"] = end
+                            break
+
+    return variables, bounds
+
+
+def get_catalog(search: Optional[str] = None, limit: int = 50, offset: int = 0) -> dict:
+    """List datasets from the Copernicus Marine catalog, optionally filtered.
+
+    A dataset matches when ``search`` (case-insensitive) is contained in
+    its dataset_id or in its product title. Variables and bounds are
+    extracted only for the datasets on the requested page.
+
+    Redis cache for searches:
+    - Key: marine:catalog:search:{first 12 hex chars of md5(search|limit|offset)}
+    - TTL: 30 minutes; entries expire on their own and are not tied to the
+      catalog entry's expiry
+    - Requests without ``search`` are not cached here
+
+    Args:
+        search: Substring to look for; None or empty disables filtering.
+        limit: Page size, used as ``results[offset: offset + limit]``.
+        offset: Index of the first dataset of the page.
+
+    Returns:
+        ``{"total", "offset", "limit", "datasets"}`` where ``total`` counts
+        all matching datasets and ``datasets`` holds the page.
+    """
+    # Build a cache key unique to this combination of parameters
+    cache_key = None
+    if search:
+        query_hash = hashlib.md5(f"{search}|{limit}|{offset}".encode()).hexdigest()[:12]
+        cache_key = f"marine:catalog:search:{query_hash}"
+
+        # Look for a cached result first
+        cached_result = cache_get(cache_key)
+        if cached_result is not None:
+            logger.debug(f"[Catalogo] Ricerca '{search}' servita da cache Redis")
+            return cached_result
+
+    raw = _get_raw_catalog()
+    # The catalog may be a list of products or a dict with a "products" key
+    if isinstance(raw, list):
+        products = raw
+    else:
+        products = raw.get("products", [])
+
+    # Lower-case the needle once instead of once per dataset
+    needle = search.lower() if search else None
+
+    # First pass: cheap filtering only; detail extraction waits for the page
+    matches = []
+    for product in products:
+        title = product.get("title", "")
+        description = product.get("description", "")
+        # "or ''" keeps matching safe when a field is present but None
+        title_lc = (title or "").lower()
+
+        for ds in product.get("datasets") or []:
+            dataset_id = ds.get("dataset_id", "")
+
+            if needle is not None:
+                if needle not in (dataset_id or "").lower() and needle not in title_lc:
+                    continue
+
+            matches.append((dataset_id, title, description, ds))
+
+    total = len(matches)
+
+    # Second pass: build the full entries for the requested page only
+    page = []
+    for dataset_id, title, description, ds in matches[offset: offset + limit]:
+        variables, bounds = _get_dataset_reqs(ds)
+        page.append({
+            "dataset_id": dataset_id,
+            "title": title,
+            "description": description[:200] if description else "",
+            "variables": variables,
+            **bounds,
+        })
+
+    response = {"total": total, "offset": offset, "limit": limit, "datasets": page}
+
+    # Cache the result only when a search filter was given
+    if cache_key:
+        cache_set(cache_key, response, _SEARCH_TTL)
+
+    return response
+
+
+def get_dataset_info(dataset_id: str) -> Optional[dict]:
+    """Look up one dataset by exact ``dataset_id`` and return its details.
+
+    Returns:
+        A dict with dataset_id, product title, full product description,
+        variables and the spatial/temporal bounds, or None when no dataset
+        in the catalog has that id.
+    """
+    catalog = _get_raw_catalog()
+    products = catalog if isinstance(catalog, list) else catalog.get("products", [])
+
+    for product in products:
+        for entry in product.get("datasets", []):
+            if entry.get("dataset_id") != dataset_id:
+                continue
+            variables, bounds = _get_dataset_reqs(entry)
+            info = {
+                "dataset_id": dataset_id,
+                "title": product.get("title", ""),
+                "description": product.get("description", ""),
+                "variables": variables,
+            }
+            # Bounds keys are appended after the descriptive fields
+            info.update(bounds)
+            return info
+    return None
+
+
+def download_dataset(
+    dataset_id: str,
+    variables: List[str],
+    min_longitude: float,
+    max_longitude: float,
+    min_latitude: float,
+    max_latitude: float,
+    start_datetime: str,
+    end_datetime: str,
+    progress_callback: Optional[Callable[[int, str], None]] = None
+) -> pd.DataFrame:
+    """Download a subset of a Copernicus Marine dataset as a DataFrame.
+
+    The subset is written by ``copernicusmarine.subset`` to ``data.nc`` in a
+    temporary directory, opened with xarray, flattened with
+    ``to_dataframe().reset_index()`` and the temporary files are removed.
+
+    Args:
+        dataset_id: Copernicus dataset identifier.
+        variables: Variable names to download.
+        min_longitude, max_longitude, min_latitude, max_latitude: Area.
+        start_datetime, end_datetime: Time window, passed to the SDK as-is.
+        progress_callback: Optional ``callback(percent, message)`` invoked
+            at 5, 50 and 75 percent.
+
+    Returns:
+        The downloaded data as a pandas DataFrame.
+
+    Raises:
+        ValueError: When COPERNICUS_USERNAME / COPERNICUS_PASSWORD are not
+            set, or when the resulting DataFrame is empty.
+    """
+    import tempfile
+
+    import copernicusmarine
+    import xarray as xr
+
+    if progress_callback:
+        progress_callback(5, "Avvio dowload...")
+
+    # The Copernicus SDK requires user credentials; read them once
+    username = os.getenv("COPERNICUS_USERNAME")
+    password = os.getenv("COPERNICUS_PASSWORD")
+    if not username or not password:
+        raise ValueError("non sono presenti username e password per copernicus.")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Arguments shared by both call variants below
+        subset_kwargs = dict(
+            dataset_id=dataset_id,
+            variables=variables,
+            minimum_longitude=min_longitude,
+            maximum_longitude=max_longitude,
+            minimum_latitude=min_latitude,
+            maximum_latitude=max_latitude,
+            start_datetime=start_datetime,
+            end_datetime=end_datetime,
+            username=username,
+            password=password,
+            output_directory=tmpdir,
+            output_filename="data.nc",
+            disable_progress_bar=True,
+        )
+        try:
+            copernicusmarine.subset(
+                **subset_kwargs,
+                force_download=True,
+                overwrite_output_data=True,
+            )
+        except TypeError:
+            # Fallback for SDK versions that take `overwrite` instead of
+            # `force_download` / `overwrite_output_data`.
+            # NOTE(review): a TypeError raised inside subset() for any other
+            # reason also lands here and triggers a second attempt.
+            copernicusmarine.subset(**subset_kwargs, overwrite=True)
+
+        if progress_callback:
+            progress_callback(50, "Download completato, elaboro i dati...")
+
+        # The context manager closes the file even if the conversion fails,
+        # and before the temporary directory is removed
+        with xr.open_dataset(os.path.join(tmpdir, "data.nc")) as ds:
+            df = ds.to_dataframe().reset_index()
+
+    if df.empty:
+        raise ValueError("Nessun dato disponibile. errore nel download")
+
+    if progress_callback:
+        progress_callback(75, "Elaborazione completata, formatto i dati...")
+
+    return df
+
+
+def dataframe_to_bytes(df: pd.DataFrame, fmt: str, variable_renames: dict = None) -> tuple:
+    """Serialize a DataFrame to UTF-8 bytes as CSV or JSON.
+
+    Args:
+        df: Data to serialize.
+        fmt: "csv" produces CSV (index column included); any other value
+            produces a JSON array of records with ISO dates, indented by 2.
+        variable_renames: Optional column rename mapping applied first.
+
+    Returns:
+        ``(payload_bytes, mime_type)`` where mime_type is "text/csv" or
+        "application/json".
+    """
+    frame = df.rename(columns=variable_renames) if variable_renames else df
+    text = io.StringIO()
+    if fmt != "csv":
+        frame.to_json(text, orient="records", date_format="iso", indent=2)
+        mime = "application/json"
+    else:
+        frame.to_csv(text, index=True)
+        mime = "text/csv"
+    return text.getvalue().encode("utf-8"), mime
--- a/copernicus/core/storage.py
+++ b/copernicus/core/storage.py
@@ -0,0 +1,112 @@
+import io
+import json
+import os
+from typing import Any, Optional
+
+from minio.error import S3Error
+
+from minio import Minio
+
+# MinIO connection settings read from environment variables
+_minio_host = os.getenv("MINIO_ENDPOINT", "minio")
+_minio_port = os.getenv("MINIO_PORT", "9000")
+# NOTE(review): the credential fallbacks below are hard-coded; presumably
+# meant for local development only — confirm they are overridden in deployment
+MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "meb-admin")
+MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "meb-cloud")
+# True only when MINIO_SECURE is the string "true" (case-insensitive)
+MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
+
+# Bucket holding the dataset files and the name of the metadata object in it
+DATASETS_BUCKET = "datasets"
+METADATA_FILE = "metadata.json"
+
+# Module-level MinIO client, created on first use by get_client()
+_client: Optional[Minio] = None
+
+
+def get_client() -> Minio:
+    """Return the module-wide MinIO client, building it on the first call."""
+    global _client
+    if _client is not None:
+        return _client
+
+    endpoint = f"{_minio_host}:{_minio_port}"
+    _client = Minio(
+        endpoint,
+        access_key=MINIO_ACCESS_KEY,
+        secret_key=MINIO_SECRET_KEY,
+        secure=MINIO_SECURE,
+    )
+    return _client
+
+
+def bucket_exists(bucket: str = DATASETS_BUCKET) -> bool:
+    """Make sure ``bucket`` exists, creating it when it is missing.
+
+    Args:
+        bucket: Bucket name (defaults to the datasets bucket).
+
+    Returns:
+        True when the bucket exists or was created; False when checking or
+        creating it failed (the error is logged as a warning).
+    """
+    import logging
+
+    try:
+        client = get_client()
+        if not client.bucket_exists(bucket):
+            client.make_bucket(bucket)
+        return True
+    except Exception as e:
+        # Report through logging, like the sibling modules, instead of stdout
+        logging.getLogger(__name__).warning(f"[Storage] Error in '{bucket}': {e}")
+        return False
+
+
+def fetch_metadata() -> dict:
+    """Read and parse the shared metadata file of the datasets bucket.
+
+    The datasets bucket holds one JSON metadata file covering all stored
+    dataset files, whatever their format (JSON, CSV or other); the
+    per-file entries live inside ``metadata.json``.
+
+    Returns:
+        The parsed JSON document. ``{"datasets": []}`` is returned when the
+        object does not exist yet, and also when reading or parsing fails
+        for a non-S3 reason (that case is logged as a warning).
+
+    Raises:
+        S3Error: For any S3 error other than "NoSuchKey".
+    """
+    import logging
+
+    response = None
+    try:
+        response = get_client().get_object(DATASETS_BUCKET, METADATA_FILE)
+        return json.loads(response.read().decode("utf-8"))
+    except S3Error as e:
+        if e.code == "NoSuchKey":
+            return {"datasets": []}
+        raise
+    except Exception as e:
+        # Keep the best-effort fallback, but do not hide the failure
+        logging.getLogger(__name__).warning(
+            f"[Storage] Could not read '{METADATA_FILE}', returning empty metadata: {e}"
+        )
+        return {"datasets": []}
+    finally:
+        # Close the stream and return the connection to the pool on every path
+        if response is not None:
+            response.close()
+            response.release_conn()
+
+
+def write_metadata(data: dict) -> None:
+    """Store ``data`` as the metadata JSON object, replacing its previous content.
+
+    The whole document is serialized (UTF-8, non-ASCII kept, indent 2) and
+    uploaded; callers are expected to pass the complete metadata.
+    """
+    storage = get_client()
+    payload = json.dumps(data, ensure_ascii=False, indent=2).encode("utf-8")
+    stream = io.BytesIO(payload)
+    storage.put_object(
+        DATASETS_BUCKET,
+        METADATA_FILE,
+        stream,
+        length=len(payload),
+        content_type="application/json",
+    )
+
+
+def upload_file(data: bytes, filename: str, content_type: str) -> None:
+    """Upload ``data`` to the datasets bucket under ``filename`` with the given content type."""
+    storage = get_client()
+    stream = io.BytesIO(data)
+    size = len(data)
+    storage.put_object(
+        DATASETS_BUCKET,
+        filename,
+        stream,
+        length=size,
+        content_type=content_type,
+    )
+
+
+def delete_file(filename: str) -> None:
+    """Remove ``filename`` from the datasets bucket."""
+    get_client().remove_object(DATASETS_BUCKET, filename)
+
+
+def get_presigned_url(filename: str, expires_hours: int = 1) -> str:
+    """Build a temporary download URL for a file in the datasets bucket.
+
+    Args:
+        filename: Object name inside the datasets bucket.
+        expires_hours: Validity of the URL in hours (default: 1).
+
+    Returns:
+        The presigned GET URL.
+    """
+    from datetime import timedelta
+
+    storage = get_client()
+    validity = timedelta(hours=expires_hours)
+    return storage.presigned_get_object(DATASETS_BUCKET, filename, expires=validity)
+
+
+def file_exists(filename: str) -> bool:
+    """Report whether ``filename`` can be stat-ed in the datasets bucket.
+
+    Any ``S3Error`` from the stat call is reported as False, not only a
+    missing object.
+    """
+    try:
+        get_client().stat_object(DATASETS_BUCKET, filename)
+    except S3Error:
+        return False
+    return True