Source code for geolatent.utils.validators

"""Input validation utilities for geolatent.

All public API functions route their inputs through the validators defined here
before any computation begins.  Validation is strict-but-informative: errors
surface with enough context to enable fast diagnosis without requiring the user
to read source code.
"""

from __future__ import annotations

from typing import Any, Dict, Optional, Tuple

import numpy as np


# Data validators


def validate_feature_matrix(
    X: Any,
    *,
    min_samples: int = 4,
    min_features: int = 2,
    name: str = "X",
) -> np.ndarray:
    """Coerce *X* to a finite 2-D float64 array and check its dimensions.

    The checks run in a fixed order (conversion, dimensionality, row count,
    column count, finiteness) and the first one that fails raises.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Candidate feature matrix.
    min_samples : int
        Smallest number of rows that is accepted.
    min_features : int
        Smallest number of columns that is accepted.
    name : str
        Label for the argument, interpolated into every error message.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_features), dtype float64
        The converted matrix.

    Raises
    ------
    TypeError
        If ``np.asarray(X, dtype=np.float64)`` fails with ``TypeError`` or
        ``ValueError``.
    ValueError
        If the converted array is not 2-D, has fewer than *min_samples* rows,
        has fewer than *min_features* columns, or holds NaN / Inf entries.
    """
    try:
        matrix = np.asarray(X, dtype=np.float64)
    except (TypeError, ValueError) as exc:
        # Report the type of the caller's original object, not the failed array.
        raise TypeError(
            f"'{name}' must be convertible to a float64 NumPy array; "
            f"got type {type(X).__name__}."
        ) from exc

    if matrix.ndim != 2:
        raise ValueError(
            f"'{name}' must be a 2-D array of shape (n_samples, n_features), "
            f"got shape {matrix.shape}."
        )

    n_rows, n_cols = matrix.shape

    if n_rows < min_samples:
        raise ValueError(
            f"'{name}' must have at least {min_samples} samples; "
            f"got {n_rows}."
        )

    if n_cols < min_features:
        raise ValueError(
            f"'{name}' must have at least {min_features} features for 3-D "
            f"projection; got {n_cols}."
        )

    # Happy path: a single finiteness scan, no per-kind counting.
    if np.isfinite(matrix).all():
        return matrix

    # Only count NaN and Inf separately once we know the input is bad.
    nan_count = np.isnan(matrix).sum()
    inf_count = np.isinf(matrix).sum()
    raise ValueError(
        f"'{name}' contains {nan_count} NaN value(s) and {inf_count} Inf value(s). "
        "Impute or remove these before calling geolatent."
    )
def validate_label_vector(
    y: Any,
    *,
    n_samples: int,
    name: str = "y",
) -> np.ndarray:
    """Coerce *y* to a 1-D array and confirm it has *n_samples* entries.

    No dtype is forced during conversion, so the element type is whatever
    ``np.asarray`` infers from the input.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels or regression targets.
    n_samples : int
        Required number of entries (the row count of the paired feature
        matrix).
    name : str
        Label for the argument, interpolated into error messages.

    Returns
    -------
    y : np.ndarray of shape (n_samples,)
        The converted vector.

    Raises
    ------
    TypeError
        If ``np.asarray(y)`` fails with ``TypeError`` or ``ValueError``.
    ValueError
        If the converted array is not 1-D, or its length differs from
        *n_samples*.
    """
    try:
        vector = np.asarray(y)
    except (TypeError, ValueError) as exc:
        raise TypeError(
            f"'{name}' must be convertible to a NumPy array; "
            f"got type {type(y).__name__}."
        ) from exc

    if vector.ndim != 1:
        raise ValueError(
            f"'{name}' must be a 1-D array of shape (n_samples,), "
            f"got shape {vector.shape}."
        )

    # The array is known to be 1-D here, so shape[0] is its length.
    n_entries = vector.shape[0]
    if n_entries != n_samples:
        raise ValueError(
            f"Length mismatch: X has {n_samples} samples but '{name}' has "
            f"{n_entries} entries."
        )

    return vector
def validate_classification_labels(y: np.ndarray, *, name: str = "y") -> np.ndarray:
    """Check that *y* holds at least two distinct class labels.

    Parameters
    ----------
    y : np.ndarray
        Label vector (already validated by :func:`validate_label_vector`).
    name : str
        Label for the argument, interpolated into the error message.

    Returns
    -------
    y : np.ndarray
        The very same object that was passed in.

    Raises
    ------
    ValueError
        If ``np.unique(y)`` yields fewer than two values.
    """
    distinct_count = np.unique(y).size
    if distinct_count >= 2:
        return y

    raise ValueError(
        f"'{name}' must contain at least 2 distinct classes for "
        f"classification visualisation; found {distinct_count}."
    )
def validate_embeddings(
    embeddings: Any,
    labels: Any,
) -> Tuple[np.ndarray, np.ndarray]:
    """Validate an ``(embeddings, labels)`` pair for latent-space plotting.

    The embeddings go through :func:`validate_feature_matrix` (at least two
    columns required; the default ``min_samples`` applies). The labels then go
    through :func:`validate_label_vector`, with the expected length taken from
    the validated embeddings.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_dims)
        High-dimensional embedding vectors.
    labels : array-like of shape (n_samples,)
        Integer or string class labels.

    Returns
    -------
    embeddings : np.ndarray of shape (n_samples, n_dims)
    labels : np.ndarray of shape (n_samples,)

    Raises
    ------
    TypeError, ValueError
        Propagated unchanged from the two underlying validators.
    """
    checked_embeddings = validate_feature_matrix(
        embeddings,
        min_features=2,
        name="embeddings",
    )
    # Labels must line up one-to-one with the rows that passed validation.
    expected_rows = len(checked_embeddings)
    checked_labels = validate_label_vector(
        labels,
        n_samples=expected_rows,
        name="labels",
    )
    return checked_embeddings, checked_labels
# Model validators
def validate_sklearn_model(model: Any, *, require_predict_proba: bool = False) -> None:
    """Check that *model* offers the methods geolatent calls on estimators.

    Only the presence and callability of the attributes is checked; the
    methods themselves are never invoked.

    Parameters
    ----------
    model : Any
        Object to inspect.
    require_predict_proba : bool
        When ``True``, a callable ``predict_proba`` attribute is required in
        addition to ``predict``.

    Raises
    ------
    TypeError
        If *model* has no callable ``predict`` attribute.
    AttributeError
        If *require_predict_proba* is ``True`` and *model* has no callable
        ``predict_proba`` attribute.
    """
    model_type = type(model).__name__

    predict = getattr(model, "predict", None)
    if not callable(predict):
        raise TypeError(
            f"model of type '{model_type}' must implement a callable "
            "``predict(X)`` method (sklearn estimator interface required)."
        )

    if not require_predict_proba:
        return

    # predict_proba is only looked up when the caller actually needs it.
    predict_proba = getattr(model, "predict_proba", None)
    if callable(predict_proba):
        return

    raise AttributeError(
        f"model of type '{model_type}' does not implement "
        "``predict_proba``. Use a probabilistic classifier (e.g., "
        "RandomForestClassifier, SVC(probability=True)) or disable "
        "confidence surface rendering."
    )
def validate_class_names(
    class_names: Optional[Dict],
    unique_classes: np.ndarray,
) -> Optional[Dict]:
    """Check the type of an optional class-name mapping.

    Parameters
    ----------
    class_names : dict or None
        Mapping from class label to display string, or ``None`` when the
        caller supplied no mapping.
    unique_classes : np.ndarray
        Array of unique class labels detected in the data. It is part of the
        signature but is not read by this function, so the mapping's keys are
        not compared against it.

    Returns
    -------
    class_names : dict or None
        The argument itself, passed through untouched.

    Raises
    ------
    TypeError
        If *class_names* is neither ``None`` nor a ``dict``.
    """
    if class_names is None or isinstance(class_names, dict):
        return class_names

    raise TypeError(
        f"class_names must be a dict mapping class labels to strings; "
        f"got {type(class_names).__name__}."
    )