Skip to content

GMM

Module: gmm.py

This module provides a Gaussian Mixture Model (GMM) clustering implementation using scikit-learn's GaussianMixture.

It optionally allows a user-defined number of components or automatically scans [2..max_k] for the best silhouette score.

Classes:

Name Description
Gmm

Gaussian Mixture Model clustering with optional user-specified n_components or silhouette-based auto selection.

Dependencies
  • numpy
  • sklearn.mixture.GaussianMixture
  • sklearn.metrics.silhouette_score
  • base.py (Clustering)
Key Features
  • Automatic scanning of [2..max_k] for best silhouette score if n_components is None
  • Final model is stored, along with predicted cluster labels
  • Ties into the base Clustering for plotting/metrics
Authors
  • Debajyoti Sahoo (debajyotis@iisc.ac.in)
Version Info
  • 28/Dec/2024: Initial release

Gmm

Bases: Clustering

Gaussian Mixture Model clustering with optional user-defined 'n_components' or automatic silhouette-based selection.

Attributes:

Name Type Description
n_components Optional[int]

The number of mixture components requested by the user. If provided, the class skips auto-selection and directly uses this many mixture components. The number actually used by the final fitted model is stored in n_components_.

max_k int

Maximum number of components to consider for auto selection if n_components is None.

labels Optional[ndarray]

Cluster/component labels for each data point after fitting.

Source code in scirex/core/ml/unsupervised/clustering/gmm.py
class Gmm(Clustering):
    """
    Gaussian Mixture Model clustering with optional user-defined 'n_components'
    or automatic silhouette-based selection.

    Attributes:
        n_components (Optional[int]):
            The number of mixture components requested by the user.
            If provided, the class will skip auto-selection
            and directly use this many mixture components.
        max_k (int):
            Maximum number of components to consider for auto selection if n_components is None.
        labels (Optional[np.ndarray]):
            Cluster/component labels for each data point after fitting.
        n_components_ (Optional[int]):
            The number of components used by the final fitted model.
        model (Optional[GaussianMixture]):
            The final fitted GaussianMixture instance.
    """

    def __init__(self, n_components: Optional[int] = None, max_k: int = 10) -> None:
        """
        Initialize the Gmm clustering class.

        Args:
            n_components (Optional[int], optional):
                If provided, the model will directly use this many Gaussian components.
                Otherwise, it scans [2..max_k] for the best silhouette score. Defaults to None.
            max_k (int, optional):
                Maximum components to try for auto selection if n_components is None. Defaults to 10.
        """
        super().__init__("gmm")
        self.n_components = n_components
        self.max_k = max_k

        # Populated after fitting
        self.labels: Optional[np.ndarray] = None
        self.n_components_: Optional[int] = None
        self.model: Optional[GaussianMixture] = None

    def fit(self, X: np.ndarray) -> None:
        """
        Fit the GMM model to the data.

        If user-defined n_components is set, skip auto selection.
        Otherwise, fit one candidate model per k in [2..max_k] (capped at
        n_samples - 1), score each candidate with the silhouette coefficient
        (on a random subsample of at most 1000 points), and keep the
        best-scoring candidate as the final model.

        Args:
            X (np.ndarray): Scaled feature matrix of shape (n_samples, n_features).

        Raises:
            ValueError: If X is not 2-dimensional, or if auto selection has no
                candidate k to try (max_k < 2 or fewer than 3 samples).
        """
        X = np.asarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features); got ndim={X.ndim}."
            )
        n_samples = X.shape[0]

        if self.n_components is not None:
            # Use user-specified
            self.n_components_ = self.n_components
            print(f"Fitting GMM with user-defined n_components={self.n_components_}.\n")
            self.model = GaussianMixture(
                n_components=self.n_components_, random_state=self.random_state
            )
            self.labels = self.model.fit_predict(X)
        else:
            # Automatic silhouette-based selection.
            # The silhouette coefficient is only defined for
            # 2 <= n_labels <= n_samples - 1, so never try more components
            # than that regardless of max_k.
            max_candidate = min(self.max_k, n_samples - 1)
            if max_candidate < 2:
                raise ValueError(
                    "Automatic selection needs max_k >= 2 and at least 3 samples; "
                    f"got max_k={self.max_k}, n_samples={n_samples}."
                )

            # Subsampling for silhouette
            rng = np.random.default_rng(self.random_state)
            sample_size = min(1000, n_samples)

            best_score = -np.inf
            best_k = None
            best_model = None
            best_labels = None

            for k in range(2, max_candidate + 1):
                candidate = GaussianMixture(
                    n_components=k, random_state=self.random_state
                )
                labels_candidate = candidate.fit_predict(X)

                if n_samples > sample_size:
                    indices = rng.choice(n_samples, sample_size, replace=False)
                    X_sample = X[indices]
                    labels_sample = labels_candidate[indices]
                else:
                    X_sample = X
                    labels_sample = labels_candidate

                # Validate the labels that are actually scored: the subsample
                # can contain fewer distinct clusters than the full data.
                n_labels = len(np.unique(labels_sample))
                if 1 < n_labels < len(labels_sample):
                    score = silhouette_score(X_sample, labels_sample)
                else:
                    score = -1  # invalid silhouette

                # Strict '>' keeps the smallest k on ties (same as argmax).
                if score > best_score:
                    best_score = score
                    best_k = k
                    best_model = candidate
                    best_labels = labels_candidate

            print(f"Optimal k (silhouette) = {best_k}\n")

            # Reuse the winning candidate instead of refitting it, so the
            # stored model is exactly the one that was scored.
            self.n_components_ = best_k
            self.model = best_model
            self.labels = best_labels

        print(f"GMM fitted with n_components={self.n_components_}.\n")

    def get_model_params(self) -> Dict[str, Any]:
        """
        Get parameters/results of the fitted GMM model.

        Returns:
            Dict[str, Any]:
                - n_components (int): The final number of components used
                - max_k (int): The maximum considered if auto
        """
        return {"n_components": self.n_components_, "max_k": self.max_k}

__init__(n_components=None, max_k=10)

Initialize the Gmm clustering class.

Parameters:

Name Type Description Default
n_components Optional[int]

If provided, the model will directly use this many Gaussian components. Otherwise, it scans [2..max_k] for the best silhouette score. Defaults to None.

None
max_k int

Maximum components to try for auto selection if n_components is None. Defaults to 10.

10
Source code in scirex/core/ml/unsupervised/clustering/gmm.py
def __init__(self, n_components: Optional[int] = None, max_k: int = 10) -> None:
    """
    Set up a Gmm instance and register it with the base class under the name "gmm".

    Args:
        n_components (Optional[int], optional):
            Fixed number of Gaussian components to use. When left as None,
            fitting scans [2..max_k] and keeps the k with the best
            silhouette score. Defaults to None.
        max_k (int, optional):
            Upper bound of the scan used when n_components is None. Defaults to 10.
    """
    super().__init__("gmm")

    # User-supplied configuration
    self.n_components, self.max_k = n_components, max_k

    # Fit results; all stay None until fit() has been called
    self.labels: Optional[np.ndarray] = None
    self.n_components_: Optional[int] = None
    self.model: Optional[GaussianMixture] = None

fit(X)

Fit the GMM model to the data.

If user-defined n_components is set, skip auto selection. Otherwise, compute silhouette scores across [2..max_k] and pick the best.

Parameters:

Name Type Description Default
X ndarray

Scaled feature matrix of shape (n_samples, n_features).

required
Source code in scirex/core/ml/unsupervised/clustering/gmm.py
def fit(self, X: np.ndarray) -> None:
    """
    Fit the GMM model to the data.

    If user-defined n_components is set, skip auto selection.
    Otherwise, fit one candidate model per k in [2..max_k] (capped at
    n_samples - 1), score each candidate with the silhouette coefficient
    (on a random subsample of at most 1000 points), and keep the
    best-scoring candidate as the final model.

    Args:
        X (np.ndarray): Scaled feature matrix of shape (n_samples, n_features).

    Raises:
        ValueError: If X is not 2-dimensional, or if auto selection has no
            candidate k to try (max_k < 2 or fewer than 3 samples).
    """
    X = np.asarray(X, dtype=np.float32)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-dimensional (n_samples, n_features); got ndim={X.ndim}."
        )
    n_samples = X.shape[0]

    if self.n_components is not None:
        # Use user-specified
        self.n_components_ = self.n_components
        print(f"Fitting GMM with user-defined n_components={self.n_components_}.\n")
        self.model = GaussianMixture(
            n_components=self.n_components_, random_state=self.random_state
        )
        self.labels = self.model.fit_predict(X)
    else:
        # Automatic silhouette-based selection.
        # The silhouette coefficient is only defined for
        # 2 <= n_labels <= n_samples - 1, so never try more components
        # than that regardless of max_k.
        max_candidate = min(self.max_k, n_samples - 1)
        if max_candidate < 2:
            raise ValueError(
                "Automatic selection needs max_k >= 2 and at least 3 samples; "
                f"got max_k={self.max_k}, n_samples={n_samples}."
            )

        # Subsampling for silhouette
        rng = np.random.default_rng(self.random_state)
        sample_size = min(1000, n_samples)

        best_score = -np.inf
        best_k = None
        best_model = None
        best_labels = None

        for k in range(2, max_candidate + 1):
            candidate = GaussianMixture(
                n_components=k, random_state=self.random_state
            )
            labels_candidate = candidate.fit_predict(X)

            if n_samples > sample_size:
                indices = rng.choice(n_samples, sample_size, replace=False)
                X_sample = X[indices]
                labels_sample = labels_candidate[indices]
            else:
                X_sample = X
                labels_sample = labels_candidate

            # Validate the labels that are actually scored: the subsample
            # can contain fewer distinct clusters than the full data.
            n_labels = len(np.unique(labels_sample))
            if 1 < n_labels < len(labels_sample):
                score = silhouette_score(X_sample, labels_sample)
            else:
                score = -1  # invalid silhouette

            # Strict '>' keeps the smallest k on ties (same as argmax).
            if score > best_score:
                best_score = score
                best_k = k
                best_model = candidate
                best_labels = labels_candidate

        print(f"Optimal k (silhouette) = {best_k}\n")

        # Reuse the winning candidate instead of refitting it, so the
        # stored model is exactly the one that was scored.
        self.n_components_ = best_k
        self.model = best_model
        self.labels = best_labels

    print(f"GMM fitted with n_components={self.n_components_}.\n")

get_model_params()

Get parameters/results of the fitted GMM model.

Returns:

Type Description
Dict[str, Any]

A dictionary with two entries: n_components (int), the final number of components used; and max_k (int), the maximum number of components considered during auto selection.

Source code in scirex/core/ml/unsupervised/clustering/gmm.py
def get_model_params(self) -> Dict[str, Any]:
    """
    Report the component count of the fitted model and the scan limit.

    Returns:
        Dict[str, Any]:
            - n_components (int): Value of n_components_ (None before fitting)
            - max_k (int): Upper bound used for the automatic scan
    """
    params: Dict[str, Any] = {}
    params["n_components"] = self.n_components_
    params["max_k"] = self.max_k
    return params