
Naive Bayes

Module: naive_bayes.py

This module implements Naive Bayes classification algorithms, including:
  • Gaussian Naive Bayes
  • Multinomial Naive Bayes
  • Bernoulli Naive Bayes

It provides functionality to (see the usage sketch after this list):
  • Train Naive Bayes classifiers on a dataset
  • Evaluate model performance using classification metrics
  • Visualize results with a confusion matrix
  • Optimize hyperparameters using grid search
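
A minimal end-to-end sketch of that workflow on the Iris dataset; the import path below mirrors the source location shown on this page, and the exact package layout is an assumption:

from sklearn.datasets import load_iris

from scirex.core.ml.supervised.classification.naive_bayes import NaiveBayes  # assumed import path

# Iris has continuous features, so the Gaussian variant is a natural fit
X, y = load_iris(return_X_y=True)

clf = NaiveBayes(model_type="gaussian", random_state=42)
metrics = clf.run(data=X, labels=y, test_size=0.2)  # splits, fits, evaluates, and plots
print(metrics)  # {'accuracy': ..., 'precision': ..., 'recall': ..., 'f1_score': ...}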

Classes:
  • NaiveBayes: Implements Naive Bayes classification using scikit-learn.

Dependencies
  • numpy
  • sklearn.naive_bayes.GaussianNB
  • sklearn.naive_bayes.MultinomialNB
  • sklearn.naive_bayes.BernoulliNB
  • sklearn.metrics (classification metrics)
  • sklearn.model_selection (train_test_split, GridSearchCV)
  • matplotlib, seaborn (confusion-matrix plotting)
  • joblib (model persistence)
  • base.py (Classification)
Key Features
  • Support for Gaussian, Multinomial, and Bernoulli Naive Bayes
  • Grid search for hyperparameter tuning
  • Automatic data preparation and evaluation
Authors
  • Protyush P. Chowdhury (protyushc@iisc.ac.in)
Version Info
  • 28/Dec/2024: Initial version

NaiveBayes

Bases: Classification

Implements Naive Bayes classification for Gaussian, Multinomial, and Bernoulli distributions.

Source code in scirex/core/ml/supervised/classification/naive_bayes.py
class NaiveBayes(Classification):
    """
    Implements Naive Bayes classification for Gaussian, Multinomial, and Bernoulli distributions.
    """

    def __init__(self, model_type: str = "gaussian", random_state: int = 42) -> None:
        """
        Initialize the NaiveBayes classifier.

        Args:
            model_type (str): Type of Naive Bayes classifier. Options are:
                              "gaussian", "multinomial", "bernoulli".
            random_state (int): Seed for reproducibility where applicable.
        """
        super().__init__(model_type=model_type, random_state=random_state)

        if model_type == "gaussian":
            self.model = GaussianNB()
        elif model_type == "multinomial":
            self.model = MultinomialNB()
        elif model_type == "bernoulli":
            self.model = BernoulliNB()
        else:
            raise ValueError(
                "Invalid model_type. Choose 'gaussian', 'multinomial', or 'bernoulli'."
            )

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """
        Train the Naive Bayes model.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
        """
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
        """
        Evaluate the model on test data.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.

        Returns:
            Dict[str, Any]: Dictionary containing evaluation metrics (accuracy, precision, recall, F1-score).
        """
        y_pred = self.model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        return {
            "accuracy": report["accuracy"],
            "precision": report["weighted avg"]["precision"],
            "recall": report["weighted avg"]["recall"],
            "f1_score": report["weighted avg"]["f1-score"],
        }

    def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
        """
        Plot the confusion matrix for the test data.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.
        """
        y_pred = self.model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=np.unique(y_test),
            yticklabels=np.unique(y_test),
        )
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix - {self.model_type.capitalize()} Naive Bayes")
        plt.show()

    def grid_search(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        param_grid: Dict[str, Any],
        cv: int = 5,
    ) -> None:
        """
        Perform hyperparameter tuning using grid search.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
            param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
            cv (int): Number of cross-validation folds. Default is 5.
        """
        grid = GridSearchCV(
            estimator=self.model, param_grid=param_grid, scoring="accuracy", cv=cv
        )
        grid.fit(X_train, y_train)
        self.model = grid.best_estimator_
        print(f"Best Parameters: {grid.best_params_}")
        print(f"Best Cross-Validated Accuracy: {grid.best_score_}")

    def run(
        self,
        data: np.ndarray,
        labels: np.ndarray,
        test_size: float = 0.2,
        param_grid: Dict[str, Any] = None,
        cv: int = 5,
    ) -> Dict[str, Any]:
        """
        Execute the full classification pipeline with optional grid search.

        Args:
            data (np.ndarray): Input features.
            labels (np.ndarray): Input labels.
            test_size (float): Proportion of data to use for testing. Defaults to 0.2.
            param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None, grid search will not be performed.
            cv (int): Number of cross-validation folds for grid search. Default is 5.

        Returns:
            Dict[str, Any]: Performance metrics.
        """
        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=test_size, random_state=self.random_state
        )

        # Perform grid search if param_grid is provided
        if param_grid:
            self.grid_search(X_train, y_train, param_grid, cv=cv)

        # Train the model with the (possibly tuned) parameters
        self.fit(X_train, y_train)

        # Evaluate the model
        metrics = self.evaluate(X_test, y_test)

        # Plot the confusion matrix
        self.plot(X_test, y_test)

        return metrics

    # Implementing the abstract method get_model_params
    def get_model_params(self) -> Dict[str, Any]:
        """
        Return the parameters of the model.

        Returns:
            Dict[str, Any]: Dictionary containing model parameters.
        """
        # GaussianNB does not define alpha/fit_prior/class_prior, so return
        # scikit-learn's get_params(), which works for all three variants.
        return self.model.get_params()

    # Method to save the model to a file
    def save_model(self, file_path: str) -> None:
        """
        Save the trained model to a file.

        Args:
            file_path (str): Path where the model will be saved.
        """
        joblib.dump(self.model, file_path)
        print(f"Model saved to {file_path}")

__init__(model_type='gaussian', random_state=42)

Initialize the NaiveBayes classifier.

Parameters:
  • model_type (str, default 'gaussian'): Type of Naive Bayes classifier. Options are: "gaussian", "multinomial", "bernoulli".
  • random_state (int, default 42): Seed for reproducibility where applicable.
Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def __init__(self, model_type: str = "gaussian", random_state: int = 42) -> None:
    """
    Initialize the NaiveBayes classifier.

    Args:
        model_type (str): Type of Naive Bayes classifier. Options are:
                          "gaussian", "multinomial", "bernoulli".
        random_state (int): Seed for reproducibility where applicable.
    """
    super().__init__(model_type=model_type, random_state=random_state)

    if model_type == "gaussian":
        self.model = GaussianNB()
    elif model_type == "multinomial":
        self.model = MultinomialNB()
    elif model_type == "bernoulli":
        self.model = BernoulliNB()
    else:
        raise ValueError(
            "Invalid model_type. Choose 'gaussian', 'multinomial', or 'bernoulli'."
        )
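
A short sketch of how the three variants are typically chosen (a rule of thumb, not enforced by the class); the import path is assumed:

from scirex.core.ml.supervised.classification.naive_bayes import NaiveBayes  # assumed import path

gnb = NaiveBayes(model_type="gaussian")     # continuous, real-valued features
mnb = NaiveBayes(model_type="multinomial")  # non-negative counts, e.g. bag-of-words
bnb = NaiveBayes(model_type="bernoulli")    # binary 0/1 features

# Any other string raises ValueError:
# NaiveBayes(model_type="poisson")  # ValueError: Invalid model_type. ...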

evaluate(X_test, y_test)

Evaluate the model on test data.

Parameters:
  • X_test (ndarray, required): Test data features.
  • y_test (ndarray, required): Test data labels.

Returns:
  • Dict[str, Any]: Dictionary containing evaluation metrics (accuracy, precision, recall, F1-score).

Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
    """
    Evaluate the model on test data.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.

    Returns:
        Dict[str, Any]: Dictionary containing evaluation metrics (accuracy, precision, recall, F1-score).
    """
    y_pred = self.model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    return {
        "accuracy": report["accuracy"],
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1_score": report["weighted avg"]["f1-score"],
    }
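
A sketch of a typical call; note that precision, recall, and F1 are the weighted averages from classification_report. The dataset here is only illustrative:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from scirex.core.ml.supervised.classification.naive_bayes import NaiveBayes  # assumed import path

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = NaiveBayes(model_type="gaussian")
clf.fit(X_train, y_train)

metrics = clf.evaluate(X_test, y_test)
print(f"accuracy={metrics['accuracy']:.3f}  weighted F1={metrics['f1_score']:.3f}")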

fit(X_train, y_train)

Train the Naive Bayes model.

Parameters:
  • X_train (ndarray, required): Training data features.
  • y_train (ndarray, required): Training data labels.
Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
    """
    Train the Naive Bayes model.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
    """
    self.model.fit(X_train, y_train)
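
A brief sketch of standalone training with the multinomial variant on synthetic count data (the data is purely illustrative; the import path is assumed):

import numpy as np

from scirex.core.ml.supervised.classification.naive_bayes import NaiveBayes  # assumed import path

rng = np.random.default_rng(0)
X_train = rng.integers(0, 10, size=(100, 20))  # MultinomialNB expects non-negative counts
y_train = rng.integers(0, 2, size=100)

clf = NaiveBayes(model_type="multinomial")
clf.fit(X_train, y_train)
print(clf.model.predict(X_train[:5]))  # predictions from the underlying scikit-learn estimator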

get_model_params()

Return the parameters of the model.

Returns:
  • Dict[str, Any]: Dictionary containing model parameters.

Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def get_model_params(self) -> Dict[str, Any]:
    """
    Return the parameters of the model.

    Returns:
        Dict[str, Any]: Dictionary containing model parameters.
    """
    # GaussianNB does not define alpha/fit_prior/class_prior, so return
    # scikit-learn's get_params(), which works for all three variants.
    return self.model.get_params()

grid_search(X_train, y_train, param_grid, cv=5)

Perform hyperparameter tuning using grid search.

Parameters:
  • X_train (ndarray, required): Training data features.
  • y_train (ndarray, required): Training data labels.
  • param_grid (Dict[str, Any], required): Dictionary of hyperparameters to search.
  • cv (int, default 5): Number of cross-validation folds.
Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def grid_search(
    self,
    X_train: np.ndarray,
    y_train: np.ndarray,
    param_grid: Dict[str, Any],
    cv: int = 5,
) -> None:
    """
    Perform hyperparameter tuning using grid search.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
        param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
        cv (int): Number of cross-validation folds. Default is 5.
    """
    grid = GridSearchCV(
        estimator=self.model, param_grid=param_grid, scoring="accuracy", cv=cv
    )
    grid.fit(X_train, y_train)
    self.model = grid.best_estimator_
    print(f"Best Parameters: {grid.best_params_}")
    print(f"Best Cross-Validated Accuracy: {grid.best_score_}")

plot(X_test, y_test)

Plot the confusion matrix for the test data.

Parameters:
  • X_test (ndarray, required): Test data features.
  • y_test (ndarray, required): Test data labels.
Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
    """
    Plot the confusion matrix for the test data.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.
    """
    y_pred = self.model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=np.unique(y_test),
        yticklabels=np.unique(y_test),
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix - {self.model_type.capitalize()} Naive Bayes")
    plt.show()

run(data, labels, test_size=0.2, param_grid=None, cv=5)

Execute the full classification pipeline with optional grid search.

Parameters:
  • data (ndarray, required): Input features.
  • labels (ndarray, required): Input labels.
  • test_size (float, default 0.2): Proportion of data to use for testing.
  • param_grid (Dict[str, Any], optional, default None): Dictionary of hyperparameters for grid search. If None, grid search is not performed.
  • cv (int, default 5): Number of cross-validation folds for grid search.

Returns:
  • Dict[str, Any]: Performance metrics.

Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def run(
    self,
    data: np.ndarray,
    labels: np.ndarray,
    test_size: float = 0.2,
    param_grid: Dict[str, Any] = None,
    cv: int = 5,
) -> Dict[str, Any]:
    """
    Execute the full classification pipeline with optional grid search.

    Args:
        data (np.ndarray): Input features.
        labels (np.ndarray): Input labels.
        test_size (float): Proportion of data to use for testing. Defaults to 0.2.
        param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None, grid search will not be performed.
        cv (int): Number of cross-validation folds for grid search. Default is 5.

    Returns:
        Dict[str, Any]: Performance metrics.
    """
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=test_size, random_state=self.random_state
    )

    # Perform grid search if param_grid is provided
    if param_grid:
        self.grid_search(X_train, y_train, param_grid, cv=cv)

    # Train the model with the (possibly tuned) parameters
    self.fit(X_train, y_train)

    # Evaluate the model
    metrics = self.evaluate(X_test, y_test)

    # Plot the confusion matrix
    self.plot(X_test, y_test)

    return metrics
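
A sketch of the full pipeline with grid search enabled; the dataset and grid values are placeholders, and the import path is assumed:

from sklearn.datasets import load_breast_cancer

from scirex.core.ml.supervised.classification.naive_bayes import NaiveBayes  # assumed import path

X, y = load_breast_cancer(return_X_y=True)

clf = NaiveBayes(model_type="gaussian", random_state=42)
metrics = clf.run(
    data=X,
    labels=y,
    test_size=0.2,
    param_grid={"var_smoothing": [1e-11, 1e-9, 1e-7]},  # tuned with GridSearchCV before the final fit
    cv=5,
)
print(metrics)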

save_model(file_path)

Save the trained model to a file.

Parameters:
  • file_path (str, required): Path where the model will be saved.
Source code in scirex/core/ml/supervised/classification/naive_bayes.py
def save_model(self, file_path: str) -> None:
    """
    Save the trained model to a file.

    Args:
        file_path (str): Path where the model will be saved.
    """
    joblib.dump(self.model, file_path)
    print(f"Model saved to {file_path}")