Skip to content

Logistic regression

Module: logistic_regression.py

This module implements Logistic Regression for binary and multi-class classification tasks.

It provides functionality to
  • Train Logistic Regression classifiers on a dataset
  • Evaluate model performance using classification metrics
  • Visualize results with a confusion matrix
  • Optimize hyperparameters using grid search

Classes:

Name Description
LogisticRegressionClassifier

Implements Logistic Regression using scikit-learn.

Dependencies
  • numpy
  • sklearn.linear_model.LogisticRegression
  • sklearn.metrics (classification metrics)
  • sklearn.model_selection.GridSearchCV
  • matplotlib, seaborn
  • base.py (Classification)
Key Features
  • Support for binary and multi-class classification
  • Regularization options (L2 with the default lbfgs solver; L1 and ElasticNet require a compatible solver such as saga)
  • Grid search for hyperparameter tuning
  • Automatic data preparation and evaluation
Authors
  • Protyush P. Chowdhury (protyushc@iisc.ac.in)
Version Info
  • 28/Dec/2024: Initial version

LogisticRegressionClassifier

Bases: Classification

Implements Logistic Regression for classification tasks.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py
class LogisticRegressionClassifier(Classification):
    """
    Implements Logistic Regression for classification tasks.

    Wraps scikit-learn's ``LogisticRegression`` (created with the ``lbfgs``
    solver) and adds helpers for training, evaluation, confusion-matrix
    plotting, grid search and an end-to-end ``run`` pipeline.
    """

    def __init__(self, random_state: int = 42) -> None:
        """
        Initialize the Logistic Regression classifier.

        Args:
            random_state (int): Seed for reproducibility. It is forwarded to
                the base class and to the underlying estimator.
        """
        super().__init__(model_type="logistic_regression", random_state=random_state)
        self.model = LogisticRegression(random_state=random_state, solver="lbfgs")

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """
        Train the Logistic Regression model.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
        """
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
        """
        Evaluate the model on test data.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.

        Returns:
            Dict[str, Any]: Dictionary with the keys ``accuracy``,
            ``precision``, ``recall`` and ``f1_score``. Precision, recall and
            F1 are the "weighted avg" entries of the classification report.
        """
        y_pred = self.model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        weighted = report["weighted avg"]
        return {
            "accuracy": report["accuracy"],
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1_score": weighted["f1-score"],
        }

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        """
        Predict the labels for the test data.

        Args:
            X_test (np.ndarray): Test data features.

        Returns:
            np.ndarray: Array of predicted labels.
        """
        return self.model.predict(X_test)

    def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
        """
        Plot the confusion matrix for the test data.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.
        """
        y_pred = self.model.predict(X_test)
        # The matrix axes cover every label seen in y_test OR y_pred. Build
        # that label list once and use it for both the matrix and the tick
        # labels, so a class that is predicted but absent from y_test does
        # not shift or mislabel the ticks.
        labels = np.unique(
            np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
        )
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
        )
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix - Logistic Regression")
        plt.show()

    def grid_search(
        self, X_train: np.ndarray, y_train: np.ndarray, param_grid: Dict[str, Any]
    ) -> None:
        """
        Perform hyperparameter tuning using grid search.

        Runs a 5-fold, accuracy-scored ``GridSearchCV`` over ``param_grid``,
        replaces ``self.model`` with the best estimator found and prints the
        best parameters and score.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
            param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
        """
        grid = GridSearchCV(
            estimator=self.model, param_grid=param_grid, scoring="accuracy", cv=5
        )
        grid.fit(X_train, y_train)
        self.model = grid.best_estimator_
        print(f"Best Parameters: {grid.best_params_}")
        print(f"Best Cross-Validated Accuracy: {grid.best_score_}")

    def run(
        self,
        data: np.ndarray,
        labels: np.ndarray,
        split_ratio: float = 0.2,
        param_grid: Dict[str, Any] = None,
    ) -> Dict[str, Any]:
        """
        Execute the full classification pipeline with optional grid search.

        The pipeline splits the data, optionally tunes hyperparameters, trains
        the model, evaluates it on the held-out split and shows the confusion
        matrix.

        Args:
            data (np.ndarray): Input features.
            labels (np.ndarray): Input labels.
            split_ratio (float): Proportion of data to use for testing. Defaults to 0.2.
            param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None (or empty), grid search will not be performed.

        Returns:
            Dict[str, Any]: Performance metrics.
        """
        # Seed the split with the estimator's random_state so repeated runs
        # with the same seed evaluate on the same train/test partition.
        X_train, X_test, y_train, y_test = train_test_split(
            data,
            labels,
            test_size=split_ratio,
            random_state=getattr(self.model, "random_state", None),
        )

        # Perform grid search if param_grid is provided
        if param_grid:
            self.grid_search(X_train, y_train, param_grid)

        # Train the model with the (possibly tuned) parameters
        self.fit(X_train, y_train)

        # Evaluate the model
        metrics = self.evaluate(X_test, y_test)

        # Plot the confusion matrix
        self.plot(X_test, y_test)

        return metrics

    # Implementing the abstract method get_model_params
    def get_model_params(self) -> Dict[str, Any]:
        """
        Return the parameters of the model.

        Returns:
            Dict[str, Any]: The estimator's ``C``, ``max_iter``, ``solver``
            and ``penalty`` settings.
        """
        return {
            "C": self.model.C,
            "max_iter": self.model.max_iter,
            "solver": self.model.solver,
            "penalty": self.model.penalty,
        }

__init__(random_state=42)

Initialize the Logistic Regression classifier.

Parameters:

Name Type Description Default
random_state int

Seed for reproducibility.

42
Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def __init__(self, random_state: int = 42) -> None:
    """
    Set up the classifier around a scikit-learn ``LogisticRegression`` model.

    Args:
        random_state (int): Seed passed to both the base class and the
            underlying estimator.
    """
    super().__init__(
        model_type="logistic_regression",
        random_state=random_state,
    )
    # The estimator is created with the lbfgs solver.
    self.model = LogisticRegression(solver="lbfgs", random_state=random_state)

evaluate(X_test, y_test)

Evaluate the model on test data.

Parameters:

Name Type Description Default
X_test ndarray

Test data features.

required
y_test ndarray

Test data labels.

required

Returns:

Type Description
Dict[str, Any]

Dict[str, Any]: Dictionary containing evaluation metrics (accuracy, precision, recall, F1-score).

Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
    """
    Score the fitted model against a held-out test set.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.

    Returns:
        Dict[str, Any]: Mapping with the keys ``accuracy``, ``precision``,
        ``recall`` and ``f1_score``; the last three are taken from the
        "weighted avg" row of the classification report.
    """
    predictions = self.model.predict(X_test)
    summary = classification_report(y_test, predictions, output_dict=True)

    accuracy = summary["accuracy"]
    weighted = summary["weighted avg"]
    return {
        "accuracy": accuracy,
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1_score": weighted["f1-score"],
    }

fit(X_train, y_train)

Train the Logistic Regression model.

Parameters:

Name Type Description Default
X_train ndarray

Training data features.

required
y_train ndarray

Training data labels.

required
Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
    """
    Fit the wrapped estimator on the training data.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
    """
    estimator = self.model
    estimator.fit(X_train, y_train)

get_model_params()

Return the parameters of the model.

Returns:

Type Description
Dict[str, Any]

Dict[str, Any]: Dictionary containing model parameters.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def get_model_params(self) -> Dict[str, Any]:
    """
    Report the main settings of the wrapped estimator.

    Returns:
        Dict[str, Any]: The estimator's ``C``, ``max_iter``, ``solver`` and
        ``penalty`` attributes, keyed by those names.
    """
    estimator = self.model
    reported = ("C", "max_iter", "solver", "penalty")
    return {name: getattr(estimator, name) for name in reported}

grid_search(X_train, y_train, param_grid)

Perform hyperparameter tuning using grid search.

Parameters:

Name Type Description Default
X_train ndarray

Training data features.

required
y_train ndarray

Training data labels.

required
param_grid Dict[str, Any]

Dictionary of hyperparameters to search.

required
Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def grid_search(
    self, X_train: np.ndarray, y_train: np.ndarray, param_grid: Dict[str, Any]
) -> None:
    """
    Tune hyperparameters with a cross-validated grid search.

    Runs a 5-fold, accuracy-scored ``GridSearchCV`` over ``param_grid``,
    stores the best estimator on ``self.model`` and prints the best
    parameters and score.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
        param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
    """
    search_options = {
        "estimator": self.model,
        "param_grid": param_grid,
        "scoring": "accuracy",
        "cv": 5,
    }
    grid = GridSearchCV(**search_options)
    grid.fit(X_train, y_train)

    # Keep the winning estimator as the active model.
    self.model = grid.best_estimator_

    print(f"Best Parameters: {grid.best_params_}")
    print(f"Best Cross-Validated Accuracy: {grid.best_score_}")

plot(X_test, y_test)

Plot the confusion matrix for the test data.

Parameters:

Name Type Description Default
X_test ndarray

Test data features.

required
y_test ndarray

Test data labels.

required
Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
    """
    Plot the confusion matrix for the test data.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.
    """
    y_pred = self.model.predict(X_test)
    # The matrix axes cover every label seen in y_test OR y_pred. Build that
    # label list once and use it for both the matrix and the tick labels, so
    # a class that is predicted but absent from y_test does not shift or
    # mislabel the ticks.
    labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Logistic Regression")
    plt.show()

predict(X_test)

Predict the labels for the test data.

Parameters:

Name Type Description Default
X_test ndarray

Test data features.

required

Returns:

Type Description
ndarray

np.ndarray: Array of predicted labels.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def predict(self, X_test: np.ndarray) -> np.ndarray:
    """
    Return the model's predicted label for each sample.

    Args:
        X_test (np.ndarray): Test data features.

    Returns:
        np.ndarray: Array of predicted labels.
    """
    predicted_labels = self.model.predict(X_test)
    return predicted_labels

run(data, labels, split_ratio=0.2, param_grid=None)

Execute the full classification pipeline with optional grid search.

Parameters:

Name Type Description Default
data ndarray

Input features.

required
labels ndarray

Input labels.

required
split_ratio float

Proportion of data to use for testing. Defaults to 0.2.

0.2
param_grid Dict[str, Any]

Dictionary of hyperparameters to search for grid search. If None, grid search will not be performed.

None

Returns:

Type Description
Dict[str, Any]

Dict[str, Any]: Performance metrics.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py
def run(
    self,
    data: np.ndarray,
    labels: np.ndarray,
    split_ratio: float = 0.2,
    param_grid: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """
    Execute the full classification pipeline with optional grid search.

    The pipeline splits the data, optionally tunes hyperparameters, trains
    the model, evaluates it on the held-out split and shows the confusion
    matrix.

    Args:
        data (np.ndarray): Input features.
        labels (np.ndarray): Input labels.
        split_ratio (float): Proportion of data to use for testing. Defaults to 0.2.
        param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None (or empty), grid search will not be performed.

    Returns:
        Dict[str, Any]: Performance metrics.
    """
    # Seed the split with the estimator's random_state so repeated runs with
    # the same seed evaluate on the same train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(
        data,
        labels,
        test_size=split_ratio,
        random_state=getattr(self.model, "random_state", None),
    )

    # Perform grid search if param_grid is provided
    if param_grid:
        self.grid_search(X_train, y_train, param_grid)

    # Train the model with the (possibly tuned) parameters
    self.fit(X_train, y_train)

    # Evaluate the model
    metrics = self.evaluate(X_test, y_test)

    # Plot the confusion matrix
    self.plot(X_test, y_test)

    return metrics