Logistic regression

Module: logistic_regression.py

This module implements Logistic Regression for binary and multi-class classification tasks.

It provides functionality to:

Train Logistic Regression classifiers on a dataset
Evaluate model performance using classification metrics
Visualize results with a confusion matrix
Optimize hyperparameters using grid search

Classes:

Name	Description
`LogisticRegressionClassifier`	Implements Logistic Regression using scikit-learn.

Dependencies

numpy
sklearn.linear_model.LogisticRegression
sklearn.metrics (classification metrics)
sklearn.model_selection.GridSearchCV
matplotlib, seaborn
base.py (Classification)

Key Features

Support for binary and multi-class classification
Regularization options (L1, L2, ElasticNet)
Grid search for hyperparameter tuning
Automatic data preparation and evaluation

Authors

Protyush P. Chowdhury (protyushc@iisc.ac.in)

Version Info

28/Dec/2024: Initial version

`LogisticRegressionClassifier`

Bases: Classification

Implements Logistic Regression for classification tasks.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

class LogisticRegressionClassifier(Classification):
    """
    Implements Logistic Regression for classification tasks.

    The underlying estimator is scikit-learn's ``LogisticRegression`` created
    with the ``lbfgs`` solver and stored on ``self.model``. ``grid_search``
    replaces ``self.model`` with the best estimator it finds.
    """

    def __init__(self, random_state: int = 42) -> None:
        """
        Initialize the Logistic Regression classifier.

        Args:
            random_state (int): Seed for reproducibility.
        """
        super().__init__(model_type="logistic_regression", random_state=random_state)
        self.model = LogisticRegression(random_state=random_state, solver="lbfgs")

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """
        Train the Logistic Regression model.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
        """
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
        """
        Evaluate the model on test data.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.

        Returns:
            Dict[str, Any]: Dictionary with the keys ``accuracy``, ``precision``,
            ``recall`` and ``f1_score``. Precision, recall and F1 are taken from
            the "weighted avg" row of the classification report.
        """
        y_pred = self.model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        weighted = report["weighted avg"]
        return {
            "accuracy": report["accuracy"],
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1_score": weighted["f1-score"],
        }

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        """
        Predict the labels for the test data.

        Args:
            X_test (np.ndarray): Test data features.

        Returns:
            np.ndarray: Array of predicted labels.
        """
        return self.model.predict(X_test)

    def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
        """
        Plot the confusion matrix for the test data.

        The matrix rows/columns and the tick labels are both built from the
        union of the true and predicted labels, so the axis labels always line
        up with the cells even when the model predicts a class that does not
        occur in ``y_test``.

        Args:
            X_test (np.ndarray): Test data features.
            y_test (np.ndarray): Test data labels.
        """
        y_pred = self.model.predict(X_test)
        # Sorted union of every label seen in either vector; passing it
        # explicitly keeps the matrix axes and the tick labels in sync.
        class_labels = np.union1d(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels,
        )
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix - Logistic Regression")
        plt.show()

    def grid_search(
        self, X_train: np.ndarray, y_train: np.ndarray, param_grid: Dict[str, Any]
    ) -> None:
        """
        Perform hyperparameter tuning using grid search.

        Runs a 5-fold cross-validated search scored by accuracy, stores the
        best estimator on ``self.model`` and prints the best parameters and
        score.

        Args:
            X_train (np.ndarray): Training data features.
            y_train (np.ndarray): Training data labels.
            param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
        """
        grid = GridSearchCV(
            estimator=self.model, param_grid=param_grid, scoring="accuracy", cv=5
        )
        grid.fit(X_train, y_train)
        self.model = grid.best_estimator_
        print(f"Best Parameters: {grid.best_params_}")
        print(f"Best Cross-Validated Accuracy: {grid.best_score_}")

    def run(
        self,
        data: np.ndarray,
        labels: np.ndarray,
        split_ratio: float = 0.2,
        param_grid: Dict[str, Any] = None,
    ) -> Dict[str, Any]:
        """
        Execute the full classification pipeline with optional grid search.

        Steps: split the data, optionally tune hyperparameters, fit, evaluate
        on the held-out split, and show the confusion matrix.

        Args:
            data (np.ndarray): Input features.
            labels (np.ndarray): Input labels.
            split_ratio (float): Proportion of data to use for testing. Defaults to 0.2.
            param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None (or empty), grid search will not be performed.

        Returns:
            Dict[str, Any]: Performance metrics.
        """
        # Seed the split so repeated runs see the same train/test partition.
        # NOTE(review): assumes the base class stores the seed as
        # ``self.random_state``; if it does not, the split stays unseeded
        # exactly as before.
        split_seed = getattr(self, "random_state", None)
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=split_ratio, random_state=split_seed
        )

        # Perform grid search if param_grid is provided
        if param_grid:
            self.grid_search(X_train, y_train, param_grid)

        # Train the model with the (possibly tuned) parameters
        self.fit(X_train, y_train)

        # Evaluate the model
        metrics = self.evaluate(X_test, y_test)

        # Plot the confusion matrix
        self.plot(X_test, y_test)

        return metrics

    # Implementing the abstract method get_model_params
    def get_model_params(self) -> Dict[str, Any]:
        """
        Return the parameters of the model.

        Returns:
            Dict[str, Any]: Dictionary with the current estimator's ``C``,
            ``max_iter``, ``solver`` and ``penalty`` values.
        """
        return {
            "C": self.model.C,
            "max_iter": self.model.max_iter,
            "solver": self.model.solver,
            "penalty": self.model.penalty,
        }

`__init__(random_state=42)`

Initialize the Logistic Regression classifier.

Parameters:

Name	Type	Description	Default
`random_state`	`int`	Seed for reproducibility.	`42`

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def __init__(self, random_state: int = 42) -> None:
    """
    Set up the classifier and its underlying scikit-learn estimator.

    Args:
        random_state (int): Seed for reproducibility.
    """
    # Forward the model type and the seed to the base-class initializer
    super().__init__(random_state=random_state, model_type="logistic_regression")
    # Build the lbfgs-based estimator with the same seed
    estimator = LogisticRegression(solver="lbfgs", random_state=random_state)
    self.model = estimator

`evaluate(X_test, y_test)`

Evaluate the model on test data.

Parameters:

Name	Type	Description	Default
`X_test`	`ndarray`	Test data features.	required
`y_test`	`ndarray`	Test data labels.	required

Returns:

Type	Description
`Dict[str, Any]`	Dictionary containing evaluation metrics (accuracy, precision, recall, F1-score).

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, Any]:
    """
    Score the fitted model against a held-out test set.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.

    Returns:
        Dict[str, Any]: Accuracy plus the weighted-average precision, recall
        and F1-score, under the keys ``accuracy``, ``precision``, ``recall``
        and ``f1_score``.
    """
    predictions = self.model.predict(X_test)
    summary = classification_report(y_test, predictions, output_dict=True)
    # Per-class scores averaged with class-support weights
    weighted = summary["weighted avg"]
    metrics = {"accuracy": summary["accuracy"]}
    metrics["precision"] = weighted["precision"]
    metrics["recall"] = weighted["recall"]
    metrics["f1_score"] = weighted["f1-score"]
    return metrics

`fit(X_train, y_train)`

Train the Logistic Regression model.

Parameters:

Name	Type	Description	Default
`X_train`	`ndarray`	Training data features.	required
`y_train`	`ndarray`	Training data labels.	required

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
    """
    Fit the underlying estimator on the training set.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
    """
    estimator = self.model
    # The estimator is trained in place; nothing is returned to the caller
    estimator.fit(X_train, y_train)

`get_model_params()`

Return the parameters of the model.

Returns:

Type	Description
`Dict[str, Any]`	Dictionary containing model parameters.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def get_model_params(self) -> Dict[str, Any]:
    """
    Collect the key hyperparameters of the current estimator.

    Returns:
        Dict[str, Any]: The estimator's ``C``, ``max_iter``, ``solver`` and
        ``penalty`` values, keyed by those names.
    """
    estimator = self.model
    reported = ("C", "max_iter", "solver", "penalty")
    # Each reported key is read from the attribute of the same name
    return {name: getattr(estimator, name) for name in reported}

`grid_search(X_train, y_train, param_grid)`

Perform hyperparameter tuning using grid search.

Parameters:

Name	Type	Description	Default
`X_train`	`ndarray`	Training data features.	required
`y_train`	`ndarray`	Training data labels.	required
`param_grid`	`Dict[str, Any]`	Dictionary of hyperparameters to search.	required

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def grid_search(
    self, X_train: np.ndarray, y_train: np.ndarray, param_grid: Dict[str, Any]
) -> None:
    """
    Tune hyperparameters with an exhaustive cross-validated grid search.

    The search uses 5 folds and accuracy as the score. The best estimator
    replaces ``self.model``; the best parameters and score are printed.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data labels.
        param_grid (Dict[str, Any]): Dictionary of hyperparameters to search.
    """
    search = GridSearchCV(
        cv=5,
        scoring="accuracy",
        param_grid=param_grid,
        estimator=self.model,
    )
    search.fit(X_train, y_train)

    # Keep the winning estimator for subsequent fit/predict calls
    self.model = search.best_estimator_

    best_params = search.best_params_
    best_score = search.best_score_
    print(f"Best Parameters: {best_params}")
    print(f"Best Cross-Validated Accuracy: {best_score}")

`plot(X_test, y_test)`

Plot the confusion matrix for the test data.

Parameters:

Name	Type	Description	Default
`X_test`	`ndarray`	Test data features.	required
`y_test`	`ndarray`	Test data labels.	required

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def plot(self, X_test: np.ndarray, y_test: np.ndarray) -> None:
    """
    Plot the confusion matrix for the test data.

    The matrix rows/columns and the tick labels are both built from the union
    of the true and predicted labels, so the axis labels always line up with
    the cells even when the model predicts a class that does not occur in
    ``y_test``.

    Args:
        X_test (np.ndarray): Test data features.
        y_test (np.ndarray): Test data labels.
    """
    y_pred = self.model.predict(X_test)
    # Sorted union of every label seen in either vector; passing it explicitly
    # keeps the matrix axes and the tick labels in sync.
    class_labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix - Logistic Regression")
    plt.show()

`predict(X_test)`

Predict the labels for the test data.

Parameters:

Name	Type	Description	Default
`X_test`	`ndarray`	Test data features.	required

Returns:

Type	Description
`ndarray`	Array of predicted labels.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def predict(self, X_test: np.ndarray) -> np.ndarray:
    """
    Return the model's label predictions for the given samples.

    Args:
        X_test (np.ndarray): Test data features.

    Returns:
        np.ndarray: Array of predicted labels.
    """
    # Delegate straight to the underlying estimator
    predicted_labels = self.model.predict(X_test)
    return predicted_labels

`run(data, labels, split_ratio=0.2, param_grid=None)`

Execute the full classification pipeline with optional grid search.

Parameters:

Name	Type	Description	Default
`data`	`ndarray`	Input features.	required
`labels`	`ndarray`	Input labels.	required
`split_ratio`	`float`	Proportion of data to use for testing. Defaults to 0.2.	`0.2`
`param_grid`	`Dict[str, Any]`	Dictionary of hyperparameters to search for grid search. If None, grid search will not be performed.	`None`

Returns:

Type	Description
`Dict[str, Any]`	Performance metrics.

Source code in scirex/core/ml/supervised/classification/logistic_regression.py

def run(
    self,
    data: np.ndarray,
    labels: np.ndarray,
    split_ratio: float = 0.2,
    param_grid: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """
    Execute the full classification pipeline with optional grid search.

    Steps: split the data, optionally tune hyperparameters, fit, evaluate on
    the held-out split, and show the confusion matrix.

    Args:
        data (np.ndarray): Input features.
        labels (np.ndarray): Input labels.
        split_ratio (float): Proportion of data to use for testing. Defaults to 0.2.
        param_grid (Dict[str, Any], optional): Dictionary of hyperparameters to search for grid search. If None (or empty), grid search will not be performed.

    Returns:
        Dict[str, Any]: Performance metrics.
    """
    # Seed the split so repeated runs see the same train/test partition.
    # NOTE(review): assumes the base class stores the seed as
    # ``self.random_state``; if it does not, the split stays unseeded exactly
    # as before.
    split_seed = getattr(self, "random_state", None)
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=split_ratio, random_state=split_seed
    )

    # Perform grid search if param_grid is provided
    if param_grid:
        self.grid_search(X_train, y_train, param_grid)

    # Train the model with the (possibly tuned) parameters
    self.fit(X_train, y_train)

    # Evaluate the model
    metrics = self.evaluate(X_test, y_test)

    # Plot the confusion matrix
    self.plot(X_test, y_test)

    return metrics

Logistic regression

LogisticRegressionClassifier

__init__(random_state=42)

evaluate(X_test, y_test)

fit(X_train, y_train)

get_model_params()

grid_search(X_train, y_train, param_grid)

plot(X_test, y_test)

predict(X_test)

run(data, labels, split_ratio=0.2, param_grid=None)

`LogisticRegressionClassifier`

`__init__(random_state=42)`

`evaluate(X_test, y_test)`

`fit(X_train, y_train)`

`get_model_params()`

`grid_search(X_train, y_train, param_grid)`

`plot(X_test, y_test)`

`predict(X_test)`

`run(data, labels, split_ratio=0.2, param_grid=None)`