Source code for sparselm.model._miqp._regularized_l0

"""MIQP based solvers for sparse solutions with hierarchical constraints.

Generalized regularized l0 solvers that allow grouping parameters as detailed in:

    https://doi.org/10.1287/opre.2015.1436

L1L0 proposed by Wenxuan Huang:

    https://arxiv.org/abs/1807.10753

L2L0 proposed by Peichen Zhong:

    https://journals.aps.org/prb/abstract/10.1103/PhysRevB.106.024203

Regressors allow optional inclusion of hierarchical constraints at the single coefficient
or group of coefficients level.
"""

from __future__ import annotations

__author__ = "Luis Barroso-Luque, Fengyu Xie"


from abc import ABCMeta, abstractmethod
from numbers import Real
from types import SimpleNamespace
from typing import Any

import cvxpy as cp
import numpy as np
from numpy.typing import ArrayLike, NDArray
from sklearn.utils._param_validation import Interval

from sparselm.model._base import TikhonovMixin

from ._base import MIQPl0


[docs]class RegularizedL0(MIQPl0):
    r"""Implementation of mixed-integer quadratic programming l0 regularized Regressor.

    Supports grouping parameters and group-level hierarchy, but requires groups as a
    compulsory argument.

    Regularized regression objective:

    .. math::

        \min_{\beta} || X \beta - y ||^2_2 + \alpha \sum_{G} z_G

    Where G represents groups of features/coefficients and :math:`z_G` is are boolean
    valued slack variables.

    Args:
        groups (ArrayLike):
            1D array-like of integers specifying groups. Length should be the
            same as model, where each integer entry specifies the group
            each parameter corresponds to. If no grouping is needed pass a list
            of all distinct numbers (ie range(len(coefs)) to create singleton groups
            for each parameter.
        alpha (float):
            L0 pseudo-norm regularization hyper-parameter.
        big_M (float):
            Upper bound on the norm of coefficients associated with each
            groups of coefficients :math:`||\beta_c||_2`.
        hierarchy (list):
            A list of lists of integers storing hierarchy relations between
            groups.
            Each sublist contains indices of other groups
            on which the group associated with each element of
            the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that
            group 0 depends on 1, and 2; 1 depends on 0, and 2 has no
            dependence.
        ignore_psd_check (bool):
            Whether to ignore cvxpy's PSD checks  of matrix used in quadratic
            form. Default is True to avoid raising errors for poorly
            conditioned matrices. But if you want to be strict set to False.
        fit_intercept (bool):
            Whether the intercept should be estimated or not.
            If False, the data is assumed to be already centered.
        copy_X (bool):
            If True, X will be copied; else, it may be overwritten.
        warm_start (bool):
            When set to True, reuse the solution of the previous call to
            fit as initialization, otherwise, just erase the previous
            solution.
        solver (str):
            cvxpy backend solver to use. Supported solvers are listed here:
            https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options
        solver_options (dict):
            dictionary of keyword arguments passed to cvxpy solve.
            See docs in CVXRegressor for more information.

    Attributes:
        coef_ (NDArray):
            Parameter vector (:math:`\beta` in the cost function formula) of shape (n_features,).
        intercept_ (float):
            Independent term in decision function.
        canonicals_ (SimpleNamespace):
            Namespace that contains underlying cvxpy objects used to define
            the optimization problem. The objects included are the following:
                - objective - the objective function.
                - beta - variable to be optimized (corresponds to the estimated coef_ attribute).
                - parameters - hyper-parameters
                - auxiliaries - auxiliary variables and expressions
                - constraints - solution constraints

    Notes:
        Installation of Gurobi is not a must, but highly recommended. An open source alternative
        is SCIP. ECOS_BB also works but can be very slow, and has recurring correctness issues.
        See the Mixed-integer programs section of the cvxpy docs:
        https://www.cvxpy.org/tutorial/advanced/index.html
    """

    _cvx_parameter_constraints: dict[str, list[Any]] = {
        "alpha": [Interval(type=Real, left=0.0, right=None, closed="left")],
        **MIQPl0._cvx_parameter_constraints,
    }

    def __init__(
        self,
        groups: ArrayLike | None = None,
        alpha: float = 1.0,
        big_M: int = 100,
        hierarchy: list[list[int]] | None = None,
        ignore_psd_check: bool = True,
        fit_intercept: bool = False,
        copy_X: bool = True,
        warm_start: bool = False,
        solver: str | None = None,
        solver_options: dict | None = None,
    ):
        super().__init__(
            groups=groups,
            big_M=big_M,
            hierarchy=hierarchy,
            ignore_psd_check=ignore_psd_check,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            warm_start=warm_start,
            solver=solver,
            solver_options=solver_options,
        )
        self.alpha = alpha

    def _generate_objective(
        self,
        X: ArrayLike,
        y: ArrayLike,
        beta: cp.Variable,
        parameters: SimpleNamespace | None = None,
        auxiliaries: SimpleNamespace | None = None,
    ) -> cp.Expression:
        """Generate the quadratic form and l0 regularization portion of objective."""
        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
        objective = super()._generate_objective(
            X, y, beta, parameters, auxiliaries
        ) + c0 * parameters.alpha * cp.sum(auxiliaries.z0)
        return objective


class MixedL0(RegularizedL0, metaclass=ABCMeta):
    """Abstract base class for mixed L0 regularization models: L1L0 and L2L0."""

    _cvx_parameter_constraints: dict[str, list[Any]] = {
        "eta": [Interval(type=Real, left=0.0, right=None, closed="left")],
        **RegularizedL0._cvx_parameter_constraints,
    }

    def __init__(
        self,
        groups: ArrayLike | None = None,
        alpha: float = 1.0,
        eta: float = 1.0,
        big_M: int = 100,
        hierarchy: list[list[int]] | None = None,
        ignore_psd_check: bool = True,
        fit_intercept: bool = False,
        copy_X: bool = True,
        warm_start: bool = False,
        solver: str | None = None,
        solver_options: dict | None = None,
    ):
        """Initialize Regressor.

        Args:
            groups (ArrayLike):
                1D array-like of integers specifying groups. Length should be the
                same as model, where each integer entry specifies the group
                each parameter corresponds to. If no grouping is needed pass a list
                of all distinct numbers (ie range(len(coefs)) to create singleton groups
                for each parameter.
            alpha (float):
                L0 pseudo-norm regularization hyper-parameter.
            eta (float):
                standard norm regularization hyper-parameter (usually l1 or l2).
            big_M (float):
                Upper bound on the norm of coefficients associated with each

            hierarchy (list):
                A list of lists of integers storing hierarchy relations between
                coefficients.
                Each sublist contains indices of other coefficients
                on which the coefficient associated with each element of
                the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that
                coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
                dependence.
            ignore_psd_check (bool):
                Whether to ignore cvxpy's PSD checks  of matrix used in quadratic
                form. Default is True to avoid raising errors for poorly
                conditioned matrices. But if you want to be strict set to False.
            fit_intercept (bool):
                Whether the intercept should be estimated or not.
                If False, the data is assumed to be already centered.
            copy_X (bool):
                If True, X will be copied; else, it may be overwritten.
            warm_start (bool):
                When set to True, reuse the solution of the previous call to
                fit as initialization, otherwise, just erase the previous
                solution.
            solver (str):
                cvxpy backend solver to use. Supported solvers are listed here:
                https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options
            solver_options (dict):
                dictionary of keyword arguments passed to cvxpy solve.
                See docs in CVXRegressor for more information.
        """
        super().__init__(
            groups=groups,
            alpha=alpha,
            big_M=big_M,
            hierarchy=hierarchy,
            ignore_psd_check=ignore_psd_check,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            warm_start=warm_start,
            solver=solver,
            solver_options=solver_options,
        )
        self.eta = eta

    @abstractmethod
    def _generate_objective(
        self,
        X: ArrayLike,
        y: ArrayLike,
        beta: cp.Variable,
        parameters: SimpleNamespace | None = None,
        auxiliaries: SimpleNamespace | None = None,
    ) -> cp.Expression:
        """Generate optimization objective."""
        # implement in derived classes using super to call MIQP_L0 objective
        return super()._generate_objective(X, y, beta, parameters, auxiliaries)


[docs]class L1L0(MixedL0):
    r"""L1L0 regularized Regressor.

    Regressor with L1L0 regularization solved with mixed integer programming
    as discussed in:

    https://arxiv.org/abs/1807.10753

    Extended to allow grouping of coefficients and group-level hierarchy as described
    in:

    https://doi.org/10.1287/opre.2015.1436

    Regularized regression objective:

    .. math::

        \min_{\beta} || X \beta - y ||^2_2 + \alpha \sum_{G} z_G + \eta ||\beta||_1

    Where G represents groups of features/coefficients and :math:`z_G` is are boolean
    valued slack variables.

    Args:
        groups (ArrayLike):
            1D array-like of integers specifying groups. Length should be the
            same as model, where each integer entry specifies the group
            each parameter corresponds to. If no grouping is needed pass a list
            of all distinct numbers (ie range(len(coefs)) to create singleton groups
            for each parameter.
        alpha (float):
            L0 pseudo-norm regularization hyper-parameter.
        eta (float):
            L1 regularization hyper-parameter.
        big_M (float):
            Upper bound on the norm of coefficients associated with each
            groups of coefficients :math:`||\beta_c||_2`.
        hierarchy (list):
            A list of lists of integers storing hierarchy relations between
            coefficients.
            Each sublist contains indices of other coefficients
            on which the coefficient associated with each element of
            the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that
            coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
            dependence.
        ignore_psd_check (bool):
            Whether to ignore cvxpy's PSD checks of matrix used in quadratic
            form. Default is True to avoid raising errors for poorly
            conditioned matrices. But if you want to be strict set to False.
        fit_intercept (bool):
            Whether the intercept should be estimated or not.
            If False, the data is assumed to be already centered.
        copy_X (bool):
            If True, X will be copied; else, it may be overwritten.
        warm_start (bool):
            When set to True, reuse the solution of the previous call to
            fit as initialization, otherwise, just erase the previous
            solution.
        solver (str):
            cvxpy backend solver to use. Supported solvers are listed here:
            https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options
        solver_options (dict):
            dictionary of keyword arguments passed to cvxpy solve.
            See docs in CVXRegressor for more information.

    Attributes:
        coef_ (NDArray):
            Parameter vector (:math:`\beta` in the cost function formula) of shape (n_features,).
        intercept_ (float):
            Independent term in decision function.
        canonicals_ (SimpleNamespace):
            Namespace that contains underlying cvxpy objects used to define
            the optimization problem. The objects included are the following:
                - objective - the objective function.
                - beta - variable to be optimized (corresponds to the estimated coef_ attribute).
                - parameters - hyper-parameters
                - auxiliaries - auxiliary variables and expressions
                - constraints - solution constraints

    Notes:
        Installation of Gurobi is not a must, but highly recommended. An open source alternative
        is SCIP. ECOS_BB also works but can be very slow, and has recurring correctness issues.
        See the Mixed-integer programs section of the cvxpy docs:
        https://www.cvxpy.org/tutorial/advanced/index.html
    """

    def __init__(
        self,
        groups: ArrayLike | None = None,
        alpha: float = 1.0,
        eta: float = 1.0,
        big_M: int = 100,
        hierarchy: list[list[int]] | None = None,
        ignore_psd_check: bool = True,
        fit_intercept: bool = False,
        copy_X: bool = True,
        warm_start: bool = False,
        solver: str | None = None,
        solver_options: dict | None = None,
    ):
        super().__init__(
            groups=groups,
            eta=eta,
            alpha=alpha,
            big_M=big_M,
            hierarchy=hierarchy,
            ignore_psd_check=ignore_psd_check,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            warm_start=warm_start,
            solver=solver,
            solver_options=solver_options,
        )

    def _generate_auxiliaries(
        self, X: ArrayLike, y: ArrayLike, beta: cp.Variable, parameters: SimpleNamespace
    ) -> SimpleNamespace | None:
        """Generate the boolean slack variable."""
        auxiliaries = super()._generate_auxiliaries(X, y, beta, parameters)
        X.shape[1] if self.groups is None else len(np.unique(self.groups))
        auxiliaries.z1 = cp.Variable(X.shape[1])
        return auxiliaries

    def _generate_constraints(
        self,
        X: ArrayLike,
        y: ArrayLike,
        beta: cp.Variable,
        parameters: SimpleNamespace | None = None,
        auxiliaries: SimpleNamespace | None = None,
    ) -> list[cp.constraints]:
        """Generate the constraints used to solve l1l0 regularization."""
        constraints = super()._generate_constraints(X, y, beta, parameters, auxiliaries)
        # L1 constraints (why not do an l1 norm in the objective instead?)
        constraints += [-auxiliaries.z1 <= beta, beta <= auxiliaries.z1]
        return constraints

    def _generate_objective(
        self,
        X: ArrayLike,
        y: ArrayLike,
        beta: cp.Variable,
        parameters: SimpleNamespace | None = None,
        auxiliaries: SimpleNamespace | None = None,
    ) -> cp.Expression:
        """Generate the objective function used in l1l0 regression model."""
        c0 = 2 * X.shape[0]  # keeps hyperparameter scale independent
        objective = super()._generate_objective(X, y, beta, parameters, auxiliaries)
        objective += c0 * parameters.eta * cp.sum(auxiliaries.z1)
        return objective


[docs]class L2L0(TikhonovMixin, MixedL0):
    r"""L2L0 regularized Regressor.

    Based on Regressor with L2L0 regularization solved with mixed integer programming
    proposed in:

    https://arxiv.org/abs/2204.13789

    Extended to allow grouping of coefficients and group-level hierarchy as described
    in:

    https://doi.org/10.1287/opre.2015.1436

    And allows using a Tihkonov matrix in the l2 term.

    Regularized regression objective:

    .. math::

        \min_{\beta} || X \beta - y ||^2_2 + \alpha \sum_{G} z_G + \eta ||W\beta||^2_2

    Where G represents groups of features/coefficients and :math:`z_G` is are boolean
    valued slack variables. W is a Tikhonov matrix.

    Args:
        groups (ArrayLike):
            1D array-like of integers specifying groups. Length should be the
            same as model, where each integer entry specifies the group
            each parameter corresponds to. If no grouping is needed pass a list
            of all distinct numbers (ie range(len(coefs)) to create singleton groups
            for each parameter.
        alpha (float):
            L0 pseudo-norm regularization hyper-parameter.
        eta (float):
            L2 regularization hyper-parameter.
        big_M (float):
            Upper bound on the norm of coefficients associated with each
            groups of coefficients :math:`||\beta_c||_2`.
        hierarchy (list):
            A list of lists of integers storing hierarchy relations between
            coefficients.
            Each sublist contains indices of other coefficients
            on which the coefficient associated with each element of
            the list depends. i.e. hierarchy = [[1, 2], [0], []] mean that
            coefficient 0 depends on 1, and 2; 1 depends on 0, and 2 has no
            dependence.
        tikhonov_w (np.array):
            Matrix to add weights to L2 regularization.
        ignore_psd_check (bool):
            Wether to ignore cvxpy's PSD checks of matrix used in quadratic
            form. Default is True to avoid raising errors for poorly
            conditioned matrices. But if you want to be strict set to False.
        fit_intercept (bool):
            Whether the intercept should be estimated or not.
            If False, the data is assumed to be already centered.
        copy_X (bool):
            If True, X will be copied; else, it may be overwritten.
        warm_start (bool):
            When set to True, reuse the solution of the previous call to
            fit as initialization, otherwise, just erase the previous
            solution.
        solver (str):
            cvxpy backend solver to use. Supported solvers are listed here:
            https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options
        solver_options (dict):
            dictionary of keyword arguments passed to cvxpy solve.
            See docs in CVXEstimator for more information.

    Attributes:
        coef_ (NDArray):
            Parameter vector (:math:`\beta` in the cost function formula) of shape (n_features,).
        intercept_ (float):
            Independent term in decision function.
        canonicals_ (SimpleNamespace):
            Namespace that contains underlying cvxpy objects used to define
            the optimization problem. The objects included are the following:
                - objective - the objective function.
                - beta - variable to be optimized (corresponds to the estimated coef_ attribute).
                - parameters - hyper-parameters
                - auxiliaries - auxiliary variables and expressions
                - constraints - solution constraints

    Notes:
        Installation of Gurobi is not a must, but highly recommended. An open source alternative
        is SCIP. ECOS_BB also works but can be very slow, and has recurring correctness issues.
        See the Mixed-integer programs section of the cvxpy docs:
        https://www.cvxpy.org/tutorial/advanced/index.html
    """

    def __init__(
        self,
        groups: ArrayLike | None = None,
        alpha: float = 1.0,
        eta: float = 1.0,
        big_M: int = 100,
        hierarchy: list[list[int]] | None = None,
        tikhonov_w: NDArray[float] | None = None,
        ignore_psd_check: bool = True,
        fit_intercept: bool = False,
        copy_X: bool = True,
        warm_start: bool = False,
        solver: str | None = None,
        solver_options: dict | None = None,
    ):
        super().__init__(
            groups=groups,
            alpha=alpha,
            eta=eta,
            big_M=big_M,
            hierarchy=hierarchy,
            ignore_psd_check=ignore_psd_check,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            warm_start=warm_start,
            solver=solver,
            solver_options=solver_options,
        )
        self.tikhonov_w = tikhonov_w