import numbers, warnings
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import log_loss
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import (check_X_y, compute_class_weight,
check_consistent_length, check_array)
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.extmath import safe_sparse_dot, log_logistic, squared_norm
from scipy.optimize import minimize
from scipy import optimize
from scipy.special import expit, logsumexp
import numpy as np
from .base import AbstractUoIGeneralizedLinearRegressor
from ..utils import sigmoid, softmax
from ..lbfgs import fmin_lbfgs, AllZeroLBFGSError
class UoI_L1Logistic(AbstractUoIGeneralizedLinearRegressor, LogisticRegression):
r"""UoI\ :sub:`L1-Logistic` model.
Parameters
----------
n_boots_sel : int
The number of data bootstraps to use in the selection module.
Increasing this number will make selection more strict.
n_boots_est : int
The number of data bootstraps to use in the estimation module.
Increasing this number will relax selection and decrease variance.
    n_C : int
        The number of regularization parameter values (inverse penalty
        strengths ``C``) to use for selection.
selection_frac : float
The fraction of the dataset to use for training in each resampled
bootstrap, during the selection module. Small values of this parameter
imply larger "perturbations" to the dataset.
estimation_frac : float
The fraction of the dataset to use for training in each resampled
bootstrap, during the estimation module. The remaining data is used
        to obtain validation scores. Small values of this parameter imply
        larger "perturbations" to the dataset.
    estimation_target : string, "train" | "test"
        Decide whether to assess the estimation_score on the train
        or test data across each bootstrap. By default, a sensible
        choice is made based on the chosen estimation_score.
stability_selection : int, float, or array-like
If int, treated as the number of bootstraps that a feature must
appear in to guarantee placement in selection profile. If float,
must be between 0 and 1, and is instead the proportion of
bootstraps. If array-like, must consist of either ints or floats
between 0 and 1. In this case, each entry in the array-like object
will act as a separate threshold for placement in the selection
profile.
    estimation_score : string, "acc" | "log" | "AIC" | "AICc" | "BIC"
Objective used to choose the best estimates per bootstrap.
multi_class : string, "auto" | "multinomial"
For "multinomial" the loss minimised is the multinomial loss fit across
the entire probability distribution, even when the data is binary.
"auto" selects binary if the data is binary, and otherwise selects
"multinomial".
    warm_start : bool
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.
eps : float
Length of the L1 path. eps=1e-5 means that alpha_min / alpha_max = 1e-5
fit_intercept : bool
Whether to calculate the intercept for this model. If set to False, no
intercept will be used in calculations (e.g. data is expected to be
already centered).
standardize : bool
If True, the regressors X will be standardized before regression by
subtracting the mean and dividing by their standard deviations.
shared_support : bool
For models with more than one output (multinomial logistic regression)
this determines whether all outputs share the same support or can
have independent supports.
max_iter : int
Maximum number of iterations for iterative fitting methods.
tol : float
Stopping criteria for solver.
random_state : int, RandomState instance, or None
The seed of the pseudo random number generator that selects a random
feature to update. If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by `np.random`.
    comm : MPI communicator
        If passed, the selection and estimation steps are parallelized.
    logger : Logger, optional
        If passed, messages during fitting are written to this logger.
Attributes
----------
coef_ : array, shape (n_features,) or (n_targets, n_features)
Estimated coefficients for the linear regression problem.
intercept_ : float
Independent term in the linear model.
    supports_ : array, shape (n_supports, n_features)
        Boolean array indicating whether a given regressor (column) is
        selected for estimation for a given regularization parameter value
        (row).
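    Examples
    --------
    A minimal sketch on synthetic data; exact estimates depend on the
    bootstrap draws:
    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(200, 5)
    >>> y = (X[:, 0] - X[:, 2] > 0.).astype(int)
    >>> model = UoI_L1Logistic(n_boots_sel=5, n_boots_est=5, n_C=10,
    ...                        random_state=0).fit(X, y)
    >>> y_hat = model.predict(X)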
"""
metrics = AbstractUoIGeneralizedLinearRegressor._valid_estimation_metrics
_valid_estimation_metrics = metrics + ('acc',)
def __init__(self, n_boots_sel=24, n_boots_est=24, selection_frac=0.9,
estimation_frac=0.9, n_C=48, stability_selection=1.,
estimation_score='acc', estimation_target=None,
multi_class='auto', shared_support=True, warm_start=False,
eps=1e-5, fit_intercept=True, standardize=True,
max_iter=10000, tol=1e-3, random_state=None, comm=None,
logger=None):
super(UoI_L1Logistic, self).__init__(
n_boots_sel=n_boots_sel,
n_boots_est=n_boots_est,
selection_frac=selection_frac,
estimation_frac=estimation_frac,
stability_selection=stability_selection,
estimation_score=estimation_score,
estimation_target=estimation_target,
random_state=random_state,
fit_intercept=fit_intercept,
standardize=standardize,
shared_support=shared_support,
comm=comm,
logger=logger)
self.n_C = n_C
self.Cs = None
self.multi_class = multi_class
self.eps = eps
self.tol = tol
self.solver = 'lbfgs'
self._selection_lm = MaskedCoefLogisticRegression(
penalty='l1',
max_iter=max_iter,
warm_start=warm_start,
multi_class=multi_class,
fit_intercept=fit_intercept,
tol=tol)
self._estimation_lm = MaskedCoefLogisticRegression(
C=np.inf,
multi_class=multi_class,
fit_intercept=fit_intercept,
max_iter=max_iter,
tol=tol)
    def get_reg_params(self, X, y):
        """Compute the regularization path (values of ``C``) from the data.
        The path is anchored at the smallest ``C``, where the l1 penalty
        just dominates the loss gradient at ``w = 0`` and all coefficients
        are zero.
        """
input_dim = X.shape[1]
if self.Cs is None:
if self.output_dim == 1:
w = np.zeros(input_dim)
if self.fit_intercept:
intercept = LogisticInterceptFitterNoFeatures(
y, 1).intercept_
w = np.concatenate([w, np.atleast_1d(intercept)])
                # _logistic_loss_and_grad expects labels in {-1, +1},
                # while y is label-encoded to {0, 1} here.
                _, grad = _logistic_loss_and_grad(w, X, 2. * y - 1., 0.,
                                                  None)
else:
w = np.zeros((self.output_dim, input_dim))
yp = OneHotEncoder(categories=[range(self.output_dim)],
sparse=False).fit_transform(y[:, np.newaxis])
if self.fit_intercept:
intercept = LogisticInterceptFitterNoFeatures(
y, self.output_dim).intercept_
w = np.concatenate([w, intercept[:, np.newaxis]], axis=1)
_, grad, _ = _multinomial_loss_grad(w, X, yp, 0., None,
np.ones(X.shape[0]))
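            # C = 1 / alpha: the path starts at the C for which the l1
            # penalty just balances the largest entry of the loss gradient
            # at w = 0 (the all-zero boundary) and spans a factor of
            # 1 / eps above it on a log grid.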
alpha_max = abs(grad[:-1]).max()
logC = -np.log10(alpha_max)
self.Cs = np.logspace(logC, logC - np.log10(self.eps), self.n_C)
ret = list()
for c in self.Cs:
ret.append(dict(C=c))
return ret
def _fit_intercept_no_features(self, y):
""""Fit a model with only an intercept.
This is used in cases where the model has no support selected.
"""
return LogisticInterceptFitterNoFeatures(y, self.output_dim)
def _fit_intercept(self, X, y):
""""Fit a model with an intercept and fixed coefficients.
This is used to re-fit the intercept after the coefficients are
estimated.
"""
if self.fit_intercept:
self.intercept_ = fit_intercept_fixed_coef(X, self.coef_, y,
self.output_dim)
else:
self.intercept_ = np.zeros(self.output_dim)
def _pre_fit(self, X, y):
X, y = super()._pre_fit(X, y)
le = LabelEncoder()
y = le.fit_transform(y)
self.classes_ = le.classes_
if self.classes_.size > 2:
self.output_dim = self.classes_.size
elif self.multi_class == 'multinomial':
self.output_dim = 2
else:
self.output_dim = 1
return X, y
def fit_intercept_fixed_coef(X, coef_, y, output_dim):
"""Optimize the likelihood w.r.t. the intercept for a logistic model."""
if output_dim == 1:
def f_df(intercept):
py = sigmoid(X.dot(coef_.T) + intercept)
dfdb = py.mean() - y.mean()
return log_loss(y, py), np.atleast_1d(dfdb)
opt = minimize(f_df, np.atleast_1d(np.zeros(1)),
method='L-BFGS-B', jac=True)
return opt.x
else:
def f_df(short_intercept):
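            # The softmax is invariant to a common shift of the
            # intercepts, so pin the first one and optimize only the
            # remaining output_dim - 1 entries.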
intercept = np.concatenate([np.atleast_1d(1.), short_intercept])
py = softmax(X.dot(coef_.T) + intercept)
def dlogpi_dintk(ii, pyi):
if ii == 0:
return -pyi[1:]
else:
rval = np.eye(output_dim - 1)[ii - 1]
rval -= pyi[1:]
return rval
dfdb = np.zeros_like(short_intercept)
for yi, pyi in zip(y, py):
dfdb -= dlogpi_dintk(yi, pyi) / y.size
return log_loss(y, py, labels=np.arange(output_dim)), dfdb
opt = minimize(f_df, np.atleast_1d(np.zeros(output_dim - 1)),
method='L-BFGS-B', jac=True)
intercept = np.concatenate([np.atleast_1d(1.), opt.x])
return intercept - intercept.max()
class LogisticInterceptFitterNoFeatures(object):
    """Intercept-only Bernoulli or multinomial logistic regression.
    Parameters
    ----------
    y : ndarray
        Class labels.
    output_dim : int
        Number of model outputs: 1 for binary, else the number of classes.
    """
def __init__(self, y, output_dim):
self.output_dim = output_dim
eps = 1e-10
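        # With no features, the maximum-likelihood intercept is the
        # empirical log-odds (binary) or the log class frequencies up to
        # an additive constant (multinomial).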
if output_dim == 1:
p = y.mean(axis=0)
p = np.minimum(np.maximum(p, eps), 1 - eps)
self.intercept_ = np.log(p / (1. - p))
else:
py = np.equal(y[:, np.newaxis],
np.arange(self.output_dim)[np.newaxis]).mean(axis=0)
n_included = np.count_nonzero(py)
if n_included < self.output_dim:
new_mass = eps * (self.output_dim - n_included)
py *= (1. - new_mass)
py[np.equal(py, 0.)] = eps
intercept = np.log(py)
self.intercept_ = intercept - intercept.max()
def predict(self, X, mask=None):
n_samples = X.shape[0]
if self.output_dim == 1:
return np.tile(int(self.intercept_ >= 0.), n_samples)
else:
return np.tile(int(np.argmax(self.intercept_)), n_samples)
def predict_proba(self, X, mask=None):
n_samples = X.shape[0]
if self.output_dim == 1:
return np.tile(sigmoid(self.intercept_), n_samples)
else:
return np.tile(softmax(self.intercept_)[np.newaxis], (n_samples, 1))
class MaskedCoefLogisticRegression(LogisticRegression):
"""Logistic regression with a binary mask on the coef.
Parameters
----------
penalty : str
Type of regularization: 'l1' or 'l2'.
tol : float, optional (default=1e-4)
Tolerance for stopping criteria.
C : float, optional (default=1.0)
Inverse of regularization strength; must be a positive float.
Like in support vector machines, smaller values specify stronger
regularization.
fit_intercept : bool, optional (default=True)
Specifies if a constant (a.k.a. bias or intercept) should be
added to the decision function.
standardize : bool, default False
If True, centers the design matrix across samples and rescales them to
have standard deviation of 1.
class_weight : dict or 'balanced', optional (default=None)
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.
max_iter : int, optional (default=100)
Maximum number of iterations taken for the solvers to converge.
multi_class : str, {'multinomial', 'auto'}, optional (default='auto')
For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'auto' selects binary if the data is binary,
and otherwise selects 'multinomial'.
verbose : int, optional (default=0)
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
warm_start : bool, optional (default=False)
When set to True, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Useless for liblinear solver.
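    Examples
    --------
    A minimal sketch: restrict the fit to the first feature with
    ``coef_mask``; masked coefficients stay exactly zero:
    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(100, 3)
    >>> y = (X[:, 0] > 0.).astype(int)
    >>> lr = MaskedCoefLogisticRegression(C=1.).fit(
    ...     X, y, coef_mask=np.array([1, 0, 0]))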
"""
def __init__(self, penalty='l2', tol=1e-3, C=1.,
fit_intercept=True, standardize=False, class_weight=None,
max_iter=10000,
multi_class='auto', verbose=0, warm_start=False):
if multi_class not in ('multinomial', 'auto'):
raise ValueError("multi_class should be 'multinomial' or " +
"'auto'. Got %s." % multi_class)
super().__init__(penalty=penalty, tol=tol, C=C,
fit_intercept=fit_intercept,
class_weight=class_weight,
max_iter=max_iter,
multi_class=multi_class, verbose=verbose,
warm_start=warm_start)
self.standardize = standardize
def fit(self, X, y, sample_weight=None, coef_mask=None):
"""Fit the model according to the given training data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like, shape (n_samples,)
Target vector relative to X.
sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
        coef_mask : array-like, shape (n_features,) or (n_classes, n_features), optional
            Masking array for coef.
Returns
-------
self : object
"""
if not isinstance(self.C, numbers.Number) or self.C < 0:
raise ValueError("Penalty term must be positive; got (C=%r)"
% self.C)
if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
" got (max_iter=%r)" % self.max_iter)
if not isinstance(self.tol, numbers.Number) or self.tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % self.tol)
_dtype = np.float64
X, y = self._pre_fit(X, y)
X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
accept_large_sparse=True)
check_classification_targets(y)
self.classes_ = np.unique(y)
n_samples, n_features = X.shape
multi_class = self.multi_class
if multi_class == 'auto':
if len(self.classes_) > 2:
multi_class = 'multinomial'
else:
multi_class = 'ovr'
n_classes = len(self.classes_)
classes_ = self.classes_
if n_classes < 2:
raise ValueError("This solver needs samples of at least 2 classes"
" in the data, but the data contains only one"
" class: %r" % classes_[0])
if len(self.classes_) == 2 and multi_class == 'ovr':
n_classes = 1
classes_ = classes_[1:]
if multi_class == 'multinomial' and coef_mask is not None:
coef_mask = coef_mask.reshape(n_classes, -1)
if self.warm_start:
warm_start_coef = getattr(self, 'coef_', None)
else:
warm_start_coef = None
if warm_start_coef is not None and self.fit_intercept:
warm_start_coef = np.append(warm_start_coef,
self.intercept_[:, np.newaxis],
axis=1)
self.intercept_ = np.zeros(n_classes)
fold_coefs_ = _logistic_regression_path(
X, y, Cs=[self.C],
fit_intercept=self.fit_intercept,
tol=self.tol, verbose=self.verbose,
multi_class=multi_class, max_iter=self.max_iter,
class_weight=self.class_weight, penalty=self.penalty,
check_input=False,
coef=warm_start_coef,
sample_weight=sample_weight, coef_mask=coef_mask)
fold_coefs_, _, self.n_iter_ = fold_coefs_
self.coef_ = np.asarray(fold_coefs_)
self.coef_ = self.coef_.reshape(n_classes, n_features +
int(self.fit_intercept))
if self.fit_intercept:
self.intercept_ = self.coef_[:, -1]
self.coef_ = self.coef_[:, :-1]
self._post_fit(X, y)
return self
def _pre_fit(self, X, y):
if self.standardize:
self._X_scaler = StandardScaler()
X = self._X_scaler.fit_transform(X)
le = LabelEncoder()
y = le.fit_transform(y)
self.classes_ = le.classes_
if self.classes_.size > 2:
self.output_dim = self.classes_.size
elif self.multi_class == 'multinomial':
self.output_dim = 2
else:
self.output_dim = 1
return X, y
def _post_fit(self, X, y):
"""Perform class-specific cleanup for fit().
"""
if self.standardize:
sX = self._X_scaler
self.coef_ /= sX.scale_[np.newaxis]
def _logistic_regression_path(X, y, Cs=48, fit_intercept=True,
max_iter=100, tol=1e-4, verbose=0, coef=None,
class_weight=None, penalty='l2',
multi_class='auto',
check_input=True,
sample_weight=None,
l1_ratio=None, coef_mask=None):
"""Compute a Logistic Regression model for a list of regularization
parameters.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
Input data.
y : array-like, shape (n_samples,) or (n_samples, n_targets)
Input data, target values.
Cs : int | array-like, shape (n_cs,)
List of values for the regularization parameter or integer specifying
the number of regularization parameters that should be used. In this
case, the parameters will be chosen in a logarithmic scale between
1e-4 and 1e4.
fit_intercept : bool
Whether to fit an intercept for the model. In this case the shape of
the returned array is (n_cs, n_features + 1).
max_iter : int
Maximum number of iterations for the solver.
tol : float
Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
will stop when ``max{|g_i | i = 1, ..., n} <= tol``
where ``g_i`` is the i-th component of the gradient.
verbose : int
For the liblinear and lbfgs solvers set verbose to any positive
number for verbosity.
coef : array-like, shape (n_features,), default None
Initialization value for coefficients of logistic regression.
Useless for liblinear solver.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``.
Note that these weights will be multiplied with sample_weight (passed
through the fit method) if sample_weight is specified.
multi_class : str, {'multinomial', 'auto'}, default: 'auto'
For 'multinomial' the loss minimised is the multinomial loss fit
across the entire probability distribution, *even when the data is
binary*. 'auto' selects binary if the data is binary
and otherwise selects 'multinomial'.
check_input : bool, default True
If False, the input arrays X and y will not be checked.
    sample_weight : array-like, shape (n_samples,), optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
    coef_mask : array-like, shape (n_features,) or (n_classes, n_features), optional
        Masking array for coef.
Returns
-------
coefs : ndarray, shape (n_cs, n_features) or (n_cs, n_features + 1)
List of coefficients for the Logistic Regression model. If
fit_intercept is set to True then the second dimension will be
n_features + 1, where the last item represents the intercept. For
        ``multi_class='multinomial'``, the shape is (n_classes, n_cs,
        n_features) or (n_classes, n_cs, n_features + 1).
Cs : ndarray
Grid of Cs used for cross-validation.
    n_iter : array, shape (n_cs,)
        Actual number of iterations for each C.
"""
if isinstance(Cs, numbers.Integral):
Cs = np.logspace(-4, 4, Cs)
# Preprocessing.
if check_input:
X = check_array(X, accept_sparse='csr', dtype=np.float64,
accept_large_sparse=True)
y = check_array(y, ensure_2d=False, dtype=None)
check_consistent_length(X, y)
_, n_features = X.shape
classes = np.unique(y)
n_classes = len(classes)
if multi_class == 'auto':
if n_classes > 2:
multi_class = 'multinomial'
else:
multi_class = 'ovr'
# If sample weights exist, convert them to array (support for lists)
# and check length
# Otherwise set them to 1 for all examples
if sample_weight is not None:
sample_weight = np.array(sample_weight, dtype=X.dtype, order='C')
check_consistent_length(y, sample_weight)
else:
sample_weight = np.ones(X.shape[0], dtype=X.dtype)
# If class_weights is a dict (provided by the user), the weights
# are assigned to the original labels. If it is "balanced", then
# the class_weights are assigned after masking the labels with a OvR.
le = LabelEncoder()
if isinstance(class_weight, dict) or multi_class == 'multinomial':
class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
sample_weight *= class_weight_[le.fit_transform(y)]
# For doing a ovr, we need to mask the labels first. for the
# multinomial case this is not necessary.
if multi_class == 'ovr':
coef_size = n_features
w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
mask_classes = np.array([-1, 1])
mask = (y == 1)
y_bin = np.ones(y.shape, dtype=X.dtype)
y_bin[~mask] = -1.
# for compute_class_weight
if class_weight == "balanced":
class_weight_ = compute_class_weight(class_weight,
classes=mask_classes,
y=y_bin)
sample_weight *= class_weight_[le.fit_transform(y_bin)]
else:
coef_size = classes.size * n_features
lbin = OneHotEncoder(categories=[range(classes.size)], sparse=False)
Y_multi = lbin.fit_transform(y[:, np.newaxis])
if Y_multi.shape[1] == 1:
Y_multi = np.hstack([1 - Y_multi, Y_multi])
w0 = np.zeros((classes.size, n_features + int(fit_intercept)),
dtype=X.dtype)
        if fit_intercept:
            w0[:, -1] = LogisticInterceptFitterNoFeatures(
                y, classes.size).intercept_
if coef is not None:
# it must work both giving the bias term and not
if multi_class == 'ovr':
if coef.size not in (n_features, w0.size):
raise ValueError(
'Initialization coef is of shape %d, expected shape '
'%d or %d' % (coef.size, n_features, w0.size))
w0[:coef.size] = coef
else:
w0[:, :coef.shape[1]] = coef
# Mask initial array
if coef_mask is not None:
if multi_class == 'ovr':
w0[:n_features] *= coef_mask
else:
w0[:, :n_features] *= coef_mask
if multi_class == 'multinomial':
        # fmin_l_bfgs_b and fmin_lbfgs accept only ravelled parameters.
target = Y_multi
if penalty == 'l2':
w0 = w0.ravel()
if coef_mask is not None:
x0 = np.zeros_like(w0)
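                # Optimize only the unmasked coordinates: `x` holds the
                # free entries, which are scattered back into the
                # full-size buffer x0 before evaluating loss and gradient.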
def func(x, *args):
mask = args[3]
if mask is not None:
x0[mask] = x
args = args[:3] + (None,) + (args[-1],)
f, df = _multinomial_loss_grad(x0, *args)[0:2]
df = df[mask]
else:
f, df = _multinomial_loss_grad(x, *args)[0:2]
return f, df
else:
w0 = w0.T.ravel().copy()
def inner_func(x, *args):
return _multinomial_loss_grad(x, *args)[0:2]
def func(x, g, *args):
x = x.reshape(-1, classes.size).T.ravel()
loss, grad = inner_func(x, *args)
grad = grad.reshape(classes.size, -1).T.ravel()
g[:] = grad
return loss
else:
target = y_bin
if penalty == 'l2':
x0 = np.zeros_like(w0)
def func(x, *args):
mask = args[3]
if mask is not None:
x0[mask] = x
args = args[:3] + (None,) + (args[-1],)
f, df = _logistic_loss_and_grad(x0, *args)
df = df[mask]
else:
f, df = _logistic_loss_and_grad(x, *args)
return f, df
else:
def func(x, g, *args):
loss, grad = _logistic_loss_and_grad(x, *args)
g[:] = grad
return loss
coefs = list()
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):
if coef_mask is None or coef_mask.sum():
iprint = [-1, 50, 1, 100, 101][
np.searchsorted(np.array([0, 1, 2, 3]), verbose)]
if penalty == 'l2':
if coef_mask is None:
w0, loss, info = optimize.fmin_l_bfgs_b(
func, w0, fprime=None,
args=(X, target, 1. / C, coef_mask, sample_weight),
iprint=iprint, pgtol=tol, maxiter=max_iter)
else:
if fit_intercept:
if multi_class == 'multinomial':
mask = [coef_mask,
np.ones(n_classes)[:, np.newaxis]]
mask = np.concatenate(mask, axis=1)
else:
mask = np.concatenate([coef_mask, np.ones(1)])
else:
mask = coef_mask
mask = np.nonzero(mask.ravel())[0]
wp = w0[mask]
wp, loss, info = optimize.fmin_l_bfgs_b(
func, wp, fprime=None,
args=(X, target, 1. / C, mask, sample_weight),
iprint=iprint, pgtol=tol, maxiter=max_iter)
w0 = np.zeros_like(w0)
w0[mask] = wp
else:
zeros_seen = [0]
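                # Progress callback for the orthant-wise (OWL-QN) solver:
                # returning a nonzero status halts it. Stop once the
                # coefficients have been exactly zero for more than one
                # consecutive iteration (AllZeroLBFGSError is handled
                # below).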
def zero_coef(x, *args):
if multi_class == 'multinomial':
x = x.reshape(-1, classes.size)[:-1]
else:
x = x[:-1]
now_zeros = np.array_equiv(x, 0.)
if now_zeros:
zeros_seen[0] += 1
else:
zeros_seen[0] = 0
if zeros_seen[0] > 1:
return -2048
try:
args = (X, target, 0., coef_mask, sample_weight)
w0 = fmin_lbfgs(func, w0, orthantwise_c=1. / C,
args=args,
max_iterations=max_iter,
epsilon=tol,
orthantwise_end=coef_size,
progress=zero_coef)
except AllZeroLBFGSError:
w0 *= 0.
info = None
if info is not None and info["warnflag"] == 1:
warnings.warn("lbfgs failed to converge. Increase the number "
"of iterations.", ConvergenceWarning)
# In scipy <= 1.0.0, nit may exceed maxiter.
# See https://github.com/scipy/scipy/issues/7854.
if info is None:
n_iter_i = -1
else:
n_iter_i = min(info['nit'], max_iter)
n_iter[i] = n_iter_i
if multi_class == 'multinomial':
n_classes = max(2, classes.size)
if penalty == 'l2':
w0 = np.reshape(w0, (n_classes, -1))
else:
w0 = np.reshape(w0, (-1, n_classes)).T
coefs.append(w0.copy())
return np.array(coefs), np.array(Cs), n_iter
def _multinomial_loss(w, X, Y, alpha, sample_weight):
"""Computes multinomial loss and class probabilities.
Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data.
Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer.
alpha : float
Regularization parameter. alpha is equal to 1 / C.
sample_weight : array-like, shape (n_samples,)
Array of weights that are assigned to individual samples.
Returns
-------
loss : float
Multinomial loss.
p : ndarray, shape (n_samples, n_classes)
Estimated class probabilities.
w : ndarray, shape (n_classes, n_features)
Reshaped param vector excluding intercept terms.
"""
n_classes = Y.shape[1]
n_samples, n_features = X.shape
fit_intercept = w.size == (n_classes * (n_features + 1))
w = w.reshape(n_classes, -1)
sample_weight = sample_weight[:, np.newaxis]
if fit_intercept:
intercept = w[:, -1]
w = w[:, :-1]
else:
intercept = 0
p = safe_sparse_dot(X, w.T)
p += intercept
p -= logsumexp(p, axis=1)[:, np.newaxis]
loss = -(sample_weight * Y * p).sum() / n_samples
loss += 0.5 * alpha * squared_norm(w)
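    # p currently holds log-probabilities; exponentiate in place.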
p = np.exp(p, p)
return loss, p, w
def _logistic_loss_and_grad(w, X, y, alpha, mask, sample_weight=None):
"""Computes the logistic loss and gradient.
Parameters
----------
w : ndarray, shape (n_features,) or (n_features + 1,)
Coefficient vector.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data.
y : ndarray, shape (n_samples,)
Array of labels.
alpha : float
Regularization parameter. alpha is equal to 1 / C.
    mask : array-like, shape (n_features,), optional
        Masking array for coef.
sample_weight : array-like, shape (n_samples,) optional
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight.
Returns
-------
out : float
Logistic loss.
grad : ndarray, shape (n_features,) or (n_features + 1,)
Logistic gradient.
"""
n_samples, n_features = X.shape
if mask is not None:
w[:n_features] *= mask
grad = np.empty_like(w)
w, c, yz = _intercept_dot(w, X, y)
if sample_weight is None:
sample_weight = np.ones(n_samples)
# Logistic loss is the negative of the log of the logistic function.
out = -np.sum(sample_weight * log_logistic(yz)) / n_samples
out += .5 * alpha * np.dot(w, w)
z = expit(yz)
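    # Per-sample gradient w.r.t. z: (sigma(y * z) - 1) * y, for labels
    # in {-1, +1}.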
z0 = sample_weight * (z - 1) * y
grad[:n_features] = (safe_sparse_dot(X.T, z0) / n_samples) + alpha * w
if mask is not None:
grad[:n_features] *= mask
# Case where we fit the intercept.
if grad.shape[0] > n_features:
grad[-1] = z0.sum() / n_samples
return out, grad
def _multinomial_loss_grad(w, X, Y, alpha, mask, sample_weight):
"""Computes the multinomial loss, gradient and class probabilities.
Parameters
----------
w : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Coefficient vector.
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training data.
Y : ndarray, shape (n_samples, n_classes)
Transformed labels according to the output of LabelBinarizer.
alpha : float
Regularization parameter. alpha is equal to 1 / C.
    mask : array-like, shape (n_classes, n_features), optional
        Masking array for coef.
sample_weight : array-like, shape (n_samples,)
Array of weights that are assigned to individual samples.
Returns
-------
loss : float
Multinomial loss.
grad : ndarray, shape (n_classes * n_features,) or
(n_classes * (n_features + 1),)
Ravelled gradient of the multinomial loss.
p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities.
"""
n_classes = Y.shape[1]
n_samples, n_features = X.shape
fit_intercept = (w.size == n_classes * (n_features + 1))
if mask is not None:
w = w.reshape(n_classes, n_features + bool(fit_intercept))
w[:, :n_features] *= mask
w = w.ravel()
grad = np.zeros((n_classes, n_features + bool(fit_intercept)),
dtype=X.dtype)
loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
sample_weight = sample_weight[:, np.newaxis]
diff = sample_weight * (p - Y)
grad[:, :n_features] = safe_sparse_dot(diff.T, X) / n_samples
grad[:, :n_features] += alpha * w
if mask is not None:
grad[:, :n_features] *= mask
if fit_intercept:
grad[:, -1] = diff.sum(axis=0) / n_samples
return loss, grad.ravel(), p
def _intercept_dot(w, X, y):
"""Computes y * np.dot(X, w).
It takes into consideration if the intercept should be fit or not.
Parameters
----------
w : ndarray of shape (n_features,) or (n_features + 1,)
Coefficient vector.
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data.
y : ndarray of shape (n_samples,)
Array of labels.
Returns
-------
w : ndarray of shape (n_features,)
Coefficient vector without the intercept weight (w[-1]) if the
intercept should be fit. Unchanged otherwise.
c : float
The intercept.
    yz : ndarray of shape (n_samples,)
        y * np.dot(X, w).
"""
c = 0.
if w.size == X.shape[1] + 1:
c = w[-1]
w = w[:-1]
z = safe_sparse_dot(X, w) + c
yz = y * z
return w, c, yz