Source code for pyuoi.linear_model.lasso

import numpy as np

from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.linear_model._coordinate_descent import _alpha_grid
try:
    import pycasso
except ImportError:
    pycasso = None

from .base import AbstractUoILinearRegressor


class PycLasso:
    """Lasso using the pycasso solver.

    Solves for an entire regularization path at once.

    Parameters
    ----------
    alphas : nd-array
        The regularization path. Defaults to None for compatibility with
        UoI, but needs to be set prior to fitting.
    fit_intercept : bool
        Whether to calculate the intercept for this model. Passing ``False``
        currently raises ``ValueError`` because of an upstream picasso bug
        (see the error message for the issue link).
    max_iter : int
        Maximum number of iterations for pycasso solver.
    tol : float
        Stopping criteria for solver.

    Attributes
    ----------
    coef_ : ndarray
        Estimated coefficients across the entire regularization path, taken
        from the solver's ``result['beta']``.
    intercept_ : float or ndarray
        Independent term(s) in the linear model, taken from the solver's
        ``result['intercept']``.
    solver : pycasso.Solver
        The solver object created by the most recent call to ``fit``.
    isfitted : bool
        ``True`` once ``fit`` has completed; guards ``predict``.
    """

    # Names accepted by ``set_params``; mirrors the constructor signature.
    _VALID_PARAMS = ('alphas', 'fit_intercept', 'max_iter', 'tol')

    # Single copy of the message raised by both ``__init__`` and ``fit``.
    _NO_INTERCEPT_MSG = ('There is currently a bug in picasso that prevents '
                         'its use with `fit_intercept=False.`'
                         'See https://github.com/jasonge27/picasso/'
                         'issues/25 for resolution.')

    def __init__(self, alphas=None, fit_intercept=True, max_iter=1000,
                 tol=1e-4):
        if fit_intercept is False:
            raise ValueError(self._NO_INTERCEPT_MSG)

        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.tol = tol

        # Flag to prevent us from predicting before fitting
        self.isfitted = False

    def set_params(self, **kwargs):
        """Sets the parameters of this estimator.

        Parameters
        ----------
        **kwargs
            Any of ``alphas``, ``fit_intercept``, ``max_iter`` or ``tol``.

        Returns
        -------
        self : PycLasso
            The estimator itself, so calls can be chained.

        Raises
        ------
        ValueError
            If any keyword is not a recognized parameter. No parameter is
            modified in that case.
        """
        # Validate everything first so a bad key cannot leave the estimator
        # half-updated.
        for key in kwargs:
            if key not in self._VALID_PARAMS:
                raise ValueError('Invalid parameter %s' % key)

        for key, value in kwargs.items():
            setattr(self, key, value)
        return self

    def predict(self, X):
        """Predicts responses given a design matrix.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.

        Returns
        -------
        y : ndarray
            Predicted responses, computed as ``X @ coef_.T + intercept_``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called successfully.
        """
        if not self.isfitted:
            raise NotFittedError('Estimator is not fit.')
        return np.matmul(X, self.coef_.T) + self.intercept_

    def fit(self, X, y):
        """Fit data according to the pycasso object.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector. Will be cast to X's dtype if necessary.
            Currently, this implementation does not handle multiple
            response variables.

        Returns
        -------
        self : PycLasso
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``alphas`` has not been set, or if ``fit_intercept`` is
            ``False``.
        ImportError
            If pycasso is not installed.
        """
        if self.alphas is None:
            raise ValueError('Set alphas before fitting.')

        # ``fit_intercept`` may have been changed through ``set_params``
        # after construction, so it is checked again here.
        if self.fit_intercept is False:
            raise ValueError(self._NO_INTERCEPT_MSG)

        # The module-level import falls back to ``None`` when pycasso is
        # missing; fail with a clear message instead of an AttributeError.
        if pycasso is None:
            raise ImportError('pycasso is not installed.')

        self.solver = pycasso.Solver(X, y, family='gaussian',
                                     useintercept=self.fit_intercept,
                                     lambdas=self.alphas,
                                     penalty='l1',
                                     max_ite=self.max_iter,
                                     prec=self.tol)
        self.solver.train()

        # Coefs across the entire solution path
        self.coef_ = self.solver.result['beta']
        self.intercept_ = self.solver.result['intercept']
        self.isfitted = True
        return self
class UoI_Lasso(AbstractUoILinearRegressor, LinearRegression):
    r"""UoI\ :sub:`Lasso` solver.

    Parameters
    ----------
    n_boots_sel : int
        The number of data bootstraps/resamples to use in the selection
        module. Increasing this number will make selection more strict.
    n_boots_est : int
        The number of data bootstraps/resamples to use in the estimation
        module. Increasing this number will relax selection and decrease
        variance.
    n_lambdas : int
        The number of regularization values to use for selection.
    selection_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the selection module. Small values of this
        parameter imply larger "perturbations" to the dataset.
    estimation_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the estimation module. The remaining data is used
        to obtain validation scores. Small values of this parameter imply
        larger "perturbations" to the dataset.
    stability_selection : int, float, or array-like
        If int, treated as the number of bootstraps that a feature must
        appear in order to guarantee placement in selection profile. If
        float, must be between 0 and 1, and is instead the proportion of
        bootstraps. If array-like, must consist of either ints or floats
        between 0 and 1. In this case, each entry in the array-like object
        will act as a separate threshold for placement in the selection
        profile.
    estimation_score : string, "r2" | "AIC" | "AICc" | "BIC"
        Objective used to choose the best estimates per bootstrap.
    estimation_target : string, "train" | "test"
        Decide whether to assess the estimation_score on the train
        or test data across each bootstrap. By default, a sensible
        choice is made based on the chosen estimation_score.
    warm_start : bool
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization, otherwise, just erase the previous solution.
        Only forwarded to the selection model when ``solver='cd'``.
    eps : float
        Length of the lasso path. ``eps=1e-3`` means that
        ``lambda_min / lambda_max = 1e-3``.
    copy_X : bool
        If ``True``, X will be copied; else, it may be overwritten.
    fit_intercept : bool
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    standardize : bool
        If True, the regressors X will be standardized before regression by
        subtracting the mean and dividing by their standard deviations. This
        parameter is equivalent to ``normalize`` in ``scikit-learn`` models.
    max_iter : int
        Maximum number of iterations for iterative fitting methods.
    tol : float
        Stopping criteria for solver.
    random_state : int, RandomState instance, or None
        The seed of the pseudo random number generator that selects a random
        feature to update. If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator is
        the RandomState instance used by ``np.random``.
    comm : MPI communicator
        If passed, the selection and estimation steps are parallelized.
    logger : Logger
        The logger to use for messages when ``verbose=True`` in ``fit``.
        If *None* is passed, a logger that writes to ``sys.stdout`` will be
        used.
    solver : string, 'cd' | 'pyc'
        If cd, will use the ``scikit-learn`` lasso implementation (via
        coordinate descent). If pyc, will use pyclasso, built off of the
        pycasso path-wise solver.

    Attributes
    ----------
    coef_ : nd-array, shape (n_features,) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.
    intercept_ : float
        Independent term in the linear model.
    supports_ : array of bool
        Boolean array indicating whether a given regressor (column) is
        selected for estimation for a given regularization parameter value
        (row).

    Raises
    ------
    ImportError
        If ``solver='pyc'`` is requested but pycasso is not installed.
    ValueError
        If ``solver`` is neither ``'cd'`` nor ``'pyc'``.
    """

    def __init__(self, n_boots_sel=24, n_boots_est=24, selection_frac=0.9,
                 estimation_frac=0.9, n_lambdas=48, stability_selection=1.,
                 estimation_score='r2', estimation_target=None, eps=1e-3,
                 warm_start=True, copy_X=True, fit_intercept=True,
                 standardize=True, max_iter=1000, tol=1e-4,
                 random_state=None, comm=None, logger=None, solver='cd'):
        super().__init__(
            n_boots_sel=n_boots_sel,
            n_boots_est=n_boots_est,
            selection_frac=selection_frac,
            estimation_frac=estimation_frac,
            estimation_target=estimation_target,
            stability_selection=stability_selection,
            copy_X=copy_X,
            fit_intercept=fit_intercept,
            standardize=standardize,
            random_state=random_state,
            comm=comm,
            estimation_score=estimation_score,
            max_iter=max_iter,
            tol=tol,
            logger=logger)
        self.n_lambdas = n_lambdas
        self.eps = eps
        # Stored so every constructor argument is available as an attribute
        # of the same name (the scikit-learn estimator convention).
        self.warm_start = warm_start
        self.solver = solver
        self.tol = tol

        if solver == 'cd':
            self._selection_lm = Lasso(
                max_iter=max_iter,
                tol=tol,
                warm_start=warm_start,
                random_state=random_state,
                fit_intercept=fit_intercept)
        elif solver == 'pyc':
            if pycasso is None:
                raise ImportError('pycasso is not installed.')
            self._selection_lm = PycLasso(
                fit_intercept=fit_intercept,
                max_iter=max_iter,
                tol=tol)
        else:
            # Without this, ``_selection_lm`` would be left undefined and
            # the mistake would only surface later as an AttributeError.
            raise ValueError(
                "solver must be 'cd' or 'pyc'; got %r." % (solver,))

        self._estimation_lm = LinearRegression(fit_intercept=fit_intercept)

    def get_reg_params(self, X, y):
        """Compute the regularization parameters for the selection sweep.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector.

        Returns
        -------
        reg_params : list of dict
            One ``{'alpha': value}`` dictionary per value produced by
            scikit-learn's ``_alpha_grid`` using ``eps`` and ``n_lambdas``.
        """
        alphas = _alpha_grid(
            X=X, y=y,
            l1_ratio=1.0,
            fit_intercept=self.fit_intercept,
            eps=self.eps,
            n_alphas=self.n_lambdas)
        return [{'alpha': a} for a in alphas]

    def uoi_selection_sweep(self, X, y, reg_param_values):
        """Overwrite base class selection sweep to accommodate pycasso
        path-wise solution.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector.
        reg_param_values : list of dict
            Regularization parameters, each with an ``'alpha'`` key, as
            produced by ``get_reg_params``.

        Returns
        -------
        coefs : ndarray
            For ``solver='pyc'``, the selection model's ``coef_`` after a
            single fit over all alphas. Otherwise, whatever the base class
            sweep returns.
        """
        if self.solver != 'pyc':
            return super().uoi_selection_sweep(X, y, reg_param_values)

        # pycasso solves the whole path in one call, so hand it every alpha
        # at once instead of refitting per value.
        alphas = np.array([reg_param['alpha']
                           for reg_param in reg_param_values])
        self._selection_lm.set_params(alphas=alphas)
        self._selection_lm.fit(X, y)
        return self._selection_lm.coef_