import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.linear_model._coordinate_descent import _alpha_grid
try:
import pycasso
except ImportError:
pycasso = None
from .base import AbstractUoILinearRegressor
class PycLasso:
    """Lasso using the pycasso solver.

    Solves for an entire regularization path at once.

    Parameters
    ----------
    alphas : nd-array
        The regularization path. Defaults to None for compatibility with UoI,
        but needs to be set prior to fitting.
    fit_intercept : bool
        Whether to calculate the intercept for this model. If set to ``False``,
        no intercept will be used in calculations.
    max_iter : int
        Maximum number of iterations for pycasso solver.

    Attributes
    ----------
    coef_ : ndarray
        Coefficients across the entire solution path, taken from the
        ``'beta'`` entry of the pycasso solver result. Set by ``fit``.
    intercept_ : ndarray or float
        Intercept term(s), taken from the ``'intercept'`` entry of the
        pycasso solver result. Set by ``fit``.
    isfitted : bool
        Whether ``fit`` has completed; ``predict`` refuses to run otherwise.
    """

    def __init__(self, alphas=None, fit_intercept=False, max_iter=1000):
        self.max_iter = max_iter
        self.fit_intercept = fit_intercept
        self.alphas = alphas

        # Flag to prevent us from predicting before fitting
        self.isfitted = False

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Parameters
        ----------
        **kwargs
            Any of ``alphas``, ``fit_intercept`` or ``max_iter``.

        Returns
        -------
        self : PycLasso
            This estimator, so that calls can be chained.

        Raises
        ------
        ValueError
            If a keyword is not one of the valid parameter names. Parameters
            that precede the invalid one in ``kwargs`` have already been set.
        """
        valid_params = ('alphas', 'fit_intercept', 'max_iter')
        for key, value in kwargs.items():
            if key not in valid_params:
                raise ValueError('Invalid parameter %s' % key)
            setattr(self, key, value)
        return self

    def predict(self, X):
        """Predict responses given a design matrix.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.

        Returns
        -------
        y : ndarray
            ``X`` multiplied by the transposed fitted coefficients, plus the
            fitted intercept.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called successfully yet.
        """
        if not self.isfitted:
            raise NotFittedError('Estimator is not fit.')
        return np.matmul(X, self.coef_.T) + self.intercept_

    def fit(self, X, y):
        """Fit data according to the pycasso object.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector. Will be cast to X's dtype if necessary.
            Currently, this implementation does not handle multiple response
            variables.

        Returns
        -------
        self : PycLasso
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``alphas`` has not been set.
        ImportError
            If the optional ``pycasso`` dependency is not available.
        """
        if self.alphas is None:
            raise ValueError('Set alphas before fitting.')
        # The module-level import is optional; fail with a clear message
        # instead of an AttributeError on ``None``.
        if pycasso is None:
            raise ImportError('pycasso is not installed.')

        self.solver = pycasso.Solver(X, y, family='gaussian',
                                     useintercept=self.fit_intercept,
                                     lambdas=self.alphas,
                                     penalty='l1',
                                     max_ite=self.max_iter)
        self.solver.train()

        # Coefs across the entire solution path
        self.coef_ = self.solver.result['beta']
        self.intercept_ = self.solver.result['intercept']
        self.isfitted = True
        return self
class UoI_Lasso(AbstractUoILinearRegressor, LinearRegression):
    r"""UoI\ :sub:`Lasso` solver.

    Parameters
    ----------
    n_boots_sel : int
        The number of data bootstraps/resamples to use in the selection module.
        Increasing this number will make selection more strict.
    n_boots_est : int
        The number of data bootstraps/resamples to use in the estimation
        module. Increasing this number will relax selection and decrease
        variance.
    n_lambdas : int
        The number of regularization values to use for selection.
    selection_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the selection module. Small values of this parameter
        imply larger "perturbations" to the dataset.
    estimation_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the estimation module. The remaining data is used
        to obtain validation scores. Small values of this parameter imply
        larger "perturbations" to the dataset.
    stability_selection : int, float, or array-like
        If int, treated as the number of bootstraps that a feature must
        appear in order to guarantee placement in selection profile. If float,
        must be between 0 and 1, and is instead the proportion of
        bootstraps. If array-like, must consist of either ints or floats
        between 0 and 1. In this case, each entry in the array-like object
        will act as a separate threshold for placement in the selection
        profile.
    estimation_score : string, "r2" | "AIC" | "AICc" | "BIC"
        Objective used to choose the best estimates per bootstrap.
    estimation_target : string, "train" | "test"
        Decide whether to assess the estimation_score on the train
        or test data across each bootstrap. By default, a sensible
        choice is made based on the chosen estimation_score.
    warm_start : bool
        When set to ``True``, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution. Only
        forwarded to the selection model when ``solver='cd'``.
    eps : float
        Length of the lasso path. ``eps=1e-3`` means that
        ``lambda_min / lambda_max = 1e-3``.
    copy_X : bool
        If ``True``, X will be copied; else, it may be overwritten.
    fit_intercept : bool
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    standardize : bool
        If True, the regressors X will be standardized before regression by
        subtracting the mean and dividing by their standard deviations. This
        parameter is equivalent to ``normalize`` in ``scikit-learn`` models.
    max_iter : int
        Maximum number of iterations for iterative fitting methods.
    random_state : int, RandomState instance, or None
        The seed of the pseudo random number generator that selects a random
        feature to update. If int, random_state is the seed used by the random
        number generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by ``np.random``.
    comm : MPI communicator
        If passed, the selection and estimation steps are parallelized.
    logger : Logger
        The logger to use for messages when ``verbose=True`` in ``fit``.
        If *None* is passed, a logger that writes to ``sys.stdout`` will be
        used.
    solver : string, 'cd' | 'pyc'
        If cd, will use the ``scikit-learn`` lasso implementation (via
        coordinate descent). If pyc, will use pyclasso, built off of the
        pycasso path-wise solver.

    Attributes
    ----------
    coef_ : nd-array, shape (n_features,) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.
    intercept_ : float
        Independent term in the linear model.
    supports_ : array
        boolean array indicating whether a given regressor (column) is selected
        for estimation for a given regularization parameter value (row).

    Raises
    ------
    ImportError
        If ``solver='pyc'`` is requested but ``pycasso`` is not installed.
    ValueError
        If ``solver`` is neither ``'cd'`` nor ``'pyc'``.
    """

    def __init__(self, n_boots_sel=24, n_boots_est=24, selection_frac=0.9,
                 estimation_frac=0.9, n_lambdas=48, stability_selection=1.,
                 estimation_score='r2', estimation_target=None, eps=1e-3,
                 warm_start=True, copy_X=True, fit_intercept=True,
                 standardize=True, max_iter=1000, random_state=None,
                 comm=None, logger=None,
                 solver='cd'):
        super().__init__(
            n_boots_sel=n_boots_sel,
            n_boots_est=n_boots_est,
            selection_frac=selection_frac,
            estimation_frac=estimation_frac,
            estimation_target=estimation_target,
            stability_selection=stability_selection,
            copy_X=copy_X,
            fit_intercept=fit_intercept,
            standardize=standardize,
            random_state=random_state,
            comm=comm,
            estimation_score=estimation_score,
            max_iter=max_iter,
            logger=logger)
        self.n_lambdas = n_lambdas
        self.eps = eps
        # Keep every constructor argument as an attribute so that
        # scikit-learn's parameter introspection can read it back.
        self.warm_start = warm_start
        self.solver = solver

        if solver == 'cd':
            self._selection_lm = Lasso(
                max_iter=max_iter,
                warm_start=warm_start,
                random_state=random_state,
                fit_intercept=fit_intercept)
        elif solver == 'pyc':
            if pycasso is None:
                raise ImportError('pycasso is not installed.')
            self._selection_lm = PycLasso(
                fit_intercept=fit_intercept,
                max_iter=max_iter)
        else:
            # Without this, ``_selection_lm`` would silently stay undefined
            # and the failure would only surface later during fitting.
            raise ValueError(
                "Unknown solver %r; expected 'cd' or 'pyc'." % (solver,))

        self._estimation_lm = LinearRegression(fit_intercept=fit_intercept)

    def get_reg_params(self, X, y):
        """Build the regularization parameters for the selection sweep.

        The grid is computed by scikit-learn's ``_alpha_grid`` with
        ``l1_ratio=1.0``, using this estimator's ``fit_intercept``, ``eps``
        and ``n_lambdas`` settings.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector.

        Returns
        -------
        reg_params : list of dict
            One ``{'alpha': value}`` dictionary per grid value.
        """
        alphas = _alpha_grid(
            X=X, y=y,
            l1_ratio=1.0,
            fit_intercept=self.fit_intercept,
            eps=self.eps,
            n_alphas=self.n_lambdas)
        return [{'alpha': a} for a in alphas]

    def uoi_selection_sweep(self, X, y, reg_param_values):
        """Overwrite base class selection sweep to accommodate pycasso
        path-wise solution.

        With ``solver='pyc'`` the ``'alpha'`` entries of ``reg_param_values``
        are gathered into one array, handed to the selection model, and the
        whole path is fit in a single call. Any other solver defers to the
        base class implementation.

        Parameters
        ----------
        X : ndarray, (n_samples, n_features)
            The design matrix.
        y : ndarray, shape (n_samples,)
            Response vector.
        reg_param_values : list of dict
            Regularization parameters, each holding an ``'alpha'`` key.

        Returns
        -------
        coefs : ndarray
            For ``solver='pyc'``, the selection model's ``coef_`` after
            fitting; otherwise whatever the base class sweep returns.
        """
        if self.solver != 'pyc':
            return super().uoi_selection_sweep(X, y, reg_param_values)

        alphas = np.array([reg_param['alpha']
                           for reg_param in reg_param_values])
        self._selection_lm.set_params(alphas=alphas)
        self._selection_lm.fit(X, y)
        return self._selection_lm.coef_