Source code for pyuoi.linear_model.poisson

import numpy as np

from .base import AbstractUoIGeneralizedLinearRegressor

from pyuoi import utils

from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted

from ..lbfgs import fmin_lbfgs


class Poisson(BaseEstimator):
    r"""Generalized Linear Model with exponential link function (i.e.,
    Poisson) trained with an L1/L2 regularizer (i.e., elastic net penalty).

    The log-likelihood of the Poisson GLM is optimized by performing
    coordinate descent on a linearized quadratic approximation. See Chapter 5
    of Hastie, Tibshirani, and Wainwright (2016) for more details.

    Parameters
    ----------
    alpha : float
        Constant that multiplies the L1 term. Defaults to 1.0.

    l1_ratio : float
        Float between 0 and 1 acting as a scaling between l1 and l2
        penalties. For ``l1_ratio = 0`` the penalty is an L2 penalty. For
        ``l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the
        penalty is a combination of L1 and L2 penalties.

    fit_intercept : bool
        Whether to fit an intercept or not.

    standardize : bool
        If ``True``, centers the design matrix across samples and rescales
        the features to have standard deviation of 1.

    max_iter : int
        The maximum number of iterations for the solver.

    tol : float
        The tolerance for the optimization: if the updates are smaller than
        ``tol``, the optimization code checks the dual gap for optimality and
        continues until it is smaller than ``tol``.

    warm_start : bool
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    solver : string, 'lbfgs' | 'cd'
        The solver to use. Options are 'lbfgs' (orthant-wise LBFGS) and 'cd'
        (coordinate descent).

    Attributes
    ----------
    coef_ : ndarray, shape (n_features,)
        The fitted parameter vector.

    intercept_ : float
        The fitted intercept.
    """
    def __init__(self, alpha=1.0, l1_ratio=1., fit_intercept=True,
                 standardize=False, max_iter=1000, tol=1e-5,
                 warm_start=False, solver='lbfgs'):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.fit_intercept = fit_intercept
        self.standardize = standardize
        self.max_iter = max_iter
        self.tol = tol
        self.warm_start = warm_start
        self.solver = solver
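
    # Editor's note (a sketch of the objective, inferred from
    # ``_poisson_loss_and_grad`` below; not part of the original source).
    # With eta = intercept + X @ coef and sample weights w, the penalized
    # negative log-likelihood being minimized is
    #
    #   -(1 / n) * sum_i w_i * (y_i * eta_i - exp(eta_i))
    #       + alpha * l1_ratio * ||coef||_1
    #       + 0.5 * alpha * (1 - l1_ratio) * ||coef||_2 ** 2,
    #
    # where the intercept is left unpenalized.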

    def fit(self, X, y, sample_weight=None):
        """Fit the Poisson GLM.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            The design matrix.

        y : ndarray, shape (n_samples,)
            Response vector. Will be cast to X's dtype if necessary.
            Currently, this implementation does not handle multiple response
            variables.

        sample_weight : array-like, shape (n_samples,)
            Array of weights assigned to the individual samples. If ``None``,
            then each sample is provided an equal weight.
        """
        self.n_samples, self.n_features = X.shape
        X, y = self._pre_fit(X, y)

        # check if the response variable is constant
        if np.unique(y).size == 1:
            self.coef_ = np.zeros(self.n_features)
            if np.any(y):
                self.intercept_ = np.log(y.mean())
            else:
                self.intercept_ = -np.inf
            return self

        if self.solver == 'lbfgs':
            coef = np.zeros(self.n_features)
            intercept = 0

            if self.warm_start:
                if hasattr(self, 'coef_'):
                    if self.standardize:
                        coef = self.coef_ * self._X_scaler.scale_
                    else:
                        coef = self.coef_
                if hasattr(self, 'intercept_') and self.fit_intercept:
                    intercept = self.intercept_

            if self.fit_intercept:
                coef = np.append(coef, intercept)

            # create lbfgs function
            def func(x, g, *args):
                loss, grad = _poisson_loss_and_grad(x, *args)
                g[:] = grad
                return loss

            l1_penalty = self.alpha * self.l1_ratio
            l2_penalty = self.alpha * (1 - self.l1_ratio)

            # orthant-wise lbfgs optimization
            coef = fmin_lbfgs(func, coef,
                              orthantwise_c=l1_penalty,
                              args=(X, y, l2_penalty, sample_weight),
                              max_iterations=self.max_iter,
                              epsilon=self.tol,
                              orthantwise_end=self.n_features)

            if self.fit_intercept:
                self.coef_ = coef[:self.n_features]
                self.intercept_ = coef[-1]
            else:
                self.coef_ = coef

        # coordinate descent
        elif self.solver == 'cd':
            self.coef_, intercept = self._cd(X=X, y=y,
                                             sample_weight=sample_weight)
            if self.fit_intercept:
                self.intercept_ = intercept
        else:
            raise ValueError('Solver not available.')

        self._post_fit(X, y)
        return self

    def predict(self, X):
        """Predicts the response variable given a design matrix. The output
        is the mode of the Poisson distribution.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            Design matrix to predict on.

        Returns
        -------
        mode : array_like, shape (n_samples,)
            The predicted response values, i.e. the modes.
        """
        check_is_fitted(self, ['coef_', 'intercept_'])
        mu = np.exp(self.intercept_ + np.dot(X, self.coef_))
        mode = np.floor(mu)
        return mode

    def predict_mean(self, X):
        """Calculates the mean response variable given a design matrix.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            Design matrix to predict on.

        Returns
        -------
        mu : array_like, shape (n_samples,)
            The predicted response values, i.e. the conditional means.
        """
        check_is_fitted(self, ['coef_', 'intercept_'])
        mu = np.exp(self.intercept_ + np.dot(X, self.coef_))
        return mu
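
    # Editor's note (worked example, not part of the original source): for a
    # conditional mean mu = 2.5, ``predict_mean`` returns 2.5 while
    # ``predict`` returns floor(2.5) = 2, the mode of a Poisson(2.5)
    # distribution.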

    def _cd(self, X, y, sample_weight=None):
        """Performs coordinate descent on a dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            The design matrix.

        y : ndarray, shape (n_samples,)
            The response vector.

        sample_weight : array-like, shape (n_samples,), default None
            Array of weights assigned to the individual samples. If None,
            then each sample is provided an equal weight.

        Returns
        -------
        coef : ndarray, shape (n_features,)
            The fitted coefficients.

        intercept : float
            The fitted intercept. If fit_intercept = False, this will be
            equal to zero.
        """
        n_samples, n_features = X.shape
        if sample_weight is None:
            sample_weight = np.ones(n_samples)

        # initialization of coef and intercepts
        coef = np.zeros(n_features)
        intercept = 0

        # warm start coefficients, if necessary
        if self.warm_start:
            if hasattr(self, 'coef_'):
                if self.standardize:
                    coef = self.coef_ * self._X_scaler.scale_
                else:
                    coef = self.coef_
            if self.fit_intercept:
                if hasattr(self, 'intercept_'):
                    intercept = self.intercept_

        # every coefficient is active unless there's a warm start
        if self.warm_start:
            active_idx = np.argwhere(coef != 0).ravel()
        else:
            active_idx = np.arange(n_features)

        prev_coef = np.copy(coef)

        # perform coordinate descent updates
        for iteration in range(self.max_iter):
            # linearize the log-likelihood
            w, z = self.adjusted_response(X, y, coef, intercept)
            # rescale weights by sample_weight
            w *= sample_weight

            # perform an update of coordinate descent
            coef, intercept = self._cd_sweep(
                coef=coef, intercept=intercept, X=X, w=w, z=z,
                active_idx=active_idx)

            # check convergence
            if np.max(np.abs(prev_coef - coef)) < self.tol:
                break

            prev_coef = np.copy(coef)
            # update the active features
            active_idx = np.argwhere(coef != 0).ravel()

        return coef, intercept

    def _cd_sweep(self, coef, X, w, z, active_idx, intercept=0):
        """Performs one sweep of coordinate descent updates over a set of
        'active' features.

        Parameters
        ----------
        coef : ndarray, shape (n_features,)
            The current estimates of the parameters.

        X : ndarray, shape (n_samples, n_features)
            The design matrix.

        w : ndarray, shape (n_samples,)
            The weights applied to each sample after linearization of the
            log-likelihood.

        z : ndarray, shape (n_samples,)
            The working response after linearization of the log-likelihood.

        active_idx : ndarray, shape (n_features,)
            An array of ints denoting the indices of features that should be
            updated.

        intercept : float
            The current estimate of the intercept.

        Returns
        -------
        coef : ndarray, shape (n_features,)
            The updated parameters after one coordinate descent sweep.

        intercept : float
            The updated intercept after one coordinate descent sweep.
        """
        n_features = coef.size
        n_samples = X.shape[0]

        # intercept is not penalized
        if self.fit_intercept:
            z_hat = np.dot(X, coef)
            residuals = z - z_hat
            # update intercept
            intercept = np.dot(w, residuals) / np.sum(w)
        else:
            intercept = 0

        # iterate over the active features
        for idx in active_idx:
            # remove the current feature from the update rules
            mask = np.ones(n_features)
            mask[idx] = 0
            z_hat = intercept + np.dot(X, coef * mask)
            x = X[:, idx]

            # equation 5.44; Hastie, Tibshirani, Wainwright (2016)
            num_update = np.dot(w, x * (z - z_hat)) / n_samples
            den_update = np.dot(w, x**2) / n_samples

            # replace coefficients sequentially
            coef[idx] = \
                self.soft_threshold(num_update, self.l1_ratio * self.alpha) \
                / (den_update + self.alpha * (1 - self.l1_ratio))

        return coef, intercept

    def _pre_fit(self, X, y):
        """Perform standardization, if needed, before fitting."""
        if self.standardize:
            self._X_scaler = StandardScaler()
            X = self._X_scaler.fit_transform(X)
        return X, y

    def _post_fit(self, X, y):
        """Rescale coefficients, if needed, after fitting."""
        if self.standardize:
            sX = self._X_scaler
            if self.fit_intercept:
                self.intercept_ -= np.sum((sX.mean_ * self.coef_)
                                          / sX.scale_)
            self.coef_ /= sX.scale_

    @staticmethod
    def soft_threshold(X, threshold):
        """Performs the soft-thresholding necessary for coordinate descent
        lasso updates.

        Parameters
        ----------
        X : array-like or float
            Value(s) to be thresholded.

        threshold : float
            Soft threshold.

        Returns
        -------
        X_soft_threshold : array-like or float
            Soft thresholded X.
        """
        X_thresholded = (np.abs(X) > threshold).astype('int')
        X_soft_threshold = np.sign(X) * (np.abs(X) - threshold) * X_thresholded
        return X_soft_threshold
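
    # Editor's note (worked values, not part of the original source): with
    # threshold = 1.0, ``soft_threshold`` maps 3.0 -> 2.0, -2.5 -> -1.5, and
    # -0.5 -> 0.0; entries within the threshold are zeroed exactly, which is
    # what produces sparse coordinate descent solutions.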

    @staticmethod
    def adjusted_response(X, y, coef, intercept=0):
        """Calculates the adjusted response when posing the fitting procedure
        as Iteratively Reweighted Least Squares (Newton update on the log
        likelihood).

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The design matrix.

        y : array-like, shape (n_samples,)
            The response vector.

        coef : array-like, shape (n_features,)
            Current estimate of the parameters.

        intercept : float
            The current estimate of the intercept.

        Returns
        -------
        w : array-like, shape (n_samples,)
            Weights for samples. The log-likelihood for a GLM, when posed as
            a linear regression problem, requires reweighting the samples.

        z : array-like, shape (n_samples,)
            Working response. The linearized response when rewriting
            coordinate descent for the Poisson likelihood as iteratively
            reweighted least squares.
        """
        Xbeta = intercept + np.dot(X, coef)
        w = np.exp(Xbeta)
        z = Xbeta + (y / w) - 1.
        return w, z
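
# Editor's note: a minimal usage sketch for ``Poisson`` (not part of the
# original module). The synthetic data and hyperparameter values below are
# illustrative assumptions; call the function manually to try the estimator.
def _demo_poisson(seed=0):
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(500, 5))
    beta = np.array([1.0, 0.0, -0.5, 0.0, 0.25])
    # draw counts from the model's own likelihood
    y = rng.poisson(np.exp(np.dot(X, beta)))
    model = Poisson(alpha=0.05, l1_ratio=0.5, solver='lbfgs')
    model.fit(X, y)
    # conditional means for the training data
    mu_hat = model.predict_mean(X)
    return model.coef_, model.intercept_, mu_hat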

class UoI_Poisson(AbstractUoIGeneralizedLinearRegressor, Poisson):
    r"""UoI\ :sub:`Poisson` solver.

    Parameters
    ----------
    n_boots_sel : int
        The number of data bootstraps to use in the selection module.
        Increasing this number will make selection more strict.

    n_boots_est : int
        The number of data bootstraps to use in the estimation module.
        Increasing this number will relax selection and decrease variance.

    selection_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the selection module. Small values of this
        parameter imply larger "perturbations" to the dataset.

    estimation_frac : float
        The fraction of the dataset to use for training in each resampled
        bootstrap, during the estimation module. The remaining data is used
        to obtain validation scores. Small values of this parameter imply
        larger "perturbations" to the dataset.

    n_lambdas : int
        The number of regularization values to use for selection.

    alphas : list or ndarray of floats
        The parameter that trades off L1 versus L2 regularization for a
        given lambda.

    stability_selection : int, float, or array-like
        If int, treated as the number of bootstraps that a feature must
        appear in to guarantee placement in the selection profile. If float,
        must be between 0 and 1, and is instead the proportion of
        bootstraps. If array-like, must consist of either ints or floats
        between 0 and 1. In this case, each entry in the array-like object
        will act as a separate threshold for placement in the selection
        profile.

    estimation_score : string, "log" | "AIC" | "AICc" | "BIC"
        Objective used to choose the best estimates per bootstrap.

    estimation_target : string, "train" | "test"
        Decide whether to assess the estimation_score on the train
        or test data across each bootstrap. By default, a sensible choice
        is made based on the chosen ``estimation_score``.

    solver : string, 'lbfgs' | 'cd'
        The solver to use. Options are 'lbfgs' (orthant-wise LBFGS) and 'cd'
        (coordinate descent).

    warm_start : bool
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    eps : float
        Length of the regularization path. ``eps=1e-3`` means that
        ``lambda_min / lambda_max = 1e-3``.

    tol : float
        Stopping criterion for the solver.

    copy_X : bool
        If ``True``, X will be copied; else, it may be overwritten.

    fit_intercept : bool
        Whether to calculate the intercept for this model. If set to
        ``False``, no intercept will be used in calculations.

    standardize : bool
        If ``True``, the regressors X will be standardized before regression
        by subtracting the mean and dividing by their standard deviations.

    max_iter : int
        Maximum number of iterations for iterative fitting methods.

    random_state : int, RandomState instance, or None
        The seed of the pseudo random number generator that selects a random
        feature to update. If int, random_state is the seed used by the
        random number generator; if RandomState instance, random_state is
        the random number generator; if None, the random number generator is
        the RandomState instance used by ``np.random``.

    comm : MPI communicator
        If passed, the selection and estimation steps are parallelized.

    logger : Logger
        The logger to use for messages when ``verbose=True`` in ``fit``. If
        *None* is passed, a logger that writes to ``sys.stdout`` will be
        used.

    Attributes
    ----------
    coef_ : ndarray, shape (n_features,) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.

    intercept_ : float
        Independent term in the linear model.

    supports_ : ndarray, shape (n_supports, n_features)
        Boolean array indicating whether a given regressor (column) is
        selected for estimation for a given regularization parameter value
        (row).
    """
    def __init__(self, n_boots_sel=24, n_boots_est=24, n_lambdas=48,
                 alphas=np.array([1.]), selection_frac=0.8,
                 estimation_frac=0.8, stability_selection=1.,
                 estimation_score='log', estimation_target=None,
                 solver='lbfgs', warm_start=True, eps=1e-3, tol=1e-5,
                 copy_X=True, fit_intercept=True, standardize=True,
                 max_iter=1000, random_state=None, comm=None, logger=None):
        super(UoI_Poisson, self).__init__(
            n_boots_sel=n_boots_sel,
            n_boots_est=n_boots_est,
            selection_frac=selection_frac,
            estimation_frac=estimation_frac,
            stability_selection=stability_selection,
            estimation_score=estimation_score,
            estimation_target=estimation_target,
            copy_X=copy_X,
            fit_intercept=fit_intercept,
            random_state=random_state,
            comm=comm,
            logger=logger)

        self.n_lambdas = n_lambdas
        self.alphas = alphas
        self.n_alphas = len(alphas)
        self.warm_start = warm_start
        self.eps = eps
        self.lambdas = None
        self._selection_lm = Poisson(
            fit_intercept=fit_intercept,
            standardize=standardize,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            solver=solver)
        # estimation is a Poisson regression with no regularization
        self._estimation_lm = Poisson(
            alpha=0.,
            l1_ratio=1.,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            warm_start=False,
            solver=solver)
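
    # Editor's note (not part of the original source): selection sweeps the
    # penalized ``Poisson`` solver over the (lambda, l1_ratio) grid built in
    # ``get_reg_params`` below, while estimation refits each selected
    # support with ``alpha=0.``, i.e. an unregularized Poisson GLM.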

    def get_reg_params(self, X, y):
        r"""Calculates the regularization parameters (alpha and lambda) to be
        used for the provided data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The design matrix.

        y : array-like, shape (n_samples,)
            The response vector.

        Returns
        -------
        reg_params : a list of dictionaries
            A list containing dictionaries with the value of each
            (lambda, alpha) describing the type of regularization to impose.
            The keys adhere to scikit-learn's terminology (lambda->alpha,
            alpha->l1_ratio). This allows easy passing into the ``Poisson``
            object.
        """
        n_samples = X.shape[0]

        if self.lambdas is None:
            self.lambdas = np.zeros((self.n_alphas, self.n_lambdas))
            # a set of lambdas is generated for each alpha value (l1_ratio
            # in scikit-learn parlance)
            for alpha_idx, alpha in enumerate(self.alphas):
                # calculate upper bound for lambda sweep
                ybar = y.mean()
                lambda_max = np.max(np.abs(np.dot(X.T, y - ybar)))
                lambda_max /= n_samples * alpha
                self.lambdas[alpha_idx, :] = np.logspace(
                    start=np.log10(lambda_max),
                    stop=np.log10(self.eps * lambda_max),
                    num=self.n_lambdas)

        # place the regularization parameters into a list of dictionaries
        reg_params = list()
        for alpha_idx, alpha in enumerate(self.alphas):
            for lamb_idx, lamb in enumerate(self.lambdas[alpha_idx]):
                # reset the regularization parameter
                reg_params.append(dict(alpha=lamb, l1_ratio=alpha))

        return reg_params
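
    # Editor's note (illustrative numbers, not part of the original source):
    # with eps = 1e-3 and n_lambdas = 48, a lambda_max of, say, 2.0 produces
    # 48 log-spaced penalties running from 2.0 down to 2e-3, each emitted as
    # dict(alpha=lamb, l1_ratio=alpha) for the selection sweep.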

    def _score_predictions(self, metric, fitter, X, y, support,
                           boot_idxs=None):
        """Score, according to some metric, predictions provided by a model.

        The resulting score will be negated if an information criterion is
        specified.

        Parameters
        ----------
        metric : string
            The type of score to run on the prediction. Valid options
            include 'log' (log-likelihood), 'BIC' (Bayesian information
            criterion), 'AIC' (Akaike information criterion), and 'AICc'
            (corrected AIC).

        fitter : Poisson object
            The Poisson object that has been fit to the data with the
            respective hyperparameters.

        X : ndarray, shape (n_samples, n_features)
            The design matrix.

        y : ndarray, shape (n_samples,)
            The response vector.

        support : ndarray
            The indices of the non-zero features.

        boot_idxs : 2-tuple of array-like objects
            Tuple of (train_idxs, test_idxs) generated from a bootstrap
            sample. If this is specified, then the appropriate set of data
            will be used for evaluating scores: test data for the
            log-likelihood, and training data for the information criteria.

        Returns
        -------
        score : float
            The score.
        """
        # select the appropriate data split for this estimation target
        if boot_idxs is not None:
            X = X[boot_idxs[self._estimation_target]]
            y = y[boot_idxs[self._estimation_target]]

        # for Poisson, use predict_mean to calculate the "predicted" values
        y_pred = fitter.predict_mean(X[:, support])
        # calculate the log-likelihood
        ll = utils.log_likelihood_glm(model='poisson',
                                      y_true=y,
                                      y_pred=y_pred)
        if metric == 'log':
            score = ll
        # information criteria
        else:
            n_features = np.count_nonzero(support)
            if fitter.intercept_ != 0:
                n_features += 1
            n_samples = X.shape[0]

            if metric == 'BIC':
                score = utils.BIC(n_samples * ll, n_features, n_samples)
            elif metric == 'AIC':
                score = utils.AIC(n_samples * ll, n_features)
            elif metric == 'AICc':
                score = utils.AICc(n_samples * ll, n_features, n_samples)
            else:
                raise ValueError(metric + ' is not a valid metric.')

            # negate the score since a lower information criterion is
            # preferable
            score = -score

        return score

    def _pre_fit(self, X, y):
        """Perform standardization, if needed, before fitting."""
        if self.standardize:
            self._X_scaler = StandardScaler()
            X = self._X_scaler.fit_transform(X)
        if y.ndim == 2:
            self.output_dim = y.shape[1]
        else:
            self.output_dim = 1
        return X, y

    def _post_fit(self, X, y):
        """Rescale coefficients, if needed, after fitting."""
        if self.output_dim == 1:
            self.coef_ = np.squeeze(self.coef_)
        if self.standardize:
            sX = self._X_scaler
            if self.fit_intercept:
                self.intercept_ -= np.sum((sX.mean_ * self.coef_)
                                          / sX.scale_)
            self.coef_ /= sX.scale_

    def _fit_intercept(self, X, y):
        """Fit a model with an intercept and fixed coefficients.

        This is used to re-fit the intercept after the coefficients are
        estimated.
        """
        if self.fit_intercept:
            mu = np.exp(np.dot(X, self.coef_.T))
            if np.any(y):
                self.intercept_ = np.log(
                    np.mean(y, keepdims=True) / np.mean(mu)
                )
            else:
                self.intercept_ = np.array(-np.inf)
        else:
            self.intercept_ = np.zeros(1)

    def _fit_intercept_no_features(self, y):
        """Fit a model with only an intercept.

        This is used in cases where the model has no support selected.
        """
        return PoissonInterceptFitterNoFeatures(y)
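
# Editor's note: a minimal end-to-end sketch for ``UoI_Poisson`` (not part
# of the original module). The data, seed, and small bootstrap counts are
# assumptions chosen to keep the sketch fast; call it manually.
def _demo_uoi_poisson(seed=0):
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(200, 4))
    y = rng.poisson(np.exp(np.dot(X, np.array([0.5, 0.0, -0.5, 0.0]))))
    uoi = UoI_Poisson(n_boots_sel=5, n_boots_est=5, n_lambdas=10,
                      random_state=seed)
    uoi.fit(X, y)
    # sparse union-of-intersections estimates and the candidate supports
    return uoi.coef_, uoi.supports_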

class PoissonInterceptFitterNoFeatures(object):
    def __init__(self, y):
        if np.any(y):
            self.intercept_ = np.log(y.mean())
        else:
            self.intercept_ = -np.inf

    def predict(self, X):
        """Predicts the response variable given a design matrix. The output
        is the mode of the Poisson distribution.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            Design matrix to predict on.

        Returns
        -------
        mode : float
            The predicted response value, i.e. the mode. Since the model has
            no features, the prediction is the same for every sample.
        """
        mu = np.exp(self.intercept_)
        mode = np.floor(mu)
        return mode

    def predict_mean(self, X):
        """Calculates the mean response variable given a design matrix.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            Design matrix to predict on.

        Returns
        -------
        mu : float
            The predicted response value, i.e. the conditional mean. Since
            the model has no features, the prediction is the same for every
            sample.
        """
        mu = np.exp(self.intercept_)
        return mu


def _poisson_loss_and_grad(coef, X, y, l2_penalty, sample_weight=None):
    """Computes the Poisson loss and gradient.

    Parameters
    ----------
    coef : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : array-like, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Response vector.

    l2_penalty : float
        Regularization parameter on the l2-norm.

    sample_weight : array-like, shape (n_samples,), default None
        Array of weights assigned to the individual samples. If None, then
        each sample is provided an equal weight.

    Returns
    -------
    out : float
        Negative log-likelihood of the Poisson GLM.

    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Poisson gradient.
    """
    n_samples, n_features = X.shape
    grad = np.empty_like(coef)

    if sample_weight is None:
        sample_weight = np.ones(n_samples)

    # extract intercept
    if grad.shape[0] > n_features:
        intercept = coef[-1]
        coef = coef[:n_features]
    else:
        intercept = 0

    # calculate likelihood
    eta = intercept + np.dot(X, coef)
    out = -np.sum(sample_weight * (y * eta - np.exp(eta))) / n_samples
    out += 0.5 * l2_penalty * np.dot(coef, coef)

    # gradient of parameters
    y_res = sample_weight * (y - np.exp(eta))
    grad[:n_features] = -np.dot(X.T, y_res) / n_samples + l2_penalty * coef

    # gradient of intercept
    if grad.shape[0] > n_features:
        grad[-1] = -np.mean(y_res)

    return out, grad
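
# Editor's note: a finite-difference check of ``_poisson_loss_and_grad``
# (not part of the original module); a quick way to sanity-check the
# analytic gradient used by the orthant-wise LBFGS solver.
def _check_poisson_grad(seed=0, eps=1e-6):
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(50, 3))
    y = rng.poisson(1.0, size=50).astype(float)
    # three features plus an intercept appended at the end
    coef = rng.normal(size=4)
    _, grad = _poisson_loss_and_grad(coef, X, y, l2_penalty=0.1)
    for j in range(coef.size):
        step = np.zeros_like(coef)
        step[j] = eps
        lo, _ = _poisson_loss_and_grad(coef - step, X, y, 0.1)
        hi, _ = _poisson_loss_and_grad(coef + step, X, y, 0.1)
        # central difference should match the analytic gradient
        assert np.isclose(grad[j], (hi - lo) / (2 * eps), atol=1e-4)
    return True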