Source code for pyuoi.datasets

import numpy as np
from sklearn.utils import check_random_state

from ..utils import softmax, sigmoid


def load_swimmer(flatten=True):
    """Load the Swimmer dataset packaged with pyuoi.

    If flatten is True, the images are reshaped into an array of shape
    (256, 1024), with one flattened image per row.
    """
    from pkg_resources import resource_filename
    import h5py
    # open the packaged HDF5 file read-only and pull out the image data
    with h5py.File(resource_filename('pyuoi', 'data/Swimmer.h5'), 'r') as f:
        swimmers = f['Y'][:].astype(float)
    if flatten:
        swimmers = swimmers.T.reshape(256, 1024)
    return swimmers
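
# Illustrative sketch of how load_swimmer might be called. The helper name
# _demo_load_swimmer is hypothetical, and it assumes the Swimmer.h5 data file
# is available in the installed pyuoi distribution.
def _demo_load_swimmer():
    swimmers = load_swimmer(flatten=True)
    # with flatten=True the data come back as a (256, 1024) matrix,
    # one flattened image per row
    print(swimmers.shape)
    return swimmers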


def make_linear_regression(n_samples=100, n_features=5, n_informative=2,
                           X_loc=3., X_scale=1., snr=5.,
                           beta=None, beta_low=1., beta_high=3.,
                           include_intercept=False, random_state=None):
    """Make a linear regression dataset.

    Parameters
    ----------
    n_samples : int
        The number of samples to make.
    n_features : int
        The number of features to use.
    n_informative : int
        The number of features with non-zero weights.
    X_loc : float
        The mean of the features in the design matrix.
    X_scale : float
        The standard deviation of the features in the design matrix.
    snr : float
        The signal-to-noise ratio, which determines the variance of the
        noise term.
    beta : np.ndarray or None
        The beta values to use. If None, beta values are drawn from a
        uniform distribution.
    beta_low : float
        The lower bound for the beta values.
    beta_high : float
        The upper bound for the beta values.
    include_intercept : bool
        If True, includes an intercept in the model; if False, the intercept
        is set to 0.
    random_state : int, np.random.RandomState instance, or None
        Random number seed or state.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
        The design matrix.
    y : ndarray, shape (n_samples,)
        The response vector.
    beta : ndarray, shape (n_features,)
        The feature coefficients.
    intercept : float
        The intercept. If include_intercept is False, the intercept is zero.
    """
    rng = check_random_state(random_state)

    # create design matrix
    X = rng.normal(loc=X_loc, scale=X_scale, size=(n_samples, n_features))

    # create coefficients
    if beta is None:
        # draw beta values from a uniform distribution
        beta = rng.uniform(low=beta_low, high=beta_high, size=n_features)
        # choose sparsity mask
        zero_idx = np.zeros(n_features)
        zero_idx[:n_informative] = 1
        rng.shuffle(zero_idx)
        # randomly assign beta elements to zero
        beta = beta * zero_idx

    # create intercept
    if include_intercept:
        intercept = rng.uniform(low=beta_low, high=beta_high)
    else:
        intercept = 0

    # draw response variable
    eta = intercept + np.dot(X, beta)
    signal_var = np.var(eta)
    noise_var = signal_var / snr
    noise = rng.normal(loc=0, scale=np.sqrt(noise_var), size=eta.shape)
    y = eta + noise
    return X, y, beta, intercept
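

# Illustrative sketch: generating a sparse linear regression dataset and
# checking its sparsity. The helper name _demo_linear_regression and the
# parameter values are hypothetical choices for the example.
def _demo_linear_regression():
    X, y, beta, intercept = make_linear_regression(
        n_samples=200, n_features=10, n_informative=3, snr=5.,
        include_intercept=True, random_state=0)
    # exactly n_informative coefficients are non-zero
    assert np.count_nonzero(beta) == 3
    print(X.shape, y.shape)
    return X, y, beta, intercept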

def make_classification(n_samples=100, n_features=20, n_informative=2,
                        n_classes=2, shared_support=False, random_state=None,
                        w_scale=1., include_intercept=False):
    """Make a linear classification dataset.

    Parameters
    ----------
    n_samples : int
        The number of samples to make.
    n_features : int
        The number of features to use.
    n_informative : int
        The number of features with non-zero weights.
    n_classes : int
        The number of classes.
    shared_support : bool
        If True, all classes share the same random support. If False, each
        class has its own randomly chosen support.
    random_state : int, np.random.RandomState instance, or None
        Random number seed or state.
    w_scale : float
        The model parameter matrix, w, is drawn from a normal distribution
        with std=w_scale.
    include_intercept : bool
        If True, includes an intercept in the model; if False, the intercept
        is set to 0.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
        The design matrix.
    y : ndarray, shape (n_samples,)
        The class labels.
    w : ndarray, shape (n_classes, n_features) or (1, n_features)
        The model parameters, one row per class (a single row in the
        binary case).
    intercept : ndarray, shape (1, n_classes) or (1, 1)
        The intercept(s). If include_intercept is False, the intercept
        is zero.
    """
    rng = check_random_state(random_state)
    n_not_informative = n_features - n_informative

    # create and standardize the design matrix
    X = rng.randn(n_samples, n_features)
    X -= X.mean(axis=-1, keepdims=True)
    X /= X.std(axis=-1, keepdims=True)

    if n_classes > 2:
        w = rng.randn(n_features, n_classes)
        if include_intercept:
            intercept = rng.randn(1, n_classes)
            intercept -= intercept.max()
        else:
            intercept = np.zeros((1, n_classes))
        if n_not_informative > 0:
            if shared_support:
                idxs = rng.permutation(n_features)[:n_not_informative]
                w[idxs] = 0.
            else:
                for ii in range(n_classes):
                    idxs = rng.permutation(n_features)[:n_not_informative]
                    w[idxs, ii * np.ones_like(idxs, dtype=int)] = 0.
    else:
        w = rng.randn(n_features, 1)
        if include_intercept:
            intercept = rng.randn(1, 1)
        else:
            intercept = np.zeros((1, 1))
        if n_not_informative > 0:
            idxs = rng.permutation(n_features)[:n_not_informative]
            w[idxs] = 0.
    w *= w_scale

    log_p = X.dot(w)
    if include_intercept:
        log_p += intercept

    # draw class labels from the softmax (multiclass) or sigmoid (binary)
    if n_classes > 2:
        p = softmax(log_p)
        y = np.array([rng.multinomial(1, pi) for pi in p])
        y = y.argmax(axis=-1)
    else:
        p = sigmoid(np.squeeze(log_p))
        y = np.array([rng.binomial(1, pi) for pi in p])

    return X, y, w.T, intercept
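

# Illustrative sketch: generating a three-class dataset with a shared support.
# The helper name _demo_classification and the parameter values are
# hypothetical choices for the example.
def _demo_classification():
    X, y, w, intercept = make_classification(
        n_samples=500, n_features=20, n_informative=5, n_classes=3,
        shared_support=True, include_intercept=True, random_state=0)
    # w is returned transposed: one row of coefficients per class
    print(w.shape)       # (3, 20)
    print(np.unique(y))  # class labels present in the draw
    return X, y, w, intercept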

def make_poisson_regression(n_samples=100, n_features=5, n_informative=2,
                            X_loc=0., X_scale=1. / 8,
                            beta=None, beta_shape=1., beta_scale=3.,
                            include_intercept=False, random_state=None):
    """Make a Poisson regression dataset.

    Parameters
    ----------
    n_samples : int
        The number of samples to make.
    n_features : int
        The number of features to use.
    n_informative : int
        The number of features with non-zero weights.
    X_loc : float
        The mean of the features in the design matrix.
    X_scale : float
        The standard deviation of the features in the design matrix.
    beta : np.ndarray or None
        The beta values to use. If None, beta values are drawn from a
        gamma distribution.
    beta_shape : float
        The shape parameter for the beta values.
    beta_scale : float
        The scale parameter for the beta values.
    include_intercept : bool
        If True, includes an intercept in the model; if False, the intercept
        is set to 0.
    random_state : int, np.random.RandomState instance, or None
        Random number seed or state.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
        The design matrix.
    y : ndarray, shape (n_samples,)
        The response vector.
    beta : ndarray, shape (n_features,)
        The feature coefficients.
    intercept : float
        The intercept. If include_intercept is False, the intercept is zero.
    """
    rng = check_random_state(random_state)

    # create design matrix
    X = rng.normal(loc=X_loc, scale=X_scale, size=(n_samples, n_features))

    # create coefficients
    if beta is None:
        # draw beta values from a gamma distribution
        beta = rng.gamma(shape=beta_shape, scale=beta_scale, size=n_features)
        # choose sparsity mask
        zero_idx = np.zeros(n_features)
        zero_idx[:n_informative] = 1
        rng.shuffle(zero_idx)
        # randomly assign beta elements to zero
        beta = beta * zero_idx

    # create intercept
    if include_intercept:
        intercept = rng.gamma(shape=beta_shape, scale=beta_scale)
    else:
        intercept = 0

    # draw response variable
    eta = intercept + np.dot(X, beta)
    y = rng.poisson(np.exp(eta))
    return X, y, beta, intercept
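

# Illustrative sketch: generating Poisson count data and inspecting the
# response. The helper name _demo_poisson_regression and the parameter values
# are hypothetical choices for the example.
def _demo_poisson_regression():
    X, y, beta, intercept = make_poisson_regression(
        n_samples=200, n_features=5, n_informative=2, random_state=0)
    # counts are drawn from Poisson(exp(X @ beta + intercept)),
    # so y is non-negative integer valued
    assert y.min() >= 0
    print(y[:10])
    return X, y, beta, intercept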