author    | Christopher Baines <mail@cbaines.net> | 2015-12-13 16:20:50 +0000
committer | Christopher Baines <mail@cbaines.net> | 2015-12-13 16:20:50 +0000
commit    | 31d70519b84ea5d4b6df194d6f251ace6bc74ffc
tree      | 25561e8ac7b2faa9dc3a7a72a224050f1d74f99f
parent    | 147d916d9cc641d496b8bbb32b7db99701038491
download  | sklearn-pandas-upstream/1.1.0.tar, sklearn-pandas-upstream/1.1.0.tar.gz
Imported Upstream version 1.1.0 (tag upstream/1.1.0, branch upstream)
-rw-r--r-- | PKG-INFO                             |   2
-rw-r--r-- | README.rst                           |  47
-rw-r--r-- | sklearn_pandas.egg-info/PKG-INFO     |   2
-rw-r--r-- | sklearn_pandas.egg-info/SOURCES.txt  |   3
-rw-r--r-- | sklearn_pandas.egg-info/requires.txt |   2
-rw-r--r-- | sklearn_pandas/__init__.py           | 161
-rw-r--r-- | sklearn_pandas/cross_validation.py   |  37
-rw-r--r-- | sklearn_pandas/dataframe_mapper.py   | 132
-rw-r--r-- | sklearn_pandas/pipeline.py           |  64
9 files changed, 283 insertions, 167 deletions
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: sklearn-pandas
-Version: 0.0.12
+Version: 1.1.0
 Summary: Pandas integration with sklearn
 Home-page: https://github.com/paulgb/sklearn-pandas
 Author: Israel Saeta Pérez
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -6,8 +6,8 @@ This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/sta
 
 In particular, it provides:
 
-1. a way to map DataFrame columns to transformations, which are later recombined into features
-2. a way to cross-validate a pipeline that takes a pandas DataFrame as input.
+1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
+2. A way to cross-validate a pipeline that takes a pandas ``DataFrame`` as input.
 
 Installation
 ------------
@@ -32,7 +32,7 @@ Import
 Import what you need from the ``sklearn_pandas`` package. The choices are:
 
 * ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
-* ``cross_val_score``, similar to `sklearn.cross_validation.cross_val_score` but working on pandas DataFrames
+* ``cross_val_score``, similar to ``sklearn.cross_validation.cross_val_score`` but working on pandas DataFrames
 
 For this demonstration, we will import both::
 
@@ -44,6 +44,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
     >>> import numpy as np
     >>> import sklearn.preprocessing, sklearn.decomposition, \
     ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics
+    >>> from sklearn.feature_extraction.text import CountVectorizer
 
 Load some Data
 **************
@@ -67,16 +68,16 @@ The mapper takes a list of pairs. The first is a column name from the pandas Dat
     ...     (['children'], sklearn.preprocessing.StandardScaler())
     ... ])
 
-The difference between specifying the column selector as `'column'` (as a simple string) and `['column']` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one dimensional array with be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
+The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one dimensional array with be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
 
-This behaviour mimics the same pattern as pandas' dataframes `__getitem__` indexing:
+This behaviour mimics the same pattern as pandas' dataframes ``__getitem__`` indexing:
 
     >>> data['children'].shape
    (8,)
     >>> data[['children']].shape
     (8, 1)
 
-Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like `OneHotEncoder` or `Imputer`, expect 2-dimensional input, with the shape `[n_samples, n_features]`.
+Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like ``OneHotEncoder`` or ``Imputer``, expect 2-dimensional input, with the shape ``[n_samples, n_features]``.
 
 Test the Transformation
 ***********************
@@ -156,6 +157,20 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
            [  1.,   0.,   0.,   5.],
            [  0.,   0.,   1.,   4.]])
 
+
+Working with sparse features
+****************************
+
+``DataFrameMapper``s will return a dense feature array by default. Setting ``sparse=True`` in the mapper will return a sparse array whenever any of the extracted features is sparse. Example:
+
+    >>> mapper4 = DataFrameMapper([
+    ...     ('pet', CountVectorizer()),
+    ... ], sparse=True)
+    >>> type(mapper4.fit_transform(data))
+    <class 'scipy.sparse.csr.csr_matrix'>
+
+The stacking of the sparse features is done without ever densifying them.
+
 Cross-Validation
 ----------------
 
@@ -175,6 +190,25 @@ Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface
 
 Changelog
 ---------
 
+1.1.0 (2015-12-06)
+*******************
+
+* Delete obsolete ``PassThroughTransformer``. If no transformation is desired for a given column, use ``None`` as transformer.
+* Factor out code in several modules, to avoid having everything in ``__init__.py``.
+* Use custom ``TransformerPipeline`` class to allow transformation steps accepting only a X argument. Fixes #46.
+* Add compatibility shim for unpickling mappers with list of transformers created before 1.0.0. Fixes #45.
+
+
+1.0.0 (2015-11-28)
+*******************
+
+* Change version numbering scheme to SemVer.
+* Use ``sklearn.pipeline.Pipeline`` instead of copying its code. Resolves #43.
+* Raise ``KeyError`` when selecting unexistent columns in the dataframe. Fixes #30.
+* Return sparse feature array if any of the features is sparse and ``sparse`` argument is ``True``. Defaults to ``False`` to avoid potential breaking of existing code. Resolves #34.
+* Return model and prediction in custom CV classes. Fixes #27.
+
+
 0.0.12 (2015-11-07)
 ********************
@@ -191,4 +225,5 @@ Other contributors:
 * Paul Butler
 * Cal Paterson
 * Israel Saeta Pérez
+* Zac Stewart
 * Olivier Grisel
diff --git a/sklearn_pandas.egg-info/PKG-INFO b/sklearn_pandas.egg-info/PKG-INFO
index c95774a..72b5b59 100644
--- a/sklearn_pandas.egg-info/PKG-INFO
+++ b/sklearn_pandas.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: sklearn-pandas
-Version: 0.0.12
+Version: 1.1.0
 Summary: Pandas integration with sklearn
 Home-page: https://github.com/paulgb/sklearn-pandas
 Author: Israel Saeta Pérez
diff --git a/sklearn_pandas.egg-info/SOURCES.txt b/sklearn_pandas.egg-info/SOURCES.txt
index 107a3a6..f0d23d2 100644
--- a/sklearn_pandas.egg-info/SOURCES.txt
+++ b/sklearn_pandas.egg-info/SOURCES.txt
@@ -4,6 +4,9 @@ README.rst
 setup.cfg
 setup.py
 sklearn_pandas/__init__.py
+sklearn_pandas/cross_validation.py
+sklearn_pandas/dataframe_mapper.py
+sklearn_pandas/pipeline.py
 sklearn_pandas.egg-info/PKG-INFO
 sklearn_pandas.egg-info/SOURCES.txt
 sklearn_pandas.egg-info/dependency_links.txt
diff --git a/sklearn_pandas.egg-info/requires.txt b/sklearn_pandas.egg-info/requires.txt
index e8d13b2..1cece5e 100644
--- a/sklearn_pandas.egg-info/requires.txt
+++ b/sklearn_pandas.egg-info/requires.txt
@@ -1,4 +1,4 @@
 scikit-learn>=0.13
 scipy>=0.14
 pandas>=0.11.0
-numpy>=1.6.1
+numpy>=1.6.1
\ No newline at end of file
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 0f5d94c..537ab56 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -1,159 +1,4 @@
-__version__ = '0.0.12'
+__version__ = '1.1.0'
 
-import numpy as np
-import pandas as pd
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn import cross_validation
-from sklearn import grid_search
-import sys
-
-# load in the correct stringtype: str for py3, basestring for py2
-string_types = str if sys.version_info >= (3, 0) else basestring
-
-
-def cross_val_score(model, X, *args, **kwargs):
-    X = DataWrapper(X)
-    return cross_validation.cross_val_score(model, X, *args, **kwargs)
-
-
-class GridSearchCV(grid_search.GridSearchCV):
-    def fit(self, X, *params, **kwparams):
-        super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-    def predict(self, X, *params, **kwparams):
-        super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-
-try:
-    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
-        def fit(self, X, *params, **kwparams):
-            super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-        def predict(self, X, *params, **kwparams):
-            super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-except AttributeError:
-    pass
-
-
-class DataWrapper(object):
-    def __init__(self, df):
-        self.df = df
-
-    def __len__(self):
-        return len(self.df)
-
-    def __getitem__(self, key):
-        return self.df.iloc[key]
-
-
-class PassthroughTransformer(TransformerMixin):
-    def fit(self, X, y=None, **fit_params):
-        return self
-
-    def transform(self, X):
-        return np.array(X).astype(np.float)
-
-
-def _handle_feature(fea):
-    if hasattr(fea, 'toarray'):
-        # sparse arrays should be converted to regular arrays
-        # for hstack.
-        fea = fea.toarray()
-
-    if len(fea.shape) == 1:
-        fea = np.array([fea]).T
-
-    return fea
-
-
-class DataFrameMapper(BaseEstimator, TransformerMixin):
-    """
-    Map Pandas data frame column subsets to their own
-    sklearn transformation.
-    """
-
-    def __init__(self, features):
-        """
-        Params:
-
-        features    a list of pairs. The first element is the pandas column
-                    selector. This can be a string (for one column) or a list
-                    of strings. The second element is an object that supports
-                    sklearn's transform interface.
-        """
-        self.features = features
-
-    def _get_col_subset(self, X, cols):
-        """
-        Get a subset of columns from the given table X.
-
-        X       a Pandas dataframe; the table to select columns from
-        cols    a string or list of strings representing the columns
-                to select
-
-        Returns a numpy array with the data from the selected columns
-        """
-        return_vector = False
-        if isinstance(cols, string_types):
-            return_vector = True
-            cols = [cols]
-
-        if isinstance(X, list):
-            X = [x[cols] for x in X]
-            X = pd.DataFrame(X)
-
-        elif isinstance(X, DataWrapper):
-            # if it's a datawrapper, unwrap it
-            X = X.df
-
-        if return_vector:
-            t = X[cols[0]].values
-        else:
-            t = X.as_matrix(cols)
-
-        return t
-
-    def fit(self, X, y=None):
-        """
-        Fit a transformation from the pipeline
-
-        X       the data to fit
-        """
-        for columns, transformers in self.features:
-            if transformers is not None:
-                if isinstance(transformers, list):
-                    # first fit_transform all transformers except the last one
-                    Xt = self._get_col_subset(X, columns)
-                    for transformer in transformers[:-1]:
-                        Xt = transformer.fit_transform(Xt)
-                    # then fit the last one without transformation
-                    transformers[-1].fit(Xt)
-                else:
-                    transformers.fit(self._get_col_subset(X, columns))
-        return self
-
-    def transform(self, X):
-        """
-        Transform the given data. Assumes that fit has already been called.
-
-        X       the data to transform
-        """
-        extracted = []
-        for columns, transformers in self.features:
-            # columns could be a string or list of
-            # strings; we don't care because pandas
-            # will handle either.
-            Xt = self._get_col_subset(X, columns)
-            if transformers is not None:
-                if isinstance(transformers, list):
-                    for transformer in transformers:
-                        Xt = transformer.transform(Xt)
-                else:
-                    Xt = transformers.transform(Xt)
-            extracted.append(_handle_feature(Xt))
-
-        # combine the feature outputs into one array.
-        # at this point we lose track of which features
-        # were created from which input columns, so it's
-        # assumed that that doesn't matter to the model.
-        return np.hstack(extracted)
+from .dataframe_mapper import DataFrameMapper  # NOQA
+from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
diff --git a/sklearn_pandas/cross_validation.py b/sklearn_pandas/cross_validation.py
new file mode 100644
index 0000000..9cd8cbe
--- /dev/null
+++ b/sklearn_pandas/cross_validation.py
@@ -0,0 +1,37 @@
+from sklearn import cross_validation
+from sklearn import grid_search
+
+
+def cross_val_score(model, X, *args, **kwargs):
+    X = DataWrapper(X)
+    return cross_validation.cross_val_score(model, X, *args, **kwargs)
+
+
+class GridSearchCV(grid_search.GridSearchCV):
+    def fit(self, X, *params, **kwparams):
+        return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
+
+    def predict(self, X, *params, **kwparams):
+        return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
+
+
+try:
+    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
+        def fit(self, X, *params, **kwparams):
+            return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
+
+        def predict(self, X, *params, **kwparams):
+            return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
+except AttributeError:
+    pass
+
+
+class DataWrapper(object):
+    def __init__(self, df):
+        self.df = df
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, key):
+        return self.df.iloc[key]
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
new file mode 100644
index 0000000..9a59f6d
--- /dev/null
+++ b/sklearn_pandas/dataframe_mapper.py
@@ -0,0 +1,132 @@
+import sys
+import pandas as pd
+import numpy as np
+from scipy import sparse
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from .cross_validation import DataWrapper
+from .pipeline import make_transformer_pipeline
+
+# load in the correct stringtype: str for py3, basestring for py2
+string_types = str if sys.version_info >= (3, 0) else basestring
+
+
+def _handle_feature(fea):
+    """
+    Convert 1-dimensional arrays to 2-dimensional column vectors.
+    """
+    if len(fea.shape) == 1:
+        fea = np.array([fea]).T
+
+    return fea
+
+
+def _build_transformer(transformers):
+    if isinstance(transformers, list):
+        transformers = make_transformer_pipeline(*transformers)
+    return transformers
+
+
+class DataFrameMapper(BaseEstimator, TransformerMixin):
+    """
+    Map Pandas data frame column subsets to their own
+    sklearn transformation.
+    """
+
+    def __init__(self, features, sparse=False):
+        """
+        Params:
+
+        features    a list of pairs. The first element is the pandas column
+                    selector. This can be a string (for one column) or a list
+                    of strings. The second element is an object that supports
+                    sklearn's transform interface, or a list of such objects.
+        sparse      will return sparse matrix if set True and any of the
+                    extracted features is sparse. Defaults to False.
+        """
+        if isinstance(features, list):
+            features = [(columns, _build_transformer(transformers))
+                        for (columns, transformers) in features]
+        self.features = features
+        self.sparse = sparse
+
+    def __setstate__(self, state):
+        # compatibility shim for pickles created with sklearn-pandas<1.0.0
+        self.features = [(columns, _build_transformer(transformers))
+                         for (columns, transformers) in state['features']]
+        self.sparse = state.get('sparse', False)
+
+    def _get_col_subset(self, X, cols):
+        """
+        Get a subset of columns from the given table X.
+
+        X       a Pandas dataframe; the table to select columns from
+        cols    a string or list of strings representing the columns
+                to select
+
+        Returns a numpy array with the data from the selected columns
+        """
+        return_vector = False
+        if isinstance(cols, string_types):
+            return_vector = True
+            cols = [cols]
+
+        if isinstance(X, list):
+            X = [x[cols] for x in X]
+            X = pd.DataFrame(X)
+
+        elif isinstance(X, DataWrapper):
+            # if it's a datawrapper, unwrap it
+            X = X.df
+
+        if return_vector:
+            t = X[cols[0]].values
+        else:
+            t = X[cols].values
+
+        return t
+
+    def fit(self, X, y=None):
+        """
+        Fit a transformation from the pipeline
+
+        X       the data to fit
+        """
+        for columns, transformers in self.features:
+            if transformers is not None:
+                transformers.fit(self._get_col_subset(X, columns))
+        return self
+
+    def transform(self, X):
+        """
+        Transform the given data. Assumes that fit has already been called.
+
+        X       the data to transform
+        """
+        extracted = []
+        for columns, transformers in self.features:
+            # columns could be a string or list of
+            # strings; we don't care because pandas
+            # will handle either.
+            Xt = self._get_col_subset(X, columns)
+            if transformers is not None:
+                Xt = transformers.transform(Xt)
+            extracted.append(_handle_feature(Xt))
+
+        # combine the feature outputs into one array.
+        # at this point we lose track of which features
+        # were created from which input columns, so it's
+        # assumed that that doesn't matter to the model.
+
+        # If any of the extracted features is sparse, combine sparsely.
+        # Otherwise, combine as normal arrays.
+        if any(sparse.issparse(fea) for fea in extracted):
+            stacked = sparse.hstack(extracted).tocsr()
+            # return a sparse matrix only if the mapper was initialized
+            # with sparse=True
+            if not self.sparse:
+                stacked = stacked.toarray()
+        else:
+            stacked = np.hstack(extracted)
+
+        return stacked
diff --git a/sklearn_pandas/pipeline.py b/sklearn_pandas/pipeline.py
new file mode 100644
index 0000000..04cb053
--- /dev/null
+++ b/sklearn_pandas/pipeline.py
@@ -0,0 +1,64 @@
+import six
+from sklearn.pipeline import _name_estimators, Pipeline
+from sklearn.utils import tosequence
+
+
+class TransformerPipeline(Pipeline):
+    """
+    Pipeline that expects all steps to be transformers taking a single argument
+    and having fit and transform methods.
+
+    Code is copied from sklearn's Pipeline, leaving out the `y=None` argument.
+    """
+    def __init__(self, steps):
+        names, estimators = zip(*steps)
+        if len(dict(steps)) != len(steps):
+            raise ValueError("Provided step names are not unique: %s" % (names,))
+
+        # shallow copy of steps
+        self.steps = tosequence(steps)
+        estimator = estimators[-1]
+
+        for e in estimators:
+            if (not (hasattr(e, "fit") or hasattr(e, "fit_transform")) or not
+                    hasattr(e, "transform")):
+                raise TypeError("All steps of the chain should "
+                                "be transforms and implement fit and transform"
+                                " '%s' (type %s) doesn't)" % (e, type(e)))
+
+        if not hasattr(estimator, "fit"):
+            raise TypeError("Last step of chain should implement fit "
+                            "'%s' (type %s) doesn't)"
+                            % (estimator, type(estimator)))
+
+    def _pre_transform(self, X, **fit_params):
+        fit_params_steps = dict((step, {}) for step, _ in self.steps)
+        for pname, pval in six.iteritems(fit_params):
+            step, param = pname.split('__', 1)
+            fit_params_steps[step][param] = pval
+        Xt = X
+        for name, transform in self.steps[:-1]:
+            if hasattr(transform, "fit_transform"):
+                Xt = transform.fit_transform(Xt, **fit_params_steps[name])
+            else:
+                Xt = transform.fit(Xt, **fit_params_steps[name]) \
+                              .transform(Xt)
+        return Xt, fit_params_steps[self.steps[-1][0]]
+
+    def fit(self, X, **fit_params):
+        Xt, fit_params = self._pre_transform(X, **fit_params)
+        self.steps[-1][-1].fit(Xt, **fit_params)
+        return self
+
+    def fit_transform(self, X, **fit_params):
+        Xt, fit_params = self._pre_transform(X, **fit_params)
+        if hasattr(self.steps[-1][-1], 'fit_transform'):
+            return self.steps[-1][-1].fit_transform(Xt, **fit_params)
+        else:
+            return self.steps[-1][-1].fit(Xt, **fit_params).transform(Xt)
+
+
+def make_transformer_pipeline(*steps):
+    """Construct a TransformerPipeline from the given estimators.
+    """
+    return TransformerPipeline(_name_estimators(steps))