about | summary | refs | log | tree | commit | diff
path: root/sklearn_pandas
diff options
context:
space:
mode:
author Christopher Baines <mail@cbaines.net> 2015-12-13 16:20:50 +0000
committer Christopher Baines <mail@cbaines.net> 2015-12-13 16:20:50 +0000
commit 31d70519b84ea5d4b6df194d6f251ace6bc74ffc (patch)
tree 25561e8ac7b2faa9dc3a7a72a224050f1d74f99f /sklearn_pandas
parent 147d916d9cc641d496b8bbb32b7db99701038491 (diff)
download sklearn-pandas-31d70519b84ea5d4b6df194d6f251ace6bc74ffc.tar
sklearn-pandas-31d70519b84ea5d4b6df194d6f251ace6bc74ffc.tar.gz
Imported Upstream version 1.1.0 (refs: upstream/1.1.0, upstream)
Diffstat (limited to 'sklearn_pandas')
-rw-r--r-- sklearn_pandas/__init__.py 161
-rw-r--r-- sklearn_pandas/cross_validation.py 37
-rw-r--r-- sklearn_pandas/dataframe_mapper.py 132
-rw-r--r-- sklearn_pandas/pipeline.py 64
4 files changed, 236 insertions, 158 deletions
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 0f5d94c..537ab56 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -1,159 +1,4 @@
-__version__ = '0.0.12'
+__version__ = '1.1.0'
-import numpy as np
-import pandas as pd
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn import cross_validation
-from sklearn import grid_search
-import sys
-
-# load in the correct stringtype: str for py3, basestring for py2
-string_types = str if sys.version_info >= (3, 0) else basestring
-
-
-def cross_val_score(model, X, *args, **kwargs):
- X = DataWrapper(X)
- return cross_validation.cross_val_score(model, X, *args, **kwargs)
-
-
-class GridSearchCV(grid_search.GridSearchCV):
- def fit(self, X, *params, **kwparams):
- super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
- def predict(self, X, *params, **kwparams):
- super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-
-try:
- class RandomizedSearchCV(grid_search.RandomizedSearchCV):
- def fit(self, X, *params, **kwparams):
- super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
- def predict(self, X, *params, **kwparams):
- super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-except AttributeError:
- pass
-
-
-class DataWrapper(object):
- def __init__(self, df):
- self.df = df
-
- def __len__(self):
- return len(self.df)
-
- def __getitem__(self, key):
- return self.df.iloc[key]
-
-
-class PassthroughTransformer(TransformerMixin):
- def fit(self, X, y=None, **fit_params):
- return self
-
- def transform(self, X):
- return np.array(X).astype(np.float)
-
-
-def _handle_feature(fea):
- if hasattr(fea, 'toarray'):
- # sparse arrays should be converted to regular arrays
- # for hstack.
- fea = fea.toarray()
-
- if len(fea.shape) == 1:
- fea = np.array([fea]).T
-
- return fea
-
-
-class DataFrameMapper(BaseEstimator, TransformerMixin):
- """
- Map Pandas data frame column subsets to their own
- sklearn transformation.
- """
-
- def __init__(self, features):
- """
- Params:
-
- features a list of pairs. The first element is the pandas column
- selector. This can be a string (for one column) or a list
- of strings. The second element is an object that supports
- sklearn's transform interface.
- """
- self.features = features
-
- def _get_col_subset(self, X, cols):
- """
- Get a subset of columns from the given table X.
-
- X a Pandas dataframe; the table to select columns from
- cols a string or list of strings representing the columns
- to select
-
- Returns a numpy array with the data from the selected columns
- """
- return_vector = False
- if isinstance(cols, string_types):
- return_vector = True
- cols = [cols]
-
- if isinstance(X, list):
- X = [x[cols] for x in X]
- X = pd.DataFrame(X)
-
- elif isinstance(X, DataWrapper):
- # if it's a datawrapper, unwrap it
- X = X.df
-
- if return_vector:
- t = X[cols[0]].values
- else:
- t = X.as_matrix(cols)
-
- return t
-
- def fit(self, X, y=None):
- """
- Fit a transformation from the pipeline
-
- X the data to fit
- """
- for columns, transformers in self.features:
- if transformers is not None:
- if isinstance(transformers, list):
- # first fit_transform all transformers except the last one
- Xt = self._get_col_subset(X, columns)
- for transformer in transformers[:-1]:
- Xt = transformer.fit_transform(Xt)
- # then fit the last one without transformation
- transformers[-1].fit(Xt)
- else:
- transformers.fit(self._get_col_subset(X, columns))
- return self
-
- def transform(self, X):
- """
- Transform the given data. Assumes that fit has already been called.
-
- X the data to transform
- """
- extracted = []
- for columns, transformers in self.features:
- # columns could be a string or list of
- # strings; we don't care because pandas
- # will handle either.
- Xt = self._get_col_subset(X, columns)
- if transformers is not None:
- if isinstance(transformers, list):
- for transformer in transformers:
- Xt = transformer.transform(Xt)
- else:
- Xt = transformers.transform(Xt)
- extracted.append(_handle_feature(Xt))
-
- # combine the feature outputs into one array.
- # at this point we lose track of which features
- # were created from which input columns, so it's
- # assumed that that doesn't matter to the model.
- return np.hstack(extracted)
+from .dataframe_mapper import DataFrameMapper # NOQA
+from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
diff --git a/sklearn_pandas/cross_validation.py b/sklearn_pandas/cross_validation.py
new file mode 100644
index 0000000..9cd8cbe
--- /dev/null
+++ b/sklearn_pandas/cross_validation.py
@@ -0,0 +1,37 @@
+from sklearn import cross_validation
+from sklearn import grid_search
+
+
+def cross_val_score(model, X, *args, **kwargs):
+ X = DataWrapper(X)
+ return cross_validation.cross_val_score(model, X, *args, **kwargs)
+
+
+class GridSearchCV(grid_search.GridSearchCV):
+ def fit(self, X, *params, **kwparams):
+ return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
+
+ def predict(self, X, *params, **kwparams):
+ return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
+
+
+try:
+ class RandomizedSearchCV(grid_search.RandomizedSearchCV):
+ def fit(self, X, *params, **kwparams):
+ return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
+
+ def predict(self, X, *params, **kwparams):
+ return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
+except AttributeError:
+ pass
+
+
+class DataWrapper(object):
+ def __init__(self, df):
+ self.df = df
+
+ def __len__(self):
+ return len(self.df)
+
+ def __getitem__(self, key):
+ return self.df.iloc[key]
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
new file mode 100644
index 0000000..9a59f6d
--- /dev/null
+++ b/sklearn_pandas/dataframe_mapper.py
@@ -0,0 +1,132 @@
+import sys
+import pandas as pd
+import numpy as np
+from scipy import sparse
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from .cross_validation import DataWrapper
+from .pipeline import make_transformer_pipeline
+
+# load in the correct stringtype: str for py3, basestring for py2
+string_types = str if sys.version_info >= (3, 0) else basestring
+
+
+def _handle_feature(fea):
+ """
+ Convert 1-dimensional arrays to 2-dimensional column vectors.
+ """
+ if len(fea.shape) == 1:
+ fea = np.array([fea]).T
+
+ return fea
+
+
+def _build_transformer(transformers):
+ if isinstance(transformers, list):
+ transformers = make_transformer_pipeline(*transformers)
+ return transformers
+
+
+class DataFrameMapper(BaseEstimator, TransformerMixin):
+ """
+ Map Pandas data frame column subsets to their own
+ sklearn transformation.
+ """
+
+ def __init__(self, features, sparse=False):
+ """
+ Params:
+
+ features a list of pairs. The first element is the pandas column
+ selector. This can be a string (for one column) or a list
+ of strings. The second element is an object that supports
+ sklearn's transform interface, or a list of such objects.
+ sparse will return sparse matrix if set True and any of the
+ extracted features is sparse. Defaults to False.
+ """
+ if isinstance(features, list):
+ features = [(columns, _build_transformer(transformers))
+ for (columns, transformers) in features]
+ self.features = features
+ self.sparse = sparse
+
+ def __setstate__(self, state):
+ # compatibility shim for pickles created with sklearn-pandas<1.0.0
+ self.features = [(columns, _build_transformer(transformers))
+ for (columns, transformers) in state['features']]
+ self.sparse = state.get('sparse', False)
+
+ def _get_col_subset(self, X, cols):
+ """
+ Get a subset of columns from the given table X.
+
+ X a Pandas dataframe; the table to select columns from
+ cols a string or list of strings representing the columns
+ to select
+
+ Returns a numpy array with the data from the selected columns
+ """
+ return_vector = False
+ if isinstance(cols, string_types):
+ return_vector = True
+ cols = [cols]
+
+ if isinstance(X, list):
+ X = [x[cols] for x in X]
+ X = pd.DataFrame(X)
+
+ elif isinstance(X, DataWrapper):
+ # if it's a datawrapper, unwrap it
+ X = X.df
+
+ if return_vector:
+ t = X[cols[0]].values
+ else:
+ t = X[cols].values
+
+ return t
+
+ def fit(self, X, y=None):
+ """
+ Fit a transformation from the pipeline
+
+ X the data to fit
+ """
+ for columns, transformers in self.features:
+ if transformers is not None:
+ transformers.fit(self._get_col_subset(X, columns))
+ return self
+
+ def transform(self, X):
+ """
+ Transform the given data. Assumes that fit has already been called.
+
+ X the data to transform
+ """
+ extracted = []
+ for columns, transformers in self.features:
+ # columns could be a string or list of
+ # strings; we don't care because pandas
+ # will handle either.
+ Xt = self._get_col_subset(X, columns)
+ if transformers is not None:
+ Xt = transformers.transform(Xt)
+ extracted.append(_handle_feature(Xt))
+
+ # combine the feature outputs into one array.
+ # at this point we lose track of which features
+ # were created from which input columns, so it's
+ # assumed that that doesn't matter to the model.
+
+ # If any of the extracted features is sparse, combine sparsely.
+ # Otherwise, combine as normal arrays.
+ if any(sparse.issparse(fea) for fea in extracted):
+ stacked = sparse.hstack(extracted).tocsr()
+ # return a sparse matrix only if the mapper was initialized
+ # with sparse=True
+ if not self.sparse:
+ stacked = stacked.toarray()
+ else:
+ stacked = np.hstack(extracted)
+
+ return stacked
diff --git a/sklearn_pandas/pipeline.py b/sklearn_pandas/pipeline.py
new file mode 100644
index 0000000..04cb053
--- /dev/null
+++ b/sklearn_pandas/pipeline.py
@@ -0,0 +1,64 @@
+import six
+from sklearn.pipeline import _name_estimators, Pipeline
+from sklearn.utils import tosequence
+
+
+class TransformerPipeline(Pipeline):
+ """
+ Pipeline that expects all steps to be transformers taking a single argument
+ and having fit and transform methods.
+
+ Code is copied from sklearn's Pipeline, leaving out the `y=None` argument.
+ """
+ def __init__(self, steps):
+ names, estimators = zip(*steps)
+ if len(dict(steps)) != len(steps):
+ raise ValueError("Provided step names are not unique: %s" % (names,))
+
+ # shallow copy of steps
+ self.steps = tosequence(steps)
+ estimator = estimators[-1]
+
+ for e in estimators:
+ if (not (hasattr(e, "fit") or hasattr(e, "fit_transform")) or not
+ hasattr(e, "transform")):
+ raise TypeError("All steps of the chain should "
+ "be transforms and implement fit and transform"
+ " '%s' (type %s) doesn't)" % (e, type(e)))
+
+ if not hasattr(estimator, "fit"):
+ raise TypeError("Last step of chain should implement fit "
+ "'%s' (type %s) doesn't)"
+ % (estimator, type(estimator)))
+
+ def _pre_transform(self, X, **fit_params):
+ fit_params_steps = dict((step, {}) for step, _ in self.steps)
+ for pname, pval in six.iteritems(fit_params):
+ step, param = pname.split('__', 1)
+ fit_params_steps[step][param] = pval
+ Xt = X
+ for name, transform in self.steps[:-1]:
+ if hasattr(transform, "fit_transform"):
+ Xt = transform.fit_transform(Xt, **fit_params_steps[name])
+ else:
+ Xt = transform.fit(Xt, **fit_params_steps[name]) \
+ .transform(Xt)
+ return Xt, fit_params_steps[self.steps[-1][0]]
+
+ def fit(self, X, **fit_params):
+ Xt, fit_params = self._pre_transform(X, **fit_params)
+ self.steps[-1][-1].fit(Xt, **fit_params)
+ return self
+
+ def fit_transform(self, X, **fit_params):
+ Xt, fit_params = self._pre_transform(X, **fit_params)
+ if hasattr(self.steps[-1][-1], 'fit_transform'):
+ return self.steps[-1][-1].fit_transform(Xt, **fit_params)
+ else:
+ return self.steps[-1][-1].fit(Xt, **fit_params).transform(Xt)
+
+
+def make_transformer_pipeline(*steps):
+ """Construct a TransformerPipeline from the given estimators.
+ """
+ return TransformerPipeline(_name_estimators(steps))