diff options
author | Christopher Baines <chris@dheneb.cbaines.net> | 2015-11-22 16:43:24 +0000 |
---|---|---|
committer | Christopher Baines <chris@dheneb.cbaines.net> | 2015-11-22 16:45:16 +0000 |
commit | 147d916d9cc641d496b8bbb32b7db99701038491 (patch) | |
tree | adace5c67cf71210a14cdbcb2a979e4865272257 /sklearn_pandas | |
download | sklearn-pandas-147d916d9cc641d496b8bbb32b7db99701038491.tar sklearn-pandas-147d916d9cc641d496b8bbb32b7db99701038491.tar.gz |
Imported Upstream version 0.0.12 upstream/0.0.12
Diffstat (limited to 'sklearn_pandas')
-rw-r--r-- | sklearn_pandas/__init__.py | 159 |
1 files changed, 159 insertions, 0 deletions
# Reconstructed from: diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
# (new file mode 100644, index 0000000..0f5d94c, hunk @@ -0,0 +1,159 @@)
__version__ = '0.0.12'

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import cross_validation
from sklearn import grid_search
import sys

# load in the correct stringtype: str for py3, basestring for py2
string_types = str if sys.version_info >= (3, 0) else basestring


def cross_val_score(model, X, *args, **kwargs):
    """
    Run sklearn's cross_val_score with X wrapped in a DataWrapper.

    model   the estimator to evaluate
    X       the data; wrapped in a DataWrapper before being passed on

    Any further positional/keyword arguments are forwarded unchanged.
    Returns whatever cross_validation.cross_val_score returns.
    """
    X = DataWrapper(X)
    return cross_validation.cross_val_score(model, X, *args, **kwargs)


class GridSearchCV(grid_search.GridSearchCV):
    """
    GridSearchCV variant that wraps its input in a DataWrapper before
    delegating to sklearn's implementation.
    """

    def fit(self, X, *params, **kwparams):
        """
        Fit the search on X (wrapped in a DataWrapper).

        Returns the result of the parent's fit so calls can be chained.
        """
        return super(GridSearchCV, self).fit(
            DataWrapper(X), *params, **kwparams)

    def predict(self, X, *params, **kwparams):
        """
        Predict for X (wrapped in a DataWrapper) and return the result.
        """
        # Delegate to the parent's predict; calling fit here would re-run
        # the whole search and return no predictions.
        return super(GridSearchCV, self).predict(
            DataWrapper(X), *params, **kwparams)


# grid_search.RandomizedSearchCV may be missing; in that case the
# AttributeError raised while evaluating the base class is swallowed and
# the wrapper is simply not defined.
try:
    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
        """
        RandomizedSearchCV variant that wraps its input in a DataWrapper
        before delegating to sklearn's implementation.
        """

        def fit(self, X, *params, **kwparams):
            """
            Fit the search on X (wrapped in a DataWrapper).

            Returns the result of the parent's fit so calls can be chained.
            """
            return super(RandomizedSearchCV, self).fit(
                DataWrapper(X), *params, **kwparams)

        def predict(self, X, *params, **kwparams):
            """
            Predict for X (wrapped in a DataWrapper) and return the result.
            """
            # Delegate to the parent's predict, not fit.
            return super(RandomizedSearchCV, self).predict(
                DataWrapper(X), *params, **kwparams)
except AttributeError:
    pass


class DataWrapper(object):
    """
    Thin wrapper exposing len() and positional indexing (via .iloc) on the
    wrapped object, which is kept in the `df` attribute.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, key):
        # positional (iloc) selection, not label-based
        return self.df.iloc[key]


class PassthroughTransformer(TransformerMixin):
    """
    Transformer that learns nothing and returns its input as a float array.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns self."""
        return self

    def transform(self, X):
        """Return X converted to a numpy array of floats."""
        # builtin float is what the (deprecated) np.float alias pointed to
        return np.array(X).astype(float)


def _handle_feature(fea):
    """
    Normalise a transformer output so it can be passed to np.hstack.

    fea     the feature output; anything with a `toarray` method is
            densified, and 1-dimensional input becomes a single column

    Returns the (possibly converted) feature array.
    """
    if hasattr(fea, 'toarray'):
        # sparse arrays should be converted to regular arrays
        # for hstack.
        fea = fea.toarray()

    if len(fea.shape) == 1:
        # turn a flat vector into an (n, 1) column
        fea = np.array([fea]).T

    return fea


class DataFrameMapper(BaseEstimator, TransformerMixin):
    """
    Map Pandas data frame column subsets to their own
    sklearn transformation.
    """

    def __init__(self, features):
        """
        Params:

        features    a list of pairs. The first element is the pandas column
                    selector. This can be a string (for one column) or a list
                    of strings. The second element is an object that supports
                    sklearn's transform interface.
        """
        self.features = features

    def _get_col_subset(self, X, cols):
        """
        Get a subset of columns from the given table X.

        X       a Pandas dataframe; the table to select columns from
        cols    a string or list of strings representing the columns
                to select

        Returns a numpy array with the data from the selected columns
        """
        return_vector = False
        if isinstance(cols, string_types):
            # a single column name yields a 1-d vector rather than a matrix
            return_vector = True
            cols = [cols]

        if isinstance(X, list):
            X = [x[cols] for x in X]
            X = pd.DataFrame(X)

        elif isinstance(X, DataWrapper):
            # if it's a datawrapper, unwrap it
            X = X.df

        if return_vector:
            t = X[cols[0]].values
        else:
            # NOTE(review): as_matrix is deprecated in newer pandas
            # releases -- confirm the supported pandas range before
            # replacing it.
            t = X.as_matrix(cols)

        return t

    def fit(self, X, y=None):
        """
        Fit a transformation from the pipeline

        X       the data to fit

        Returns self.
        """
        for columns, transformers in self.features:
            if transformers is not None:
                if isinstance(transformers, list):
                    # first fit_transform all transformers except the last one
                    Xt = self._get_col_subset(X, columns)
                    for transformer in transformers[:-1]:
                        Xt = transformer.fit_transform(Xt)
                    # then fit the last one without transformation
                    transformers[-1].fit(Xt)
                else:
                    transformers.fit(self._get_col_subset(X, columns))
        return self

    def transform(self, X):
        """
        Transform the given data. Assumes that fit has already been called.

        X       the data to transform

        Returns a numpy array with all feature outputs stacked horizontally.
        """
        extracted = []
        for columns, transformers in self.features:
            # columns could be a string or list of
            # strings; we don't care because pandas
            # will handle either.
            Xt = self._get_col_subset(X, columns)
            if transformers is not None:
                if isinstance(transformers, list):
                    for transformer in transformers:
                        Xt = transformer.transform(Xt)
                else:
                    Xt = transformers.transform(Xt)
            extracted.append(_handle_feature(Xt))

        # combine the feature outputs into one array.
        # at this point we lose track of which features
        # were created from which input columns, so it's
        # assumed that that doesn't matter to the model.
        return np.hstack(extracted)