diff options
Diffstat (limited to 'sklearn_pandas/__init__.py')
-rw-r--r-- | sklearn_pandas/__init__.py | 161 |
1 files changed, 3 insertions, 158 deletions
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py index 0f5d94c..537ab56 100644 --- a/sklearn_pandas/__init__.py +++ b/sklearn_pandas/__init__.py @@ -1,159 +1,4 @@ -__version__ = '0.0.12' +__version__ = '1.1.0' -import numpy as np -import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn import cross_validation -from sklearn import grid_search -import sys - -# load in the correct stringtype: str for py3, basestring for py2 -string_types = str if sys.version_info >= (3, 0) else basestring - - -def cross_val_score(model, X, *args, **kwargs): - X = DataWrapper(X) - return cross_validation.cross_val_score(model, X, *args, **kwargs) - - -class GridSearchCV(grid_search.GridSearchCV): - def fit(self, X, *params, **kwparams): - super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams) - - def predict(self, X, *params, **kwparams): - super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams) - - -try: - class RandomizedSearchCV(grid_search.RandomizedSearchCV): - def fit(self, X, *params, **kwparams): - super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams) - - def predict(self, X, *params, **kwparams): - super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams) -except AttributeError: - pass - - -class DataWrapper(object): - def __init__(self, df): - self.df = df - - def __len__(self): - return len(self.df) - - def __getitem__(self, key): - return self.df.iloc[key] - - -class PassthroughTransformer(TransformerMixin): - def fit(self, X, y=None, **fit_params): - return self - - def transform(self, X): - return np.array(X).astype(np.float) - - -def _handle_feature(fea): - if hasattr(fea, 'toarray'): - # sparse arrays should be converted to regular arrays - # for hstack. - fea = fea.toarray() - - if len(fea.shape) == 1: - fea = np.array([fea]).T - - return fea - - -class DataFrameMapper(BaseEstimator, TransformerMixin): - """ - Map Pandas data frame column subsets to their own - sklearn transformation. - """ - - def __init__(self, features): - """ - Params: - - features a list of pairs. The first element is the pandas column - selector. This can be a string (for one column) or a list - of strings. The second element is an object that supports - sklearn's transform interface. - """ - self.features = features - - def _get_col_subset(self, X, cols): - """ - Get a subset of columns from the given table X. - - X a Pandas dataframe; the table to select columns from - cols a string or list of strings representing the columns - to select - - Returns a numpy array with the data from the selected columns - """ - return_vector = False - if isinstance(cols, string_types): - return_vector = True - cols = [cols] - - if isinstance(X, list): - X = [x[cols] for x in X] - X = pd.DataFrame(X) - - elif isinstance(X, DataWrapper): - # if it's a datawrapper, unwrap it - X = X.df - - if return_vector: - t = X[cols[0]].values - else: - t = X.as_matrix(cols) - - return t - - def fit(self, X, y=None): - """ - Fit a transformation from the pipeline - - X the data to fit - """ - for columns, transformers in self.features: - if transformers is not None: - if isinstance(transformers, list): - # first fit_transform all transformers except the last one - Xt = self._get_col_subset(X, columns) - for transformer in transformers[:-1]: - Xt = transformer.fit_transform(Xt) - # then fit the last one without transformation - transformers[-1].fit(Xt) - else: - transformers.fit(self._get_col_subset(X, columns)) - return self - - def transform(self, X): - """ - Transform the given data. Assumes that fit has already been called. - - X the data to transform - """ - extracted = [] - for columns, transformers in self.features: - # columns could be a string or list of - # strings; we don't care because pandas - # will handle either. - Xt = self._get_col_subset(X, columns) - if transformers is not None: - if isinstance(transformers, list): - for transformer in transformers: - Xt = transformer.transform(Xt) - else: - Xt = transformers.transform(Xt) - extracted.append(_handle_feature(Xt)) - - # combine the feature outputs into one array. - # at this point we lose track of which features - # were created from which input columns, so it's - # assumed that that doesn't matter to the model. - return np.hstack(extracted) +from .dataframe_mapper import DataFrameMapper # NOQA +from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA |