1 files changed, 159 insertions, 0 deletions
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
new file mode 100644
index 0000000..0f5d94c
--- /dev/null
+++ b/sklearn_pandas/__init__.py
@@ -0,0 +1,159 @@
+__version__ = '0.0.12'
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn import cross_validation
+from sklearn import grid_search
+import sys
+
+# Base string type used for isinstance checks on column selectors:
+# ``basestring`` on Python 2, ``str`` on Python 3 (where ``basestring``
+# no longer exists).
+if sys.version_info < (3, 0):
+    string_types = basestring  # noqa: F821 -- only evaluated on Python 2
+else:
+    string_types = str
+
+
+def cross_val_score(model, X, *args, **kwargs):
+    """
+    Cross-validate a model on data given as a pandas data frame.
+
+    model   the estimator handed to sklearn's cross_val_score
+    X       the data; it is wrapped in a DataWrapper before being
+            passed on
+
+    Any further positional and keyword arguments are forwarded
+    unchanged. Returns whatever sklearn's
+    cross_validation.cross_val_score returns.
+    """
+    wrapped = DataWrapper(X)
+    return cross_validation.cross_val_score(model, wrapped, *args, **kwargs)
+
+
+class GridSearchCV(grid_search.GridSearchCV):
+    """
+    Variant of sklearn's grid_search.GridSearchCV whose fit and
+    predict wrap their input in a DataWrapper before delegating to
+    the parent class.
+    """
+
+    def fit(self, X, *params, **kwparams):
+        """
+        Run the parameter search on X.
+
+        X       the data to fit on; wrapped in a DataWrapper
+
+        Remaining arguments are forwarded unchanged. Returns the
+        parent's fit result so that calls can be chained.
+        """
+        return super(GridSearchCV, self).fit(
+            DataWrapper(X), *params, **kwparams)
+
+    def predict(self, X, *params, **kwparams):
+        """
+        Predict for X using the parent's predict.
+
+        X       the data to predict for; wrapped in a DataWrapper
+
+        Remaining arguments are forwarded unchanged. Returns the
+        parent's predictions.
+        """
+        # Delegate to predict (not fit): calling fit here would re-run
+        # the search on the prediction data and return nothing.
+        return super(GridSearchCV, self).predict(
+            DataWrapper(X), *params, **kwparams)
+
+
+try:
+    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
+        """
+        Variant of sklearn's grid_search.RandomizedSearchCV whose fit
+        and predict wrap their input in a DataWrapper before
+        delegating to the parent class.
+        """
+
+        def fit(self, X, *params, **kwparams):
+            """
+            Run the randomized parameter search on X.
+
+            X       the data to fit on; wrapped in a DataWrapper
+
+            Remaining arguments are forwarded unchanged. Returns the
+            parent's fit result so that calls can be chained.
+            """
+            return super(RandomizedSearchCV, self).fit(
+                DataWrapper(X), *params, **kwparams)
+
+        def predict(self, X, *params, **kwparams):
+            """
+            Predict for X using the parent's predict.
+
+            X       the data to predict for; wrapped in a DataWrapper
+
+            Remaining arguments are forwarded unchanged. Returns the
+            parent's predictions.
+            """
+            # Delegate to predict (not fit): calling fit here would
+            # re-run the search on the prediction data.
+            return super(RandomizedSearchCV, self).predict(
+                DataWrapper(X), *params, **kwparams)
+except AttributeError:
+    # grid_search has no RandomizedSearchCV attribute (presumably an
+    # older sklearn release); the wrapper is deliberately left undefined.
+    pass
+
+
+class DataWrapper(object):
+    """
+    Thin wrapper that gives a pandas data frame positional,
+    sequence-style indexing.
+
+    len() reports the length of the wrapped frame and item access is
+    routed through ``iloc``, so keys are interpreted as row positions
+    rather than index labels.
+    """
+
+    def __init__(self, df):
+        # the wrapped data frame; read back by DataFrameMapper
+        self.df = df
+
+    def __len__(self):
+        wrapped = self.df
+        return len(wrapped)
+
+    def __getitem__(self, key):
+        positional = self.df.iloc
+        return positional[key]
+
+
+class PassthroughTransformer(TransformerMixin):
+    """
+    Transformer that learns nothing and returns its input as a float
+    numpy array.
+    """
+
+    def fit(self, X, y=None, **fit_params):
+        """
+        No-op fit; all arguments are ignored.
+
+        Returns self.
+        """
+        return self
+
+    def transform(self, X):
+        """
+        Convert X to a numpy array of floats.
+
+        X       the data to convert
+
+        Returns a new float numpy array holding the values of X.
+        """
+        # The ``np.float`` alias was deprecated in NumPy 1.20 and removed
+        # in 1.24; the builtin ``float`` selects the same dtype.
+        return np.array(X).astype(float)
+
+
+def _handle_feature(fea):
+    """
+    Prepare one transformed feature block for column-wise stacking.
+
+    fea     the transformer output; anything exposing ``toarray`` is
+            densified first
+
+    Returns the (densified) input unchanged unless its shape has a
+    single dimension, in which case it is returned as a one-column
+    2-d array.
+    """
+    # sparse results must become regular arrays before hstack
+    dense = fea.toarray() if hasattr(fea, 'toarray') else fea
+
+    if len(dense.shape) != 1:
+        return dense
+
+    # turn a flat vector of n values into an (n, 1) column
+    return np.array([dense]).T
+
+
+class DataFrameMapper(BaseEstimator, TransformerMixin):
+    """
+    Map Pandas data frame column subsets to their own
+    sklearn transformation.
+    """
+
+    def __init__(self, features):
+        """
+        Params:
+
+        features    a list of pairs. The first element is the pandas column
+                    selector. This can be a string (for one column) or a list
+                    of strings. The second element is an object that supports
+                    sklearn's transform interface, a list of such objects
+                    that are applied in order, or None to pass the selected
+                    columns through untransformed.
+        """
+        self.features = features
+
+    def _get_col_subset(self, X, cols):
+        """
+        Get a subset of columns from the given table X.
+
+        X       a Pandas dataframe, a DataWrapper around one, or a list
+                of rows that each support selection by a list of column
+                names; the table to select columns from
+        cols    a string or list of strings representing the columns
+                to select
+
+        Returns a numpy array with the data from the selected columns:
+        a 1-d vector when cols is a single string, a 2-d array when
+        cols is a list.
+        """
+        return_vector = isinstance(cols, string_types)
+        if return_vector:
+            cols = [cols]
+
+        if isinstance(X, list):
+            # a list of rows: keep only the wanted columns of each row
+            # and assemble them back into a data frame
+            X = pd.DataFrame([x[cols] for x in X])
+        elif isinstance(X, DataWrapper):
+            # if it's a datawrapper, unwrap it
+            X = X.df
+
+        if return_vector:
+            return X[cols[0]].values
+
+        # DataFrame.as_matrix was deprecated in pandas 0.23 and removed
+        # in 1.0; selecting the columns and taking .values is the
+        # replacement.
+        return X[cols].values
+
+    def fit(self, X, y=None):
+        """
+        Fit a transformation from the pipeline
+
+        X       the data to fit
+        y       unused; accepted for compatibility with sklearn's
+                fit signature
+
+        Returns self.
+        """
+        for columns, transformers in self.features:
+            if transformers is None:
+                # pass-through columns have nothing to fit
+                continue
+
+            if not isinstance(transformers, list):
+                transformers.fit(self._get_col_subset(X, columns))
+                continue
+
+            if not transformers:
+                # an empty chain is a pass-through in transform(), so
+                # there is nothing to fit here either
+                continue
+
+            # first fit_transform all transformers except the last one
+            Xt = self._get_col_subset(X, columns)
+            for transformer in transformers[:-1]:
+                Xt = transformer.fit_transform(Xt)
+            # then fit the last one without transformation
+            transformers[-1].fit(Xt)
+        return self
+
+    def transform(self, X):
+        """
+        Transform the given data. Assumes that fit has already been called.
+
+        X       the data to transform
+
+        Returns a single numpy array with the outputs of all feature
+        definitions stacked side by side, in the order of self.features.
+        """
+        extracted = []
+        for columns, transformers in self.features:
+            # columns could be a string or list of
+            # strings; we don't care because pandas
+            # will handle either.
+            Xt = self._get_col_subset(X, columns)
+            if transformers is not None:
+                if isinstance(transformers, list):
+                    # apply the chain in order, feeding each output
+                    # into the next transformer
+                    for transformer in transformers:
+                        Xt = transformer.transform(Xt)
+                else:
+                    Xt = transformers.transform(Xt)
+            extracted.append(_handle_feature(Xt))
+
+        # combine the feature outputs into one array.
+        # at this point we lose track of which features
+        # were created from which input columns, so it's
+        # assumed that that doesn't matter to the model.
+        return np.hstack(extracted)