graphistry · tanmoyio · May 15, 2023 · May 15, 2023 · May 15, 2023 · Jun 13, 2023
diff --git a/docker/test-gpu-local.sh b/docker/test-gpu-local.sh
@@ -44,5 +44,4 @@ docker run \
     ${NETWORK} \
     graphistry/test-gpu:${TEST_CPU_VERSION} \
         --maxfail=1 \
-        --ignore=graphistry/tests/test_feature_utils.py \
         $@
diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
@@ -49,6 +49,16 @@
         SuperVectorizer = Any
         GapEncoder = Any
         SimilarityEncoder = Any
+    try:
+        from cu_cat import (
+            SuperVectorizer,
+            GapEncoder,
+            SimilarityEncoder,
+        )  # type: ignore
+    except:
+        SuperVectorizer = Any
+        GapEncoder = Any
+        SimilarityEncoder = Any
     try:
         from sklearn.preprocessing import FunctionTransformer
         from sklearn.base import BaseEstimator, TransformerMixin
@@ -93,6 +103,28 @@ def lazy_import_has_min_dependancy():
     except ModuleNotFoundError as e:
         return False, e
 
+def lazy_import_has_dependancy_cu_cat():
+    import warnings
+    warnings.filterwarnings("ignore")
+    try:
+        import scipy.sparse  # noqa
+        from scipy import __version__ as scipy_version
+        from cu_cat import __version__ as cu_cat_version
+        import cu_cat
+        from sklearn import __version__ as sklearn_version
+        from cuml import __version__ as cuml_version
+        import cuml
+        from cudf import __version__ as cudf_version
+        import cudf
+        logger.debug(f"SCIPY VERSION: {scipy_version}")
+        logger.debug(f"Cuda CAT VERSION: {cu_cat_version}")
+        logger.debug(f"sklearn VERSION: {sklearn_version}")
+        logger.debug(f"cuml VERSION: {cuml_version}")
+        logger.debug(f"cudf VERSION: {cudf_version}")
+        return True, 'ok', cudf
+    except ModuleNotFoundError as e:
+        return False, e, None
+
 
 def assert_imported_text():
     has_dependancy_text_, import_text_exn, _ = lazy_import_has_dependancy_text()
@@ -114,6 +146,33 @@ def assert_imported():
         raise import_min_exn
 
 
+def assert_cuml_cucat():
+    has_cuml_dependancy_, import_cuml_exn, cudf = lazy_import_has_dependancy_cu_cat()
+    if not has_cuml_dependancy_:
+        logger.error(  # noqa
+                     "cuml not found, trying running"  # noqa
+                     "`pip install rapids`"  # noqa
+        )
+        raise import_cuml_exn
+
+
+def make_safe_gpu_dataframes(X, y, engine):
+    has_cudf_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat()
+    if has_cudf_dependancy_:
+        new_kwargs = {}
+        kwargs = {'X': X, 'y': y}
+        for key, value in kwargs.items():
+            if isinstance(value, cudf.DataFrame) and engine in ["pandas", "dirty_cat", "torch"]:
+                new_kwargs[key] = value.to_pandas()
+            elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]:
+                new_kwargs[key] = cudf.from_pandas(value)
+            else:
+                new_kwargs[key] = value
+        return new_kwargs['X'], new_kwargs['y']
+    else:
+        return X, y
+
+
 # ############################################################################
 #
 #     Rough calltree
@@ -137,29 +196,32 @@ def assert_imported():
 #
 #      _featurize_or_get_edges_dataframe_if_X_is_None
 
-FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch"]
+FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch", "cu_cat"]
 FeatureEngine = Literal[FeatureEngineConcrete, "auto"]
 
 
 def resolve_feature_engine(
     feature_engine: FeatureEngine,
 ) -> FeatureEngineConcrete:  # noqa
 
-    if feature_engine in ["none", "pandas", "dirty_cat", "torch"]:
+    if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]:
         return feature_engine  # type: ignore
 
     if feature_engine == "auto":
         has_dependancy_text_, _, _ = lazy_import_has_dependancy_text()
         if has_dependancy_text_:
             return "torch"
+        has_cuml_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat()
+        if has_cuml_dependancy_:
+            return "cu_cat"
         has_min_dependancy_, _ = lazy_import_has_min_dependancy()
         if has_min_dependancy_:
             return "dirty_cat"
         return "pandas"
 
     raise ValueError(  # noqa
         f'feature_engine expected to be "none", '
-        '"pandas", "dirty_cat", "torch", or "auto"'
+        '"pandas", "dirty_cat", "torch", "cu_cat", or "auto"'
         f'but received: {feature_engine} :: {type(feature_engine)}'
     )
 
@@ -230,18 +292,19 @@ def features_without_target(
     :param y: target DataFrame
     :return: DataFrames of model and target
     """
+    _, _, cudf = lazy_import_has_dependancy_cu_cat()
     if y is None:
         return df
     remove_cols = []
     if y is None:
         pass
-    elif isinstance(y, pd.DataFrame):
+    elif isinstance(y, pd.DataFrame) or isinstance(y, cudf.DataFrame):
         yc = y.columns
         xc = df.columns
         for c in yc:
             if c in xc:
                 remove_cols.append(c)
-    elif isinstance(y, pd.Series):
+    elif isinstance(y, pd.Series) or isinstance(y, cudf.Series):
         if y.name and (y.name in df.columns):
             remove_cols = [y.name]
     elif isinstance(y, List):
@@ -265,7 +328,7 @@ def remove_node_column_from_symbolic(X_symbolic, node):
             logger.info(f"Removing `{node}` from input X_symbolic list")
             X_symbolic.remove(node)
         return X_symbolic
-    if isinstance(X_symbolic, pd.DataFrame):
+    if isinstance(X_symbolic, pd.DataFrame) or 'cudf' in str(getmodule(X_symbolic)):
         logger.info(f"Removing `{node}` from input X_symbolic DataFrame")
         return X_symbolic.drop(columns=[node], errors="ignore")
 
@@ -619,11 +682,19 @@ def fit_pipeline(
     columns = X.columns
     index = X.index
 
-    X = transformer.fit_transform(X)
-    if keep_n_decimals:
-        X = np.round(X, decimals=keep_n_decimals)  #  type: ignore  # noqa
-
-    return pd.DataFrame(X, columns=columns, index=index)
+    X_type = str(getmodule(X))
+    if 'cudf' not in X_type:
+        X = transformer.fit_transform(X)
+        if keep_n_decimals:
+            X = np.round(X, decimals=keep_n_decimals)  #  type: ignore  # noqa
+        X = pd.DataFrame(X, columns=columns, index=index)
+    else:
+        X = transformer.fit_transform(X.to_numpy())
+        if keep_n_decimals:
+            X = np.round(X, decimals=keep_n_decimals)  #  type: ignore  # noqa
+        _, _, cudf = lazy_import_has_dependancy_cu_cat()
+        X = cudf.DataFrame(X, columns=columns, index=index)
+    return X
 
 
 def impute_and_scale_df(
@@ -848,6 +919,7 @@ def process_dirty_dataframes(
     similarity: Optional[str] = None,  # "ngram",
     categories: Optional[str] = "auto",
     multilabel: bool = False,
+    feature_engine: Optional[str] = "dirty_cat",
 ) -> Tuple[
     pd.DataFrame,
     Optional[pd.DataFrame],
@@ -873,8 +945,16 @@ def process_dirty_dataframes(
     :return: Encoded data matrix and target (if not None),
             the data encoder, and the label encoder.
     """
-    from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder
-    from sklearn.preprocessing import FunctionTransformer
+
+    if feature_engine == 'cu_cat':
+        lazy_import_has_dependancy_cu_cat()
+        from cu_cat import SuperVectorizer, GapEncoder, SimilarityEncoder
+        from cuml.preprocessing import FunctionTransformer
+
+    else:
+        from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder
+        from sklearn.preprocessing import FunctionTransformer
+
     t = time()
 
     if not is_dataframe_all_numeric(ndf):
@@ -911,12 +991,19 @@ def process_dirty_dataframes(
         )
         #  now just set the feature names, since dirty cat changes them in
         #  a weird way...
-        data_encoder.get_feature_names_out = callThrough(features_transformed)
-
-        X_enc = pd.DataFrame(
-            X_enc, columns=features_transformed, index=ndf.index
-        )
-        X_enc = X_enc.fillna(0.0)
+        data_encoder.get_feature_names_out = callThrough(features_transformed) 
+        if 'cudf' not in str(getmodule(ndf)):
+            X_enc = pd.DataFrame(
+                X_enc, columns=features_transformed, index=ndf.index
+            )
+            X_enc = X_enc.fillna(0.0)
+        else:
+            _, _, cudf = lazy_import_has_dependancy_cu_cat()
+            X_enc = cudf.DataFrame(
+                X_enc, columns=features_transformed, index=ndf.index
+            )
+            X_enc = X_enc.fillna(0.0).to_pandas()  # will be removed for future cu_cat release
+
     else:
         logger.info("-*-*- DataFrame is completely numeric")
         X_enc, _, data_encoder, _ = get_numeric_transformers(ndf, None)
@@ -1117,7 +1204,8 @@ def process_nodes_dataframes(
         n_topics_target=n_topics_target,
         similarity=similarity,
         categories=categories,
-        multilabel=multilabel
+        multilabel=multilabel,
+        feature_engine=feature_engine,
     )
 
     if embedding:
@@ -1235,20 +1323,31 @@ def encode_edges(edf, src, dst, mlb, fit=False):
     """
     # uses mlb with fit=T/F so we can use it in transform mode
     # to recreate edge feature concat definition
+    edf_type = str(getmodule(edf))
     source = edf[src]
     destination = edf[dst]
+    source_dtype = str(getmodule(source))
     logger.debug("Encoding Edges using MultiLabelBinarizer")
-    if fit:
+    if fit and 'cudf' not in source_dtype:
         T = mlb.fit_transform(zip(source, destination))
-    else:
+    elif fit and 'cudf' in source_dtype:
+        T = mlb.fit_transform(zip(source.to_pandas(), destination.to_pandas()))
+    elif not fit and 'cudf' not in source_dtype:
         T = mlb.transform(zip(source, destination))
+    elif not fit and 'cudf' in source_dtype:
+        T = mlb.transform(zip(source.to_pandas(), destination.to_pandas()))
+
     T = 1.0 * T  # coerce to float
     columns = [
         str(k) for k in mlb.classes_
     ]  # stringify the column names or scikits.base throws error
     mlb.get_feature_names_out = callThrough(columns)
     mlb.columns_ = [src, dst]
-    T = pd.DataFrame(T, columns=columns, index=edf.index)
+    if 'cudf' in edf_type:
+        _, _, cudf = lazy_import_has_dependancy_cu_cat()
+        T = cudf.DataFrame(T, columns=columns, index=edf.index)
+    else:
+        T = pd.DataFrame(T, columns=columns, index=edf.index)
     logger.info(f"Shape of Edge Encoding: {T.shape}")
     return T, mlb
 
@@ -1321,6 +1420,7 @@ def process_edge_dataframes(
         MultiLabelBinarizer()
     )  # create new one so we can use encode_edges later in
     # transform with fit=False
+    _, _, cudf = lazy_import_has_dependancy_cu_cat()
     T, mlb_pairwise_edge_encoder = encode_edges(
         edf, src, dst, mlb_pairwise_edge_encoder, fit=True
     )
@@ -1406,7 +1506,11 @@ def process_edge_dataframes(
     if not X_enc.empty and not T.empty:
         logger.debug("-" * 60)
         logger.debug("<= Found Edges and Dirty_cat encoding =>")
-        X_enc = pd.concat([T, X_enc], axis=1)
+        T_type = str(getmodule(T))
+        if 'cudf' in T_type:
+            X_enc = cudf.concat([T, X_enc], axis=1)
+        else:
+            X_enc = pd.concat([T, X_enc], axis=1)
     elif not T.empty and X_enc.empty:
         logger.debug("-" * 60)
         logger.debug("<= Found only Edges =>")
@@ -1811,7 +1915,7 @@ def prune_weighted_edges_df_and_relabel_nodes(
         " -- Pruning weighted edge DataFrame "
         f"from {len(wdf):,} to {len(wdf2):,} edges."
     )
-    if index_to_nodes_dict is not None:
+    if index_to_nodes_dict is not None and type(index_to_nodes_dict) == dict:
         wdf2[config.SRC] = wdf2[config.SRC].map(index_to_nodes_dict)
         wdf2[config.DST] = wdf2[config.DST].map(index_to_nodes_dict)
     return wdf2
@@ -1952,7 +2056,8 @@ def _featurize_nodes(
         X_resolved = resolve_X(ndf, X)
         y_resolved = resolve_y(ndf, y)
 
-        feature_engine = resolve_feature_engine(feature_engine)
+        res.feature_engine = feature_engine
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
 
         from .features import ModelDict
 
@@ -2076,6 +2181,9 @@ def _featurize_edges(
                 **{res._destination: res._edges[res._destination]}
             )
 
+        res.feature_engine = feature_engine
+        X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine)
+
         # now that everything is set
         fkwargs = dict(
             X=X_resolved,
@@ -2487,13 +2595,18 @@ def featurize(
                 default True.
         :return: graphistry instance with new attributes set by the featurization process.
         """
-        assert_imported()
+        feature_engine = resolve_feature_engine(feature_engine)
+
+        if feature_engine == 'dirty_cat':
+            assert_imported()
+        elif feature_engine == 'cu_cat':
+            assert_cuml_cucat()
+
         if inplace:
             res = self
         else:
             res = self.bind()
 
-        feature_engine = resolve_feature_engine(feature_engine)
 
         if kind == "nodes":
             res = res._featurize_nodes(