From cdda3e71c8620dc793fa6a2abe7b8d78b7721f1e Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 29 Dec 2023 10:19:44 +0800 Subject: [PATCH] better dc default umap match transpose index type-spec concat type-spec concat dc for comp_cluster dirty_cat as default, cc passes most tests ;) source cu_cat from pypi source cu_cat from pypi remove cc tests, tested for in dc place remove cc tests, tested for in dc place init 1dc > 2cc init 1dc > 2cc use constants throughout revert from constants revert from constants init 1dc > 2cc better dc default better dc default --- graphistry/feature_utils.py | 80 +++++++++++++++++--------- graphistry/tests/test_feature_utils.py | 43 -------------- graphistry/umap_utils.py | 25 ++++++-- setup.py | 2 +- 4 files changed, 74 insertions(+), 76 deletions(-) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index 4b5bb0adf..752ab11cd 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -201,7 +201,7 @@ def resolve_feature_engine( feature_engine: FeatureEngine, ) -> FeatureEngineConcrete: # noqa - if feature_engine in ["none", "pandas", DIRTY_CAT, "torch", CUDA_CAT]: + if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]: return feature_engine # type: ignore if feature_engine == "auto": has_dependancy_text_, _, _ = lazy_import_has_dependancy_text() @@ -967,19 +967,19 @@ def process_dirty_dataframes( the data encoder, and the label encoder. """ - if feature_engine == CUDA_CAT: + if feature_engine == "cu_cat": assert_imported_cucat() - from cu_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder + from cu_cat import SuperVectorizer, GapEncoder from cuml.preprocessing import FunctionTransformer - - else: # if feature_engine == "dirty_cat": # DIRTY_CAT - from dirty_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder + + else: + from dirty_cat import SuperVectorizer, GapEncoder from sklearn.preprocessing import FunctionTransformer t = time() if not is_dataframe_all_numeric(ndf): - if feature_engine == CUDA_CAT: + if feature_engine == "cu_cat": data_encoder = SuperVectorizer( auto_cast=True, cardinality_threshold=cardinality_threshold_target, @@ -1010,9 +1010,9 @@ def process_dirty_dataframes( features_transformed = data_encoder.get_feature_names_out() all_transformers = data_encoder.transformers - if feature_engine == CUDA_CAT: + if feature_engine == "cu_cat": logger.info(f"-Shape of [[cu_cat fit]] data {X_enc.shape}") - elif feature_engine == DIRTY_CAT: + else: logger.info(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}") logger.debug(f"-Transformers: \n{all_transformers}\n") logger.debug( @@ -1058,7 +1058,7 @@ def process_dirty_dataframes( t2 = time() logger.debug("-Fitting Targets --\n%s", y.columns) - if feature_engine == CUDA_CAT: + if feature_engine == "cu_cat": label_encoder = SuperVectorizer( auto_cast=True, cardinality_threshold=cardinality_threshold_target, @@ -1486,10 +1486,17 @@ def process_edge_dataframes( other_df, y ) # add the two datasets together - if feature_engine == 'pandas': - X_enc = pd.concat([T, X_enc], axis=1) - elif feature_engine == 'cudf': + has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf() + T_type = str(getmodule(T)) + X_type = str(getmodule(X_enc)) + if 'cudf' in T_type and 'cudf' in X_type: X_enc = cudf.concat([T, X_enc], axis=1) + elif 'pd' in T_type and 'pd' in X_type: + X_enc = pd.concat([T, X_enc], axis=1) + elif 'cudf' in T_type and 'pd' in X_type: + X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1) + elif 'pd' in T_type and 'cudf' in X_type: + X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1) # then scale them X_encs, y_encs, scaling_pipeline, scaling_pipeline_target = smart_scaler( # noqa X_enc, @@ -1556,21 +1563,17 @@ def process_edge_dataframes( if not X_enc.empty and not T.empty: logger.debug("-" * 60) logger.debug("<= Found Edges and Dirty_cat encoding =>") + has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf() T_type = str(getmodule(T)) X_type = str(getmodule(X_enc)) if 'cudf' in T_type and 'cudf' in X_type: X_enc = cudf.concat([T, X_enc], axis=1) elif 'pd' in T_type and 'pd' in X_type: X_enc = pd.concat([T, X_enc], axis=1) - else: - try: - X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1) - except: - pass - try: - X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1) - except: - pass + elif 'cudf' in T_type and 'pd' in X_type: + X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1) + elif 'pd' in T_type and 'cudf' in X_type: + X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1) elif not T.empty and X_enc.empty: logger.debug("-" * 60) logger.debug("<= Found only Edges =>") @@ -1750,7 +1753,18 @@ def transform( # concat text to dirty_cat, with text in front. if not tX.empty and not X.empty: - X = pd.concat([tX, X], axis=1) + has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf() + T_type = str(getmodule(tX)) + X_type = str(getmodule(X)) + if 'cudf' in T_type and 'cudf' in X_type: + X = cudf.concat([tX, X], axis=1) + elif 'pd' in T_type and 'pd' in X_type: + X = pd.concat([tX, X], axis=1) + elif 'cudf' in T_type and 'pd' in X_type: + X = cudf.concat([cudf.from_pandas(tX), X], axis=1) + elif 'pd' in T_type and 'cudf' in X_type: + X = cudf.concat([tX, cudf.from_pandas(X)], axis=1) + # X = pd.concat([tX, X], axis=1) logger.info("--Combining both Textual and Numeric/Dirty_Cat") elif not tX.empty and X.empty: X = tX # textual @@ -1765,7 +1779,18 @@ def transform( # now if edges, add T at front if kind == "edges": - X = pd.concat([T, X], axis=1) # edges, text, dirty_cat + # X = pd.concat([T, X], axis=1) # edges, text, dirty_cat + has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf() + T_type = str(getmodule(T)) + X_type = str(getmodule(X)) + if 'cudf' in T_type and 'cudf' in X_type: + X = cudf.concat([T, X], axis=1) + elif 'pd' in T_type and 'pd' in X_type: + X = pd.concat([T, X], axis=1) + elif 'cudf' in T_type and 'pd' in X_type: + X = cudf.concat([cudf.from_pandas(T), X], axis=1) + elif 'pd' in T_type and 'cudf' in X_type: + X = cudf.concat([T, cudf.from_pandas(X)], axis=1) logger.info("-Combining MultiLabelBinarizer with previous features") logger.info("-" * 40) @@ -2656,10 +2681,11 @@ def featurize( """ feature_engine = resolve_feature_engine(feature_engine) - if feature_engine == 'dirty_cat': - assert_imported_min() - elif feature_engine == 'cu_cat': + + if feature_engine == "cu_cat": assert_imported_cucat() + else: + assert_imported_min() if inplace: res = self diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index 33550f90b..81afa09d7 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -444,48 +444,5 @@ def test_edge_scaling(self): return_scalers=True) -class TestFeaturizeGetMethodsCucat(unittest.TestCase): - - @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") - @pytest.mark.skipif(not is_test_cudf, reason="requires cudf") - def setUp(self) -> None: - _, _, cudf = lazy_import_has_dependancy_cudf() - ndf_malware = pd.read_csv("graphistry/tests/data/malware_capture_bot.csv", index_col=0) - g = graphistry.nodes(cudf.from_pandas(ndf_malware)) - - g2 = g.featurize(y=cudf.from_pandas(double_target_reddit), # ngrams - use_ngrams=True, - ngram_range=(1, 4) - ) - - g3 = g.featurize(**topic_model, feature_engine="cu_cat") # topic model - self.g = g - self.g2 = g2 - self.g3 = g3 - - @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") - @pytest.mark.skipif(not is_test_cudf, reason="requires cudf") - def test_get_col_matrix(self): - _, _, cudf = lazy_import_has_dependancy_cudf() - # no edges so this should be None - assert self.g2.get_matrix(kind='edges') is None - - # test target methods - assert all(self.g2.get_matrix(target=True).columns == self.g2._node_target.columns) - # assert self.g2.get_matrix('Anxiety', target=True).shape[0] == len(self.g2._node_target) - # test str vs list - # assert (self.g2.get_matrix('Anxiety', target=True) == self.g2.get_matrix(['Anxiety'], target=True)).all().values[0] - - # assert list(self.g2.get_matrix(['Anxiety', 'education', 'computer'], target=True).columns) == ['label_Anxiety', 'label_education', 'label_computervision'] - - # test feature methods - # ngrams - assert (self.g2.get_matrix().columns == self.g2._node_features.columns).all() - # assert list(self.g2.get_matrix('what').columns) == what, list(self.g2.get_matrix('what').columns) - - # topic - assert all(self.g3.get_matrix().columns == self.g3._node_features.columns) - - if __name__ == "__main__": unittest.main() diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py index 6e23a11f3..374d9eb76 100644 --- a/graphistry/umap_utils.py +++ b/graphistry/umap_utils.py @@ -728,12 +728,27 @@ def _bind_xy_from_umap( emb = res._edge_embedding if isinstance(df, type(emb)): - df[x_name] = emb.values.T[0] - df[y_name] = emb.values.T[1] + try: + df[x_name] = emb.values.T[0] + df[y_name] = emb.values.T[1] + except: + pass + try: + df[x_name] = emb.values[0] + df[y_name] = emb.values[1] + except: + pass elif isinstance(df, pd.DataFrame) and 'cudf' in str(getmodule(emb)): - df[x_name] = emb.to_numpy().T[0] - df[y_name] = emb.to_numpy().T[1] - + try: + df[x_name] = emb.to_numpy().T[0] + df[y_name] = emb.to_numpy().T[1] + except: + pass + try: + df[x_name] = emb.to_numpy()[0] + df[y_name] = emb.to_numpy()[1] + except: + pass res = res.nodes(df) if kind == "nodes" else res.edges(df) if encode_weight and kind == "nodes": diff --git a/setup.py b/setup.py index 65a4a16e8..2ceda8c0a 100755 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ def unique_flatten_dict(d): # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib'] -base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat @ git+http://github.com/graphistry/cu-cat.git@v0.05.0'] +base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat'] base_extras = {**base_extras_light, **base_extras_heavy}