Skip to content

Commit

Permalink
better dc default
Browse files Browse the repository at this point in the history
umap match transpose index

type-spec concat

type-spec concat

dc for comp_cluster

dirty_cat as default, cc passes most tests ;)

source cu_cat from pypi

source cu_cat from pypi

remove cc tests, tested for in dc place

remove cc tests, tested for in dc place

init 1dc > 2cc

init 1dc > 2cc

use constants throughout

revert from constants

revert from constants

init 1dc > 2cc

better dc default

better dc default
  • Loading branch information
dcolinmorgan committed Jan 2, 2024
1 parent 5a69233 commit cdda3e7
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 76 deletions.
80 changes: 53 additions & 27 deletions graphistry/feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def resolve_feature_engine(
feature_engine: FeatureEngine,
) -> FeatureEngineConcrete: # noqa

if feature_engine in ["none", "pandas", DIRTY_CAT, "torch", CUDA_CAT]:
if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]:
return feature_engine # type: ignore
if feature_engine == "auto":
has_dependancy_text_, _, _ = lazy_import_has_dependancy_text()
Expand Down Expand Up @@ -967,19 +967,19 @@ def process_dirty_dataframes(
the data encoder, and the label encoder.
"""

if feature_engine == CUDA_CAT:
if feature_engine == "cu_cat":
assert_imported_cucat()
from cu_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
from cu_cat import SuperVectorizer, GapEncoder
from cuml.preprocessing import FunctionTransformer

else: # if feature_engine == "dirty_cat": # DIRTY_CAT
from dirty_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
else:
from dirty_cat import SuperVectorizer, GapEncoder
from sklearn.preprocessing import FunctionTransformer

t = time()

if not is_dataframe_all_numeric(ndf):
if feature_engine == CUDA_CAT:
if feature_engine == "cu_cat":
data_encoder = SuperVectorizer(
auto_cast=True,
cardinality_threshold=cardinality_threshold_target,
Expand Down Expand Up @@ -1010,9 +1010,9 @@ def process_dirty_dataframes(
features_transformed = data_encoder.get_feature_names_out()

all_transformers = data_encoder.transformers
if feature_engine == CUDA_CAT:
if feature_engine == "cu_cat":
logger.info(f"-Shape of [[cu_cat fit]] data {X_enc.shape}")
elif feature_engine == DIRTY_CAT:
else:
logger.info(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}")
logger.debug(f"-Transformers: \n{all_transformers}\n")
logger.debug(
Expand Down Expand Up @@ -1058,7 +1058,7 @@ def process_dirty_dataframes(
t2 = time()
logger.debug("-Fitting Targets --\n%s", y.columns)

if feature_engine == CUDA_CAT:
if feature_engine == "cu_cat":
label_encoder = SuperVectorizer(
auto_cast=True,
cardinality_threshold=cardinality_threshold_target,
Expand Down Expand Up @@ -1486,10 +1486,17 @@ def process_edge_dataframes(
other_df, y
)
# add the two datasets together
if feature_engine == 'pandas':
X_enc = pd.concat([T, X_enc], axis=1)
elif feature_engine == 'cudf':
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
T_type = str(getmodule(T))
X_type = str(getmodule(X_enc))
if 'cudf' in T_type and 'cudf' in X_type:
X_enc = cudf.concat([T, X_enc], axis=1)
elif 'pd' in T_type and 'pd' in X_type:
X_enc = pd.concat([T, X_enc], axis=1)
elif 'cudf' in T_type and 'pd' in X_type:
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
elif 'pd' in T_type and 'cudf' in X_type:
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
# then scale them
X_encs, y_encs, scaling_pipeline, scaling_pipeline_target = smart_scaler( # noqa
X_enc,
Expand Down Expand Up @@ -1556,21 +1563,17 @@ def process_edge_dataframes(
if not X_enc.empty and not T.empty:
logger.debug("-" * 60)
logger.debug("<= Found Edges and Dirty_cat encoding =>")
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
T_type = str(getmodule(T))
X_type = str(getmodule(X_enc))
if 'cudf' in T_type and 'cudf' in X_type:
X_enc = cudf.concat([T, X_enc], axis=1)
elif 'pd' in T_type and 'pd' in X_type:
X_enc = pd.concat([T, X_enc], axis=1)
else:
try:
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
except:
pass
try:
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
except:
pass
elif 'cudf' in T_type and 'pd' in X_type:
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
elif 'pd' in T_type and 'cudf' in X_type:
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
elif not T.empty and X_enc.empty:
logger.debug("-" * 60)
logger.debug("<= Found only Edges =>")
Expand Down Expand Up @@ -1750,7 +1753,18 @@ def transform(

# concat text to dirty_cat, with text in front.
if not tX.empty and not X.empty:
X = pd.concat([tX, X], axis=1)
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
T_type = str(getmodule(tX))
X_type = str(getmodule(X))
if 'cudf' in T_type and 'cudf' in X_type:
X = cudf.concat([tX, X], axis=1)
elif 'pd' in T_type and 'pd' in X_type:
X = pd.concat([tX, X], axis=1)
elif 'cudf' in T_type and 'pd' in X_type:
X = cudf.concat([cudf.from_pandas(tX), X], axis=1)
elif 'pd' in T_type and 'cudf' in X_type:
X = cudf.concat([tX, cudf.from_pandas(X)], axis=1)
# X = pd.concat([tX, X], axis=1)
logger.info("--Combining both Textual and Numeric/Dirty_Cat")
elif not tX.empty and X.empty:
X = tX # textual
Expand All @@ -1765,7 +1779,18 @@ def transform(

# now if edges, add T at front
if kind == "edges":
X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
# X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
T_type = str(getmodule(T))
X_type = str(getmodule(X))
if 'cudf' in T_type and 'cudf' in X_type:
X = cudf.concat([T, X], axis=1)
elif 'pd' in T_type and 'pd' in X_type:
X = pd.concat([T, X], axis=1)
elif 'cudf' in T_type and 'pd' in X_type:
X = cudf.concat([cudf.from_pandas(T), X], axis=1)
elif 'pd' in T_type and 'cudf' in X_type:
X = cudf.concat([T, cudf.from_pandas(X)], axis=1)
logger.info("-Combining MultiLabelBinarizer with previous features")

logger.info("-" * 40)
Expand Down Expand Up @@ -2656,10 +2681,11 @@ def featurize(
"""
feature_engine = resolve_feature_engine(feature_engine)

if feature_engine == 'dirty_cat':
assert_imported_min()
elif feature_engine == 'cu_cat':

if feature_engine == "cu_cat":
assert_imported_cucat()
else:
assert_imported_min()

if inplace:
res = self
Expand Down
43 changes: 0 additions & 43 deletions graphistry/tests/test_feature_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,48 +444,5 @@ def test_edge_scaling(self):
return_scalers=True)


@pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
@pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
class TestFeaturizeGetMethodsCucat(unittest.TestCase):
    """Check that ``get_matrix`` columns match the stored feature/target frames
    when the node table is a cudf DataFrame.

    The skip conditions are declared once on the class: pytest evaluates marks
    on test items and classes, so marks placed on ``setUp`` would not guard it.
    """

    def setUp(self) -> None:
        """Build three graphs from the malware capture CSV: raw, ngram-featurized
        and topic-model-featurized (the latter through the ``cu_cat`` engine)."""
        _, _, cudf = lazy_import_has_dependancy_cudf()
        ndf_malware = pd.read_csv("graphistry/tests/data/malware_capture_bot.csv", index_col=0)
        g = graphistry.nodes(cudf.from_pandas(ndf_malware))

        # ngrams
        g2 = g.featurize(
            y=cudf.from_pandas(double_target_reddit),
            use_ngrams=True,
            ngram_range=(1, 4),
        )

        # topic model
        g3 = g.featurize(**topic_model, feature_engine="cu_cat")

        self.g = g
        self.g2 = g2
        self.g3 = g3

    def test_get_col_matrix(self):
        """``get_matrix`` returns None for edges and full-width frames otherwise."""
        # no edges so this should be None
        assert self.g2.get_matrix(kind='edges') is None

        # Compare column labels as plain lists so a length mismatch fails the
        # assertion instead of raising from an element-wise comparison.

        # test target methods
        assert list(self.g2.get_matrix(target=True).columns) == list(self.g2._node_target.columns)
        # NOTE(review): the per-label selection checks (e.g.
        # get_matrix('Anxiety', target=True)) were commented out in the
        # original and are not exercised here.

        # test feature methods
        # ngrams
        assert list(self.g2.get_matrix().columns) == list(self.g2._node_features.columns)

        # topic
        assert list(self.g3.get_matrix().columns) == list(self.g3._node_features.columns)

# Allow running this test module directly with the unittest runner.
if __name__ == "__main__":
    unittest.main()
25 changes: 20 additions & 5 deletions graphistry/umap_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,12 +728,27 @@ def _bind_xy_from_umap(
emb = res._edge_embedding

if isinstance(df, type(emb)):
df[x_name] = emb.values.T[0]
df[y_name] = emb.values.T[1]
try:
df[x_name] = emb.values.T[0]
df[y_name] = emb.values.T[1]
except:
pass
try:
df[x_name] = emb.values[0]
df[y_name] = emb.values[1]
except:
pass
elif isinstance(df, pd.DataFrame) and 'cudf' in str(getmodule(emb)):
df[x_name] = emb.to_numpy().T[0]
df[y_name] = emb.to_numpy().T[1]

try:
df[x_name] = emb.to_numpy().T[0]
df[y_name] = emb.to_numpy().T[1]
except:
pass
try:
df[x_name] = emb.to_numpy()[0]
df[y_name] = emb.to_numpy()[1]
except:
pass
res = res.nodes(df) if kind == "nodes" else res.edges(df)

if encode_weight and kind == "nodes":
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def unique_flatten_dict(d):
# https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']

base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat @ git+http://github.com/graphistry/[email protected]']
base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat']

base_extras = {**base_extras_light, **base_extras_heavy}

Expand Down

0 comments on commit cdda3e7

Please sign in to comment.