-
Notifications
You must be signed in to change notification settings - Fork 209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cucat Featurization base #486
base: master
Are you sure you want to change the base?
Changes from 3 commits
cf07249
d73a2db
382e18b
44200ac
777afd4
c1bc6f1
48e4017
6b0b52b
e4b0c0a
7c0c0c6
f344dd8
b6f6388
f185a2f
410c40d
b9067c0
8f0bc3a
b7b8e63
e8eb85a
501ff3b
918ebee
ccf6f47
60de1cf
0b66776
9f086c8
78015f1
616009b
dc38d3b
376890e
b9828c5
8d13cbe
92769bf
af0fc8a
4f78b76
b8a0db2
6e11117
b0d36cd
5677bea
8e15e5e
20200d6
26cd39c
c4c1bd8
d889581
48a7308
ba25c89
8a0ab5c
151ab5b
d63d729
ada126e
21a475d
49976e8
f24411e
d303afb
b34ee85
2456b70
8fc0b22
ee6c523
4808428
a22e85e
86fc662
614fff4
b88e3ea
a72d4b1
4eef71c
0522981
73ba5d1
9da0b11
f9e9260
58d1461
d5acc1a
614d9f3
31b5f5e
bc4f290
74a2460
624c721
2fc6be5
178adba
90bd8b7
5d16a9e
e931456
fc212a8
d4b1fbe
498a4de
aab2ad9
f0eb1bf
867874d
5a69233
cdda3e7
63398b3
b720bc1
ed824ec
1735134
824d940
c7ce92c
30a04a4
50df365
a654f9f
a86be5c
4bd056c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,16 @@ | |
SuperVectorizer = Any | ||
GapEncoder = Any | ||
SimilarityEncoder = Any | ||
try: | ||
from cu_cat import ( | ||
SuperVectorizer, | ||
GapEncoder, | ||
SimilarityEncoder, | ||
) # type: ignore | ||
except: | ||
SuperVectorizer = Any | ||
GapEncoder = Any | ||
SimilarityEncoder = Any | ||
try: | ||
from sklearn.preprocessing import FunctionTransformer | ||
from sklearn.base import BaseEstimator, TransformerMixin | ||
|
@@ -93,6 +103,28 @@ def lazy_import_has_min_dependancy(): | |
except ModuleNotFoundError as e: | ||
return False, e | ||
|
||
def lazy_import_has_dependancy_cu_cat(): | ||
import warnings | ||
warnings.filterwarnings("ignore") | ||
try: | ||
import scipy.sparse # noqa | ||
from scipy import __version__ as scipy_version | ||
from cu_cat import __version__ as cu_cat_version | ||
import cu_cat | ||
from sklearn import __version__ as sklearn_version | ||
from cuml import __version__ as cuml_version | ||
import cuml | ||
from cudf import __version__ as cudf_version | ||
import cudf | ||
logger.debug(f"SCIPY VERSION: {scipy_version}") | ||
logger.debug(f"Cuda CAT VERSION: {cu_cat_version}") | ||
logger.debug(f"sklearn VERSION: {sklearn_version}") | ||
logger.debug(f"cuml VERSION: {cuml_version}") | ||
logger.debug(f"cudf VERSION: {cudf_version}") | ||
return True, 'ok', cudf | ||
except ModuleNotFoundError as e: | ||
return False, e, None | ||
|
||
|
||
def assert_imported_text(): | ||
has_dependancy_text_, import_text_exn, _ = lazy_import_has_dependancy_text() | ||
|
@@ -114,6 +146,33 @@ def assert_imported(): | |
raise import_min_exn | ||
|
||
|
||
def assert_cuml_cucat(): | ||
has_cuml_dependancy_, import_cuml_exn, cudf = lazy_import_has_dependancy_cu_cat() | ||
if not has_cuml_dependancy_: | ||
logger.error( # noqa | ||
"cuml not found, trying running" # noqa | ||
"`pip install rapids`" # noqa | ||
) | ||
raise import_cuml_exn | ||
|
||
|
||
def make_safe_gpu_dataframes(X, y, engine): | ||
has_cudf_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
if has_cudf_dependancy_: | ||
new_kwargs = {} | ||
kwargs = {'X': X, 'y': y} | ||
for key, value in kwargs.items(): | ||
if isinstance(value, cudf.DataFrame) and engine in ["pandas", "dirty_cat", "torch"]: | ||
new_kwargs[key] = value.to_pandas() | ||
elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]: | ||
new_kwargs[key] = cudf.from_pandas(value) | ||
else: | ||
new_kwargs[key] = value | ||
return new_kwargs['X'], new_kwargs['y'] | ||
else: | ||
return X, y | ||
|
||
|
||
# ############################################################################ | ||
# | ||
# Rough calltree | ||
|
@@ -137,29 +196,32 @@ def assert_imported(): | |
# | ||
# _featurize_or_get_edges_dataframe_if_X_is_None | ||
|
||
FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch"] | ||
FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch", "cu_cat"] | ||
FeatureEngine = Literal[FeatureEngineConcrete, "auto"] | ||
|
||
|
||
def resolve_feature_engine( | ||
feature_engine: FeatureEngine, | ||
) -> FeatureEngineConcrete: # noqa | ||
|
||
if feature_engine in ["none", "pandas", "dirty_cat", "torch"]: | ||
if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]: | ||
return feature_engine # type: ignore | ||
|
||
if feature_engine == "auto": | ||
has_dependancy_text_, _, _ = lazy_import_has_dependancy_text() | ||
if has_dependancy_text_: | ||
return "torch" | ||
has_cuml_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
if has_cuml_dependancy_: | ||
return "cu_cat" | ||
has_min_dependancy_, _ = lazy_import_has_min_dependancy() | ||
if has_min_dependancy_: | ||
return "dirty_cat" | ||
return "pandas" | ||
|
||
raise ValueError( # noqa | ||
f'feature_engine expected to be "none", ' | ||
'"pandas", "dirty_cat", "torch", or "auto"' | ||
'"pandas", "dirty_cat", "torch", "cu_cat", or "auto"' | ||
f'but received: {feature_engine} :: {type(feature_engine)}' | ||
) | ||
|
||
|
@@ -230,18 +292,19 @@ def features_without_target( | |
:param y: target DataFrame | ||
:return: DataFrames of model and target | ||
""" | ||
_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
if y is None: | ||
return df | ||
remove_cols = [] | ||
if y is None: | ||
pass | ||
elif isinstance(y, pd.DataFrame): | ||
elif isinstance(y, pd.DataFrame) or isinstance(y, cudf.DataFrame): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. great catch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dcolinmorgan There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe same problem elsewhere? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If cudf is None, I think the current would throw an exn |
||
yc = y.columns | ||
xc = df.columns | ||
for c in yc: | ||
if c in xc: | ||
remove_cols.append(c) | ||
elif isinstance(y, pd.Series): | ||
elif isinstance(y, pd.Series) or isinstance(y, cudf.Series): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. handle non-cu_cat import returning |
||
if y.name and (y.name in df.columns): | ||
remove_cols = [y.name] | ||
elif isinstance(y, List): | ||
|
@@ -265,7 +328,7 @@ def remove_node_column_from_symbolic(X_symbolic, node): | |
logger.info(f"Removing `{node}` from input X_symbolic list") | ||
X_symbolic.remove(node) | ||
return X_symbolic | ||
if isinstance(X_symbolic, pd.DataFrame): | ||
if isinstance(X_symbolic, pd.DataFrame) or 'cudf' in str(getmodule(X_symbolic)): | ||
logger.info(f"Removing `{node}` from input X_symbolic DataFrame") | ||
return X_symbolic.drop(columns=[node], errors="ignore") | ||
|
||
|
@@ -619,11 +682,19 @@ def fit_pipeline( | |
columns = X.columns | ||
index = X.index | ||
|
||
X = transformer.fit_transform(X) | ||
if keep_n_decimals: | ||
X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
|
||
return pd.DataFrame(X, columns=columns, index=index) | ||
X_type = str(getmodule(X)) | ||
if 'cudf' not in X_type: | ||
X = transformer.fit_transform(X) | ||
if keep_n_decimals: | ||
X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
X = pd.DataFrame(X, columns=columns, index=index) | ||
else: | ||
X = transformer.fit_transform(X.to_numpy()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how do we know if the transformer is cpu vs gpu? it seems to always assume cpu here, but if X is cudf and transformer is gpu, can't we keep X on gpu? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a sometimes-ok soln would be checking transformer for being from cuml or maybe cu_cat, but that seems non-generalizable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wow this nearly gave me a heart attack -- good thoughts i will work with... but this is an artifact, no |
||
if keep_n_decimals: | ||
X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good practice to assert even after if statement check? good to know, yay learning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's useful when you don't want the 'if' but can imagine future changes or misuses accidentally getting the assumption wrong |
||
X = cudf.DataFrame(X, columns=columns, index=index) | ||
return X | ||
|
||
|
||
def impute_and_scale_df( | ||
|
@@ -848,6 +919,7 @@ def process_dirty_dataframes( | |
similarity: Optional[str] = None, # "ngram", | ||
categories: Optional[str] = "auto", | ||
multilabel: bool = False, | ||
feature_engine: Optional[str] = "dirty_cat", | ||
lmeyerov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) -> Tuple[ | ||
pd.DataFrame, | ||
Optional[pd.DataFrame], | ||
|
@@ -873,8 +945,16 @@ def process_dirty_dataframes( | |
:return: Encoded data matrix and target (if not None), | ||
the data encoder, and the label encoder. | ||
""" | ||
from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
from sklearn.preprocessing import FunctionTransformer | ||
|
||
if feature_engine == 'cu_cat': | ||
lazy_import_has_dependancy_cu_cat() | ||
from cu_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
from cuml.preprocessing import FunctionTransformer | ||
|
||
else: | ||
from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
from sklearn.preprocessing import FunctionTransformer | ||
|
||
t = time() | ||
|
||
if not is_dataframe_all_numeric(ndf): | ||
|
@@ -911,12 +991,19 @@ def process_dirty_dataframes( | |
) | ||
# now just set the feature names, since dirty cat changes them in | ||
# a weird way... | ||
data_encoder.get_feature_names_out = callThrough(features_transformed) | ||
|
||
X_enc = pd.DataFrame( | ||
X_enc, columns=features_transformed, index=ndf.index | ||
) | ||
X_enc = X_enc.fillna(0.0) | ||
data_encoder.get_feature_names_out = callThrough(features_transformed) | ||
if 'cudf' not in str(getmodule(ndf)): | ||
X_enc = pd.DataFrame( | ||
X_enc, columns=features_transformed, index=ndf.index | ||
) | ||
X_enc = X_enc.fillna(0.0) | ||
else: | ||
_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if cudf is passed in but cu_cat is not installed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (and are there tests for this?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in general you mean? this gpu/feat branch was an attempt to handle this dilemma i think in future if cudf is passed as general engine it should initiate cucat and we remove feature_engine flag; also e.g. if cucat + umap-learn passed -- just to bypass user selection error / bad combinations There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes Tanmoy is out for a bit unexpectedly, so that would help And yes we need to handle multiple envs & multiple user prefs in them, so we can do smart defaults + allow overrides . Makes automated testing here important bc so tricky ... |
||
X_enc = cudf.DataFrame( | ||
X_enc, columns=features_transformed, index=ndf.index | ||
) | ||
X_enc = X_enc.fillna(0.0).to_pandas() # will be removed for future cu_cat release | ||
|
||
else: | ||
logger.info("-*-*- DataFrame is completely numeric") | ||
X_enc, _, data_encoder, _ = get_numeric_transformers(ndf, None) | ||
|
@@ -1117,7 +1204,8 @@ def process_nodes_dataframes( | |
n_topics_target=n_topics_target, | ||
similarity=similarity, | ||
categories=categories, | ||
multilabel=multilabel | ||
multilabel=multilabel, | ||
feature_engine=feature_engine, | ||
) | ||
|
||
if embedding: | ||
|
@@ -1235,20 +1323,31 @@ def encode_edges(edf, src, dst, mlb, fit=False): | |
""" | ||
# uses mlb with fit=T/F so we can use it in transform mode | ||
# to recreate edge feature concat definition | ||
edf_type = str(getmodule(edf)) | ||
source = edf[src] | ||
destination = edf[dst] | ||
source_dtype = str(getmodule(source)) | ||
logger.debug("Encoding Edges using MultiLabelBinarizer") | ||
if fit: | ||
if fit and 'cudf' not in source_dtype: | ||
T = mlb.fit_transform(zip(source, destination)) | ||
else: | ||
elif fit and 'cudf' in source_dtype: | ||
T = mlb.fit_transform(zip(source.to_pandas(), destination.to_pandas())) | ||
elif not fit and 'cudf' not in source_dtype: | ||
T = mlb.transform(zip(source, destination)) | ||
elif not fit and 'cudf' in source_dtype: | ||
T = mlb.transform(zip(source.to_pandas(), destination.to_pandas())) | ||
|
||
T = 1.0 * T # coerce to float | ||
columns = [ | ||
str(k) for k in mlb.classes_ | ||
] # stringify the column names or scikits.base throws error | ||
mlb.get_feature_names_out = callThrough(columns) | ||
mlb.columns_ = [src, dst] | ||
T = pd.DataFrame(T, columns=columns, index=edf.index) | ||
if 'cudf' in edf_type: | ||
_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
T = cudf.DataFrame(T, columns=columns, index=edf.index) | ||
else: | ||
T = pd.DataFrame(T, columns=columns, index=edf.index) | ||
logger.info(f"Shape of Edge Encoding: {T.shape}") | ||
return T, mlb | ||
|
||
|
@@ -1321,6 +1420,7 @@ def process_edge_dataframes( | |
MultiLabelBinarizer() | ||
) # create new one so we can use encode_edges later in | ||
# transform with fit=False | ||
_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
T, mlb_pairwise_edge_encoder = encode_edges( | ||
edf, src, dst, mlb_pairwise_edge_encoder, fit=True | ||
) | ||
|
@@ -1406,7 +1506,11 @@ def process_edge_dataframes( | |
if not X_enc.empty and not T.empty: | ||
logger.debug("-" * 60) | ||
logger.debug("<= Found Edges and Dirty_cat encoding =>") | ||
X_enc = pd.concat([T, X_enc], axis=1) | ||
T_type = str(getmodule(T)) | ||
if 'cudf' in T_type: | ||
X_enc = cudf.concat([T, X_enc], axis=1) | ||
else: | ||
X_enc = pd.concat([T, X_enc], axis=1) | ||
elif not T.empty and X_enc.empty: | ||
logger.debug("-" * 60) | ||
logger.debug("<= Found only Edges =>") | ||
|
@@ -1811,7 +1915,7 @@ def prune_weighted_edges_df_and_relabel_nodes( | |
" -- Pruning weighted edge DataFrame " | ||
f"from {len(wdf):,} to {len(wdf2):,} edges." | ||
) | ||
if index_to_nodes_dict is not None: | ||
if index_to_nodes_dict is not None and type(index_to_nodes_dict) == dict: | ||
wdf2[config.SRC] = wdf2[config.SRC].map(index_to_nodes_dict) | ||
wdf2[config.DST] = wdf2[config.DST].map(index_to_nodes_dict) | ||
return wdf2 | ||
|
@@ -1952,7 +2056,8 @@ def _featurize_nodes( | |
X_resolved = resolve_X(ndf, X) | ||
y_resolved = resolve_y(ndf, y) | ||
|
||
feature_engine = resolve_feature_engine(feature_engine) | ||
res.feature_engine = feature_engine | ||
dcolinmorgan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine) | ||
|
||
from .features import ModelDict | ||
|
||
|
@@ -2076,6 +2181,9 @@ def _featurize_edges( | |
**{res._destination: res._edges[res._destination]} | ||
) | ||
|
||
res.feature_engine = feature_engine | ||
dcolinmorgan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine) | ||
|
||
# now that everything is set | ||
fkwargs = dict( | ||
X=X_resolved, | ||
|
@@ -2487,13 +2595,18 @@ def featurize( | |
default True. | ||
:return: graphistry instance with new attributes set by the featurization process. | ||
""" | ||
assert_imported() | ||
feature_engine = resolve_feature_engine(feature_engine) | ||
|
||
if feature_engine == 'dirty_cat': | ||
assert_imported() | ||
dcolinmorgan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
elif feature_engine == 'cu_cat': | ||
assert_cuml_cucat() | ||
|
||
if inplace: | ||
res = self | ||
else: | ||
res = self.bind() | ||
|
||
feature_engine = resolve_feature_engine(feature_engine) | ||
|
||
if kind == "nodes": | ||
res = res._featurize_nodes( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add
assert cudf is not None
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also probably good to switch from
lazy_import...cu_cat
to acudf
oneThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok -- after the if statement seems best here again like other assert you mentioned