diff --git a/docs/source/graphistry.rst b/docs/source/graphistry.rst index c9fbcfa4dd..a6fdf5cf15 100644 --- a/docs/source/graphistry.rst +++ b/docs/source/graphistry.rst @@ -1,42 +1,81 @@ -graphistry package +Layout & Plugins ================== .. toctree:: :maxdepth: 3 - graphistry.compute + graphistry.layout graphistry.plugins graphistry.plugins_types -graphistry.plotter module -------------------------- +Plotter Module +================== -.. automodule:: graphistry.plotter +.. automodule:: graphistry.PlotterBase :members: :undoc-members: :show-inheritance: -graphistry.pygraphistry module ------------------------------- +Pygraphistry Module +================== .. automodule:: graphistry.pygraphistry :members: :undoc-members: :show-inheritance: -graphistry.arrow_uploader module --------------------------------- +Featurize +================== +.. automodule:: graphistry.feature_utils + :members: + :undoc-members: + :show-inheritance: + + +UMAP +================== +.. automodule:: graphistry.umap_utils + :members: + :undoc-members: + :show-inheritance: + + +Semantic Search +================== +.. automodule:: graphistry.text_utils + :members: + :undoc-members: + :show-inheritance: + +DBScan +================== +.. automodule:: graphistry.compute.cluster + :members: + :undoc-members: + :show-inheritance: + +Arrow uploader Module +================== .. automodule:: graphistry.arrow_uploader :members: :undoc-members: :show-inheritance: -graphistry.ArrowFileUploader module ------------------------------------ +Arrow File Uploader Module +================== .. automodule:: graphistry.ArrowFileUploader :members: :undoc-members: :show-inheritance: + +Versioneer +================== + +.. 
automodule:: graphistry._version + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/index.rst b/docs/source/index.rst index 1943a5cf72..b45393c266 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,8 +1,11 @@ -PyGraphistry's documentation (|version|) +PyGraphistry[ai]'s documentation ======================================== -Quickstart: -`Read our tutorial `_ +.. Quickstart: +.. `Read our tutorial `_ + +PyGraphistry is a Python visual graph AI library to extract, transform, analyze, model, and visualize big graphs, and especially alongside Graphistry end-to-end GPU server sessions. Installing optional graphistry[ai] dependencies adds graph autoML, including automatic feature engineering, UMAP, and graph neural net support. Combined, PyGraphistry reduces your time to graph for going from raw data to visualizations and AI models down to three lines of code. +Here in our docstrings you can find useful packages, modules, and commands to maximize your graph AI experience with PyGraphistry. In the navbar you can find an overview of all the packages and modules we provided and a few useful highlighted ones as well. You can search for them on our Search page. For a full tutorial, refer to our `PyGraphistry `_ repo. .. toctree:: :maxdepth: 3 diff --git a/docs/source/modules.rst b/docs/source/modules.rst index 2d0d70fd92..71e0a12335 100644 --- a/docs/source/modules.rst +++ b/docs/source/modules.rst @@ -1,9 +1,9 @@ -doc -=== +.. doc +.. === -.. toctree:: - :maxdepth: 4 - :caption: Contents: +.. .. toctree:: +.. :maxdepth: 4 +.. :caption: Contents: - versioneer +.. versioneer diff --git a/docs/source/versioneer.rst b/docs/source/versioneer.rst index 804c171da3..a34edfc48d 100644 --- a/docs/source/versioneer.rst +++ b/docs/source/versioneer.rst @@ -1,2 +1,2 @@ -versioneer module -================= +.. versioneer module +.. 
================= diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 7b7d0604d0..badb060b19 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -300,7 +300,7 @@ def style(self, fg=None, bg=None, page=None, logo=None): :param fg: Dictionary {'blendMode': str} of any valid CSS blend mode :type fg: dict - :param bg: Nested dictionary of page background properties. {'color': str, 'gradient': {'kind': str, 'position': str, 'stops': list }, 'image': { 'url': str, 'width': int, 'height': int, 'blendMode': str } + :param bg: Nested dictionary of page background properties. { 'color': str, 'gradient': {'kind': str, 'position': str, 'stops': list }, 'image': { 'url': str, 'width': int, 'height': int, 'blendMode': str } :type bg: dict :param logo: Nested dictionary of logo properties. { 'url': str, 'autoInvert': bool, 'position': str, 'dimensions': { 'maxWidth': int, 'maxHeight': int }, 'crop': { 'top': int, 'left': int, 'bottom': int, 'right': int }, 'padding': { 'top': int, 'left': int, 'bottom': int, 'right': int}, 'style': str} @@ -314,15 +314,18 @@ def style(self, fg=None, bg=None, page=None, logo=None): **Example: Chained merge - results in url and blendMode being set, while color is dropped** :: + g2 = g.style(bg={'color': 'black'}, fg={'blendMode': 'screen'}) g3 = g2.style(bg={'image': {'url': 'http://site.com/watermark.png'}}) **Example: Gradient background** :: + g.style(bg={'gradient': {'kind': 'linear', 'position': 45, 'stops': [['rgb(0,0,0)', '0%'], ['rgb(255,255,255)', '100%']]}}) **Example: Page settings** :: + g.style(page={'title': 'Site - {{ name }}', 'favicon': 'http://site.com/logo.ico'}) """ @@ -850,13 +853,14 @@ def bind(self, source=None, destination=None, node=None, edge=None, :param edge: Attribute containing an edge's ID :type edge: str - :param edge_title: Attribute overriding edge's minimized label text. By default, the edge source and destination is used. 
+ :param edge_title: Attribute overriding edge's minimized label text. + By default, the edge source and destination is used. :type edge_title: str :param edge_label: Attribute overriding edge's expanded label text. By default, scrollable list of attribute/value mappings. :type edge_label: str - :param edge_color: Attribute overriding edge's color. rgba (int64) or int32 palette index, see palette definitions `_ for values. Based on Color Brewer. + :param edge_color: Attribute overriding edge's color. rgba (int64) or int32 palette index, see `palette `_ definitions for values. Based on Color Brewer. :type edge_color: str :param edge_source_color: Attribute overriding edge's source color if no edge_color, as an rgba int64 value. @@ -874,7 +878,7 @@ def bind(self, source=None, destination=None, node=None, edge=None, :param point_label: Attribute overriding node's expanded label text. By default, scrollable list of attribute/value mappings. :type point_label: str - :param point_color: Attribute overriding node's color.rgba (int64) or int32 palette index, see palette definitions `_ for values. Based on Color Brewer. + :param point_color: Attribute overriding node's color.rgba (int64) or int32 palette index, see `palette `_ definitions for values. Based on Color Brewer. :type point_color: str :param point_size: Attribute overriding node's size. By default, uses the node degree. The visualization will normalize point sizes and adjust dynamically using semantic zoom. 
@@ -1007,6 +1011,7 @@ def nodes(self, nodes: Union[Callable, Any], node=None, *args, **kwargs) -> Plot **Example** :: + import graphistry def sample_nodes(g, n): @@ -1106,6 +1111,7 @@ def edges(self, edges: Union[Callable, Any], source=None, destination=None, edge **Example** :: + import graphistry def sample_edges(g, n): diff --git a/graphistry/compute/cluster.py b/graphistry/compute/cluster.py index 15b7cf0ed3..f19fbfbe38 100644 --- a/graphistry/compute/cluster.py +++ b/graphistry/compute/cluster.py @@ -71,11 +71,11 @@ def get_model_matrix(g, kind: str, cols: Optional[Union[List, str]], umap, targe Allows for a single function to get the model matrix for both nodes and edges as well as targets, embeddings, and features Args: - g: graphistry graph - kind: 'nodes' or 'edges' - cols: list of columns to use for clustering given `g.featurize` has been run - umap: whether to use UMAP embeddings or features dataframe - target: whether to use the target dataframe or features dataframe + :g: graphistry graph + :kind: 'nodes' or 'edges' + :cols: list of columns to use for clustering given `g.featurize` has been run + :umap: whether to use UMAP embeddings or features dataframe + :target: whether to use the target dataframe or features dataframe Returns: pd.DataFrame: dataframe of model matrix given the inputs @@ -99,11 +99,11 @@ def dbscan_fit(g: Any, dbscan: Any, kind: str = "nodes", cols: Optional[Union[Li Fits clustering on UMAP embeddings if umap is True, otherwise on the features dataframe or target dataframe if target is True. 
- args: - g: graphistry graph - kind: 'nodes' or 'edges' - cols: list of columns to use for clustering given `g.featurize` has been run - use_umap_embedding: whether to use UMAP embeddings or features dataframe for clustering (default: True) + Args: + :g: graphistry graph + :kind: 'nodes' or 'edges' + :cols: list of columns to use for clustering given `g.featurize` has been run + :use_umap_embedding: whether to use UMAP embeddings or features dataframe for clustering (default: True) """ X = get_model_matrix(g, kind, cols, use_umap_embedding, target) @@ -212,6 +212,8 @@ def dbscan( """DBSCAN clustering on cpu or gpu infered automatically. Adds a `_dbscan` column to nodes or edges. Examples: + :: + g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node') # cluster by UMAP embeddings @@ -244,14 +246,14 @@ def dbscan( https://github.com/graphistry/pygraphistry/blob/master/demos/ai/cyber/cyber-redteam-umap-demo.ipynb Args: - min_dist float: The maximum distance between two samples for them to be considered as in the same neighborhood. - kind str: 'nodes' or 'edges' - cols: list of columns to use for clustering given `g.featurize` has been run, nice way to slice features or targets by + :min_dist float: The maximum distance between two samples for them to be considered as in the same neighborhood. + :kind str: 'nodes' or 'edges' + :cols: list of columns to use for clustering given `g.featurize` has been run, nice way to slice features or targets by fragments of interest, e.g. ['ip_172', 'location', 'ssh', 'warnings'] - fit_umap_embedding bool: whether to use UMAP embeddings or features dataframe to cluster DBSCAN - min_samples: The number of samples in a neighborhood for a point to be considered as a core point. + :fit_umap_embedding bool: whether to use UMAP embeddings or features dataframe to cluster DBSCAN + :min_samples: The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself. 
- target: whether to use the target column as the clustering feature + :target: whether to use the target column as the clustering feature """ @@ -333,43 +335,51 @@ def transform_dbscan( Graph nodes | edges will be colored by '_dbscan' column. Examples: + :: + fit: g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node') g2 = g.featurize().dbscan() predict: + :: + emb, X, _, ndf = g2.transform_dbscan(ndf, return_graph=False) # or g3 = g2.transform_dbscan(ndf, return_graph=True) g3.plot() likewise for umap: + :: + fit: g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node') g2 = g.umap(X=.., y=..).dbscan() predict: + :: + emb, X, y, ndf = g2.transform_dbscan(ndf, ndf, return_graph=False) # or g3 = g2.transform_dbscan(ndf, ndf, return_graph=True) g3.plot() - args: - df: dataframe to transform - y: optional labels dataframe - min_dist: The maximum distance between two samples for them to be considered as in the same neighborhood. + Args: + :df: dataframe to transform + :y: optional labels dataframe + :min_dist: The maximum distance between two samples for them to be considered as in the same neighborhood. smaller values will result in less edges between the minibatch and the original graph. Default 'auto', infers min_dist from the mean distance and std of new points to the original graph - fit_umap_embedding: whether to use UMAP embeddings or features dataframe when inferring edges between + :fit_umap_embedding: whether to use UMAP embeddings or features dataframe when inferring edges between the minibatch and the original graph. Default False, uses the features dataframe - sample: number of samples to use when inferring edges between the minibatch and the original graph, + :sample: number of samples to use when inferring edges between the minibatch and the original graph, if None, will only use closest point to the minibatch. If greater than 0, will sample the closest `sample` points in existing graph to pull in more edges. 
Default None - kind: 'nodes' or 'edges' - return_graph: whether to return a graph or the (emb, X, y, minibatch df enriched with DBSCAN labels), default True + :kind: 'nodes' or 'edges' + :return_graph: whether to return a graph or the (emb, X, y, minibatch df enriched with DBSCAN labels), default True infered graph supports kind='nodes' only. - verbose: whether to print out progress, default False + :verbose: whether to print out progress, default False """ emb, X, y, df = self._transform_dbscan(df, y, kind=kind, verbose=verbose) diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index c9b4a9174c..a068ddeb7b 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -499,7 +499,6 @@ class Embedding: """ Generates random embeddings of a given dimension that aligns with the index of the dataframe - _____________________________________________________________________ """ def __init__(self, df: pd.DataFrame): @@ -628,10 +627,9 @@ def fit_pipeline( which helps for when transformer pipeline is scaling or imputer which sometime introduce small negative numbers, and umap metrics like Hellinger need to be positive - :param X, DataFrame to transform. + :param X: DataFrame to transform. :param transformer: Pipeline object to fit and transform - :param keep_n_decimals: Int of how many decimal places to keep in - rounded transformed data + :param keep_n_decimals: Int of how many decimal places to keep in rounded transformed data """ columns = X.columns index = X.index @@ -1754,10 +1752,11 @@ def fit_transform(self, src=None, dst=None, *args, **kwargs): def scale(self, X=None, y=None, return_pipeline=False, *args, **kwargs): """Fits new scaling functions on df, y via args-kwargs - example: + **Example:** + :: + from graphisty.features import SCALERS, SCALER_OPTIONS print(SCALERS) - g = graphistry.nodes(df) # set a scaling strategy for features and targets -- umap uses those and produces different results depending. 
g2 = g.umap(use_scaler='standard', use_scaler_target=None) @@ -1770,9 +1769,12 @@ def scale(self, X=None, y=None, return_pipeline=False, *args, **kwargs): clf.fit(X_scaled, y_scaled) args: - X: pd.DataFrame of features - y: pd.DataFrame of target features - kind: str, one of 'nodes' or 'edges' + :: + + + :X: pd.DataFrame of features + :y: pd.DataFrame of target features + :kind: str, one of 'nodes' or 'edges' *args, **kwargs: passed to smart_scaler pipeline returns: scaled X, y @@ -1880,14 +1882,20 @@ class FeatureMixin(MIXIN_BASE): Subclasses UMAPMixin for umap-ing of automatic features. Usage: + :: + g = graphistry.nodes(df, 'node_column') g2 = g.featurize() or for edges, + :: + g = graphistry.edges(df, 'src', 'dst') g2 = g.featurize(kind='edges') or chain them for both nodes and edges, + :: + g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node_column') g2 = g.featurize().featurize(kind='edges') @@ -2202,21 +2210,21 @@ def transform(self, df: pd.DataFrame, """ Transform new data and append to existing graph, or return dataframes - args: - df: pd.DataFrame, raw data to transform - ydf: pd.DataFrame, optional - kind: str # one of `nodes`, `edges` - return_graph: bool, if True, will return a graph with inferred edges. - merge_policy: bool, if True, adds batch to existing graph nodes via nearest neighbors. - If False, will infer edges only between nodes in the batch, default False - min_dist: float, if return_graph is True, will use this value in NN search, or 'auto' to infer a good value - min_dist represents the maximum distance between two samples for one to be considered as in the neighborhood of the other. 
- sample: int, if return_graph is True, will use sample edges of existing graph to fill out the new graph - n_neighbors: int, if return_graph is True, will use this value for n_neighbors in Nearest Neighbors search - scaled: bool, if True, will use scaled transformation of data set during featurization, default True - verbose: bool, if True, will print metadata about the graph construction, default False - returns: - X, y: pd.DataFrame, transformed data if return_graph is False + **args:** + + :df: pd.DataFrame, raw data to transform + :ydf: pd.DataFrame, optional + :kind: str # one of `nodes`, `edges` + :return_graph: bool, if True, will return a graph with inferred edges. + :merge_policy: bool, if True, adds batch to existing graph nodes via nearest neighbors. If False, will infer edges only between nodes in the batch, default False + :min_dist: float, if return_graph is True, will use this value in NN search, or 'auto' to infer a good value. min_dist represents the maximum distance between two samples for one to be considered as in the neighborhood of the other. + :sample: int, if return_graph is True, will use sample edges of existing graph to fill out the new graph + :n_neighbors: int, if return_graph is True, will use this value for n_neighbors in Nearest Neighbors search + :scaled: bool, if True, will use scaled transformation of data set during featurization, default True + :verbose: bool, if True, will print metadata about the graph construction, default False + **Returns:** + + X, y: pd.DataFrame, transformed data if return_graph is False or a graphistry Plottable with inferred edges if return_graph is True """ if kind == "nodes": @@ -2255,7 +2263,9 @@ def scale( ): """Scale data using the same scalers as used in the featurization step. 
- example usage: + **Example** + :: + g = graphistry.nodes(df) X, y = g.featurize().scale(kind='nodes', use_scaler='robust', use_scaler_target='kbins', n_bins=3) @@ -2271,25 +2281,28 @@ def scale( clf.fit(X_scaled, y_scaled) - args: - df: pd.DataFrame, raw data to transform, if None, will use data from featurization fit - y: pd.DataFrame, optional target data - kind: str, one of `nodes`, `edges` - use_scaler: str, optional, one of `minmax`, `robust`, `standard`, `kbins`, `quantile` - use_scaler_target: str, optional, one of `minmax`, `robust`, `standard`, `kbins`, `quantile` - impute: bool, if True, will impute missing values - n_quantiles: int, number of quantiles to use for quantile scaler - output_distribution: str, one of `normal`, `uniform`, `lognormal` - quantile_range: tuple, range of quantiles to use for quantile scaler - n_bins: int, number of bins to use for KBinsDiscretizer - encode: str, one of `ordinal`, `onehot`, `onehot-dense`, `binary` - strategy: str, one of `uniform`, `quantile`, `kmeans` - keep_n_decimals: int, number of decimals to keep after scaling - return_scalers: bool, if True, will return the scalers used to scale the data - returns: - (X, y) transformed data if return_graph is False + **Args:** + + :df: pd.DataFrame, raw data to transform, if None, will use data from featurization fit + :y: pd.DataFrame, optional target data + :kind: str, one of `nodes`, `edges` + :use_scaler: str, optional, one of `minmax`, `robust`, `standard`, `kbins`, `quantile` + :use_scaler_target: str, optional, one of `minmax`, `robust`, `standard`, `kbins`, `quantile` + :impute: bool, if True, will impute missing values + :n_quantiles: int, number of quantiles to use for quantile scaler + :output_distribution: str, one of `normal`, `uniform`, `lognormal` + :quantile_range: tuple, range of quantiles to use for quantile scaler + :n_bins: int, number of bins to use for KBinsDiscretizer + :encode: str, one of `ordinal`, `onehot`, `onehot-dense`, `binary` + :strategy: 
str, one of `uniform`, `quantile`, `kmeans` + :keep_n_decimals: int, number of decimals to keep after scaling + :return_scalers: bool, if True, will return the scalers used to scale the data + + **Returns:** + + (X, y) transformed data if return_graph is False or a graph with inferred edges if return_graph is True, - or (X, y, scaler, scaler_target) if return_scalers is True + or (X, y, scaler, scaler_target) if return_scalers is True """ if df is None: # use the original data @@ -2774,7 +2787,8 @@ def _featurize_or_get_edges_dataframe_if_X_is_None( def get_matrix(self, columns: Optional[Union[List, str]] = None, kind: str = 'nodes', target: bool = False) -> pd.DataFrame: - """Returns feature matrix, and if columns are specified, returns matrix with only the columns that contain + """ + Returns feature matrix, and if columns are specified, returns matrix with only the columns that contain the string `column_part` in their name. `X = g.get_matrix(['feature1', 'feature2'])` @@ -2786,7 +2800,9 @@ def get_matrix(self, columns: Optional[Union[List, str]] = None, kind: str = 'no Powerful way to retrieve features from a featurized graph by column or (top) features of interest. - example: + **Example:** + :: + # get the full feature matrices X = g.get_matrix() y = g.get_matrix(target=True) @@ -2804,10 +2820,10 @@ def get_matrix(self, columns: Optional[Union[List, str]] = None, kind: str = 'no Caveats: - if you have a column name that is a substring of another column name, you may get unexpected results. Args: - columns (Union[List, str]): list of column names or a single column name that may exist in columns + :columns (Union[List, str]): list of column names or a single column name that may exist in columns of the feature matrix. If None, returns original feature matrix - kind (str, optional): Node or Edge features. Defaults to 'nodes'. - target (bool, optional): If True, returns the target matrix. Defaults to False. + :kind (str, optional): Node or Edge features. 
Defaults to 'nodes'. + :target (bool, optional): If True, returns the target matrix. Defaults to False. Returns: pd.DataFrame: feature matrix with only the columns that contain the string `column_part` in their name. diff --git a/graphistry/layout/graph/graph.py b/graphistry/layout/graph/graph.py index b04dd4842e..4cbeac9182 100644 --- a/graphistry/layout/graph/graph.py +++ b/graphistry/layout/graph/graph.py @@ -8,31 +8,55 @@ class Graph(object): - """ - The graph is stored in disjoint-sets holding each connected component in `components` as a list of graph_core objects. - - **Attributes** - C (list[GraphBase]): list of graph_core components. - - **Methods** - add_vertex(v): add vertex v into the Graph as a new component - add_edge(e): add edge e and its vertices into the Graph possibly merging the - associated graph_core components - get_vertices_count(): see order() - vertices(): see graph_core - edges(): see graph_core - remove_edge(e): remove edge e possibly spawning two new cores - if the graph_core that contained e gets disconnected. - remove_vertex(v): remove vertex v and all its edges. - order(): the order of the graph (number of vertices) - norm(): the norm of the graph (number of edges) - deg_min(): the minimum degree of vertices - deg_max(): the maximum degree of vertices - deg_avg(): the average degree of vertices - eps(): the graph epsilon value (norm/order), average number of edges per vertex. - connected(): returns True if the graph is connected (i.e. it has only one component). - components(): returns the list of components - """ + # """ + # The graph is stored in disjoint-sets holding each connected component in `components` as a list of graph_core objects. + + # **Attributes** + # C (list[GraphBase]): list of graph_core components. 
+ + + # **add_edge(e):** + # add edge e and its vertices into the Graph possibly merging the associated graph_core components + + # **get_vertices_count():** + # see order() + + # **vertices():** + # see graph_core + + # **edges():** + # see graph_core + + # **remove_edge(e):** + # remove edge e possibly spawning two new cores if the graph_core that contained e gets disconnected. + + # **remove_vertex(v):** + # remove vertex v and all its edges. + + # **order():** + # the order of the graph (number of vertices) + + # **norm():** + # the norm of the graph (number of edges) + + # **deg_min():** + # the minimum degree of vertices + + # **deg_max():** + # the maximum degree of vertices + + # **deg_avg():** + # the average degree of vertices + + # **eps():** + # the graph epsilon value (norm/order), average number of edges per vertex. + + # **connected():** + # returns True if the graph is connected (i.e. it has only one component). + + # **components():** + # returns the list of components + # """ component_class = GraphBase @@ -73,16 +97,22 @@ def __init__(self, vertices = None, edges = None, directed = True): self.components.append(self.component_class(vertices, edge_set, directed)) def add_vertex(self, v): + """ + add vertex v into the Graph as a new component + """ for c in self.components: if v in c.verticesPoset: return c.verticesPoset.get(v) g = self.component_class(directed = self.directed) v = g.add_single_vertex(v) self.components.append(g) + print("add vertex v into the Graph as a new component") return v def add_edge(self, e): - + """ + add edge e and its vertices into the Graph possibly merging the associated graph_core components + """ x = e.v[0] y = e.v[1] x = self.add_vertex(x) @@ -116,6 +146,9 @@ def get_vertex_from_data(self, data): return None def vertices(self): + """ + see graph_core + """ for c in self.components: vertices = c.verticesPoset for v in vertices: @@ -128,6 +161,9 @@ def edges(self): yield e def remove_edge(self, e): + """ + remove 
edge e possibly spawning two new cores if the graph_core that contained e gets disconnected. + """ # get the GraphBase: c = e.v[0].component assert c == e.v[1].component @@ -147,6 +183,9 @@ def remove_edge(self, e): return e def remove_vertex(self, x): + """ + remove vertex x and all its edges. + """ c = x.component if c not in self.components: return None @@ -165,24 +204,42 @@ def remove_vertex(self, x): return x def order(self): + """ + the order of the graph (number of vertices) + """ return sum([c.order() for c in self.components]) def norm(self): + """ + the norm of the graph (number of edges) + """ return sum([c.norm() for c in self.components]) def deg_min(self): + """ + the minimum degree of vertices + """ return min([c.deg_min() for c in self.components]) def deg_max(self): + """ + the maximum degree of vertices + """ return max([c.deg_max() for c in self.components]) def deg_avg(self): + """ + the average degree of vertices + """ t = 0.0 for c in self.components: t += sum([v.degree() for v in c.verticesPoset]) return t / float(self.order()) def eps(self): + """ + the graph epsilon value (norm/order), average number of edges per vertex. + """ return float(self.norm()) / self.order() def path(self, x, y, f_io = 0, hook = None): @@ -203,4 +260,7 @@ def __contains__(self, G): return r def connected(self): + """ + returns True if the graph is connected (i.e. it has only one component). + """ return len(self.components) == 1 diff --git a/graphistry/layout/graph/graphBase.py b/graphistry/layout/graph/graphBase.py index 0e1b8f51e4..725f45daf9 100644 --- a/graphistry/layout/graph/graphBase.py +++ b/graphistry/layout/graph/graphBase.py @@ -13,41 +13,6 @@ class GraphBase(object): loops (set[Edge]): the set of *loop* edges (of degree 0). directed (bool): indicates if the graph is considered *oriented* or not. 
- Methods: - vertices(cond=None): generates an iterator over vertices, with optional filter - edges(cond=None): generates an iterator over edges, with optional filter - matrix(cond=None): returns the associativity matrix of the graph component - order(): the order of the graph (number of vertices) - norm(): the norm of the graph (number of edges) - deg_min(): the minimum degree of vertices - deg_max(): the maximum degree of vertices - deg_avg(): the average degree of vertices - eps(): the graph epsilon value (norm/order), average number of edges per vertex. - path(x,y,f_io=0,hook=None): shortest path between vertices x and y by breadth-first descent, - contrained by f_io direction if provided. The path is returned as a list of Vertex objects. - If a *hook* function is provided, it is called at every vertex added to the path, passing - the vertex object as argument. - roots(): returns the list of *roots* (vertices with no inward edges). - leaves(): returns the list of *leaves* (vertices with no outward edges). - add_single_vertex(v): allow a GraphBase to hold a single vertex. - add_edge(e): add edge e. At least one of its vertex must belong to the graph, - the other being added automatically. - remove_edge(e): remove Edge e, asserting that the resulting graph is still connex. - remove_vertex(x): remove Vertex x and all associated edges. - dijkstra(x,f_io=0,hook=None): shortest weighted-edges paths between x and all other vertices - by dijkstra's algorithm with heap used as priority queue. - get_scs_with_feedback(): returns the set of strongly connected components - ("scs") by using Tarjan algorithm. - These are maximal sets of vertices such that there is a path from each - vertex to every other vertex. - The algorithm performs a DFS from the provided list of root vertices. - A cycle is of course a strongly connected component, - but a strongly connected component can include several cycles. 
- The Feedback Acyclic Set of edge to be removed/reversed is provided by - marking the edges with a "feedback" flag. - Complexity is O(V+E). - partition(): returns a *partition* of the connected graph as a list of lists. - neighbors(v): returns neighbours of a vertex v. """ def __init__(self, vertices = None, edges = None, directed = True): @@ -96,12 +61,21 @@ def __init__(self, vertices = None, edges = None, directed = True): v.component = self def roots(self): + """ + returns the list of *roots* (vertices with no inward edges). + """ return list(filter(lambda v: len(v.e_in()) == 0, self.verticesPoset)) def leaves(self): + """ + returns the list of *leaves* (vertices with no outward edges). + """ return list(filter(lambda v: len(v.e_out()) == 0, self.verticesPoset)) def add_single_vertex(self, v): + """ + allow a GraphBase to hold a single vertex. + """ if len(self.edgesPoset) == 0 and len(self.verticesPoset) == 0: v = self.verticesPoset.add(v) v.component = self @@ -109,6 +83,9 @@ def add_single_vertex(self, v): return None def add_edge(self, e): + """ + add edge e. At least one of its vertex must belong to the graph, the other being added automatically. + """ if e in self.edgesPoset: return self.edgesPoset.get(e) x = e.v[0] @@ -127,6 +104,9 @@ def add_edge(self, e): return e def remove_edge(self, e): + """ + remove Edge e, asserting that the resulting graph is still connex. + """ if e not in self.edgesPoset: return e.detach() @@ -143,6 +123,9 @@ def remove_edge(self, e): return e def remove_vertex(self, x): + """ + remove Vertex x and all associated edges. 
+ """ if x not in self.verticesPoset: return vertices = x.neighbors() # get all neighbor vertices to check paths @@ -168,6 +151,9 @@ def constant_function(self, value): return lambda x: value def vertices(self, cond = None): + """ + generates an iterator over vertices, with optional filter + """ vertices = self.verticesPoset if cond is None: cond = self.constant_function(True) @@ -176,6 +162,9 @@ def vertices(self, cond = None): yield v def edges(self, cond = None): + """ + generates an iterator over edges, with optional filter + """ edges = self.edgesPoset if cond is None: cond = self.constant_function(True) @@ -185,7 +174,7 @@ def edges(self, cond = None): def matrix(self, cond = None): """ - This associativity matrix is like the adjacency matrix but antisymmetric. + This associativity matrix is like the adjacency matrix but antisymmetric. Returns the associativity matrix of the graph component :param cond: same a the condition function in vertices(). :return: array @@ -207,27 +196,46 @@ def matrix(self, cond = None): return mat def order(self): + """ + the order of the graph (number of vertices) + """ return len(self.verticesPoset) def norm(self): """ - The size of the edge poset. + The size of the edge poset (number of edges). """ return len(self.edgesPoset) def deg_min(self): + """ + the minimum degree of vertices + """ return min([v.degree() for v in self.verticesPoset]) def deg_max(self): + """ + the maximum degree of vertices + """ return max([v.degree() for v in self.verticesPoset]) def deg_avg(self): + """ + the average degree of vertices + """ return sum([v.degree() for v in self.verticesPoset]) / float(self.order()) def eps(self): + """ + the graph epsilon value (norm/order), average number of edges per vertex. + """ return float(self.norm()) / self.order() def path(self, x, y, f_io = 0, hook = None): + """ + shortest path between vertices x and y by breadth-first descent, contrained by f_io direction if provided. 
The path is returned as a list of Vertex objects. + If a *hook* function is provided, it is called at every vertex added to the path, passing the vertex object as argument. + """ assert x in self.verticesPoset assert y in self.verticesPoset x = self.verticesPoset.get(x) @@ -263,6 +271,9 @@ def path(self, x, y, f_io = 0, hook = None): return p def dijkstra(self, x, f_io = 0, hook = None): + """ + shortest weighted-edges paths between x and all other vertices by Dijkstra's algorithm with heap used as priority queue. + """ from collections import defaultdict from heapq import heappop, heappush @@ -300,7 +311,11 @@ def dijkstra(self, x, f_io = 0, hook = None): def get_scs_with_feedback(self, roots = None): """ - Minimum FAS algorithm (feedback arc set) creating a DAG. + Minimum FAS algorithm (feedback arc set) creating a DAG. Returns the set of strongly connected components + ("scs") by using Tarjan algorithm. These are maximal sets of vertices such that there is a path from each vertex to every other vertex. + The algorithm performs a DFS from the provided list of root vertices. A cycle is of course a strongly connected component, but a strongly connected component can include several cycles. + The Feedback Acyclic Set of edges to be removed/reversed is provided by marking the edges with a "feedback" flag. + Complexity is O(V+E). :param roots: :return: diff --git a/graphistry/layout/graph/vertexBase.py b/graphistry/layout/graph/vertexBase.py index 07cb8d6794..1a950273f0 100644 --- a/graphistry/layout/graph/vertexBase.py +++ b/graphistry/layout/graph/vertexBase.py @@ -7,17 +7,6 @@ class VertexBase(object): **Attributes** e (list[Edge]): list of edges associated with this vertex. - **Methods** - degree() : degree of the vertex (number of edges). - e_in() : list of edges directed toward this vertex. - e_out(): list of edges directed outward this vertex. - e_dir(int): either e_in, e_out or all edges depending on provided direction parameter (>0 means outward). 
- neighbors(f_io=0): list of neighbor vertices in all directions (default) or in filtered f_io direction (>0 means outward). - e_to(v): returns the Edge from this vertex directed toward vertex v. - e_from(v): returns the Edge from vertex v directed toward this vertex. - e_with(v): return the Edge with both this vertex and vertex v - detach(): removes this vertex from all its edges and returns this list of edges. - """ def __init__(self): @@ -25,15 +14,27 @@ def __init__(self): self.e = [] def degree(self): + """ + degree() : degree of the vertex (number of edges). + """ return len(self.e) def e_in(self): + """ + e_in() : list of edges directed toward this vertex. + """ return list(filter((lambda e: e.v[1] == self), self.e)) def e_out(self): + """ + e_out(): list of edges directed outward this vertex. + """ return list(filter((lambda e: e.v[0] == self), self.e)) def e_dir(self, dir): + """ + either e_in, e_out or all edges depending on provided direction parameter (>0 means outward). + """ if dir > 0: return self.e_out() if dir < 0: @@ -42,7 +43,7 @@ def e_dir(self, dir): def neighbors(self, direction = 0): """ - Returns the neighbors of this vertex. + Returns the neighbors of this vertex. List of neighbor vertices in all directions (default) or in filtered f_io direction (>0 means outward). :param direction: - 0: parent and children @@ -58,24 +59,36 @@ def neighbors(self, direction = 0): return arr def e_to(self, y): + """ + returns the Edge from this vertex directed toward vertex v. + """ for e in self.e_out(): if e.v[1] == y: return e return None def e_from(self, x): + """ + returns the Edge from vertex v directed toward this vertex. + """ for e in self.e_in(): if e.v[0] == x: return e return None def e_with(self, v): + """ + return the Edge with both this vertex and vertex v + """ for e in self.e: if v in e.v: return e return None def detach(self): + """ + removes this vertex from all its edges and returns this list of edges. 
+ """ E = self.e[:] for e in E: e.detach() diff --git a/graphistry/plugins/cugraph.py b/graphistry/plugins/cugraph.py index b5f070af72..5e7a656b25 100644 --- a/graphistry/plugins/cugraph.py +++ b/graphistry/plugins/cugraph.py @@ -239,16 +239,19 @@ def compute_cugraph( **Example: Pagerank** :: + g2 = g.compute_cugraph('pagerank') assert 'pagerank' in g2._nodes.columns **Example: Katz centrality with rename** :: + g2 = g.compute_cugraph('katz_centrality', out_col='katz_centrality_renamed') assert 'katz_centrality_renamed' in g2._nodes.columns **Example: Pass params to cugraph** :: + g2 = g.compute_cugraph('k_truss', params={'k': 2}) assert 'k_truss' in g2._nodes.columns @@ -360,6 +363,7 @@ def layout_cugraph( **Example: ForceAtlas2 layout** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') @@ -367,6 +371,7 @@ def layout_cugraph( **Example: Change which column names are generated** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') @@ -377,6 +382,7 @@ def layout_cugraph( **Example: Pass parameters to layout methods** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') diff --git a/graphistry/plugins/igraph.py b/graphistry/plugins/igraph.py index a5bab3ac19..5fe5c2f8a6 100644 --- a/graphistry/plugins/igraph.py +++ b/graphistry/plugins/igraph.py @@ -53,6 +53,7 @@ def from_igraph(self, **Example: Convert from igraph, including all node/edge properties** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a', 'b', 'c', 'd'], 'd': ['b', 'c', 'd', 'e'], 'v': [101, 102, 103, 104]}) g = graphistry.edges(edges, 's', 'd').materialize_nodes().get_degrees() @@ -62,6 +63,7 @@ def from_igraph(self, **Example: Enrich from igraph, but only load in 1 node attribute** :: + import graphistry, 
pandas as pd edges = pd.DataFrame({'s': ['a', 'b', 'c', 'd'], 'd': ['b', 'c', 'd', 'e'], 'v': [101, 102, 103, 104]}) g = graphistry.edges(edges, 's', 'd').materialize_nodes().get_degree() @@ -198,7 +200,8 @@ def from_igraph(self, return g -def to_igraph(self: Plottable, +def to_igraph( + self: Plottable, directed: bool = True, include_nodes: bool = True, node_attributes: Optional[List[str]] = None, @@ -309,8 +312,8 @@ def compute_igraph( :rtype: Plotter **Example: Pagerank** - :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['c','c','e','e']}) g = graphistry.edges(edges, 's', 'd') @@ -319,6 +322,7 @@ def compute_igraph( **Example: Pagerank with custom name** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['c','c','e','e']}) g = graphistry.edges(edges, 's', 'd') @@ -327,6 +331,7 @@ def compute_igraph( **Example: Pagerank on an undirected** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['c','c','e','e']}) g = graphistry.edges(edges, 's', 'd') @@ -334,7 +339,8 @@ def compute_igraph( assert 'pagerank' in g2._nodes.columns **Example: Pagerank with custom parameters** - :: + :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['c','c','e','e']}) g = graphistry.edges(edges, 's', 'd') @@ -447,6 +453,7 @@ def layout_igraph( **Example: Sugiyama layout** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') @@ -456,6 +463,7 @@ def layout_igraph( **Example: Change which column names are generated** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') @@ -466,6 +474,7 @@ def layout_igraph( **Example: Pass parameters to layout methods - Sort nodes by degree** :: + import graphistry, pandas as pd edges = pd.DataFrame({'s': ['a','b','c','d'], 
'd': ['b','c','d','e']}) g = graphistry.edges(edges, 's', 'd') diff --git a/graphistry/pygraphistry.py b/graphistry/pygraphistry.py index 0a8dd76310..2051a32523 100644 --- a/graphistry/pygraphistry.py +++ b/graphistry/pygraphistry.py @@ -1942,6 +1942,7 @@ def nodes(nodes: Union[Callable, Any], node=None, *args, **kwargs) -> Plottable: **Example** :: + import graphistry def sample_nodes(g, n): @@ -1992,6 +1993,7 @@ def edges( **Example** :: + import graphistry def sample_edges(g, n): diff --git a/graphistry/text_utils.py b/graphistry/text_utils.py index d5b579b593..d9dc42060f 100644 --- a/graphistry/text_utils.py +++ b/graphistry/text_utils.py @@ -122,37 +122,38 @@ def search( If node data is not yet feature-encoded (and explicit edges are given), run automatic feature engineering: - ``` + :: + g2 = g.featurize(kind='nodes', X=['text_col_1', ..], min_words=0 # forces all named columns are textually encoded ) - ``` If edges do not yet exist, generate them via - ``` + :: + g2 = g.umap(kind='nodes', X=['text_col_1', ..], min_words=0 # forces all named columns are textually encoded ) - ``` + If an index is not yet built, it is generated `g2.build_index()` on the fly at search time. Otherwise, can set `g2.build_index()` to build it ahead of time. Args: - query (str): natural language query. - cols (list or str, optional): if fuzzy=False, select which column to query. + :query (str): natural language query. + :cols (list or str, optional): if fuzzy=False, select which column to query. Defaults to None since fuzzy=True by defaul. - thresh (float, optional): distance threshold from query vector to returned results. + :thresh (float, optional): distance threshold from query vector to returned results. Defaults to 5000, set large just in case, but could be as low as 10. 
- fuzzy (bool, optional): if True, uses embedding + annoy index for recall, + :fuzzy (bool, optional): if True, uses embedding + annoy index for recall, otherwise does string matching over given `cols` Defaults to True. - top_n (int, optional): how many results to return. Defaults to 100. + :top_n (int, optional): how many results to return. Defaults to 100. Returns: - pd.DataFrame, vector_encoding_of_query: - * rank ordered dataframe of results matching query - * vector encoding of query via given transformer/ngrams model if fuzzy=True - else None + **pd.DataFrame, vector_encoding_of_query:** + rank ordered dataframe of results matching query + + vector encoding of query via given transformer/ngrams model if fuzzy=True else None """ if not fuzzy: if cols is None: @@ -188,15 +189,15 @@ def search_graph( See help(g.search) for more information Args: - query (str): query input eg "coding best practices" - scale (float, optional): edge weigh threshold, Defaults to 0.5. - top_n (int, optional): how many results to return. Defaults to 100. - thresh (float, optional): distance threshold from query vector to returned results. + :query (str): query input eg "coding best practices" + :scale (float, optional): edge weight threshold, Defaults to 0.5. + :top_n (int, optional): how many results to return. Defaults to 100. + :thresh (float, optional): distance threshold from query vector to returned results. Defaults to 5000, set large just in case, but could be as low as 10. - broader (bool, optional): if True, will retrieve entities connected via an edge + :broader (bool, optional): if True, will retrieve entities connected via an edge that were not necessarily bubbled up in the results_dataframe. Defaults to False. - inplace (bool, optional): whether to return new instance (default) or mutate self. + :inplace (bool, optional): whether to return new instance (default) or mutate self. Defaults to False. 
Returns: diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py index 2107710a3d..633f941c55 100644 --- a/graphistry/umap_utils.py +++ b/graphistry/umap_utils.py @@ -281,18 +281,18 @@ def transform_umap(self, df: pd.DataFrame, ) -> Union[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame], Plottable]: """Transforms data into UMAP embedding - args: - df: Dataframe to transform - y: Target column - kind: One of `nodes` or `edges` - min_dist: Epsilon for including neighbors in infer_graph - n_neighbors: Number of neighbors to use for contextualization - merge_policy: if True, use previous graph, adding new batch to existing graph's neighbors + Args: + :df: Dataframe to transform + :y: Target column + :kind: One of `nodes` or `edges` + :min_dist: Epsilon for including neighbors in infer_graph + :n_neighbors: Number of neighbors to use for contextualization + :merge_policy: if True, use previous graph, adding new batch to existing graph's neighbors useful to contextualize new data against existing graph. If False, `sample` is irrelevant. 
- sample: Sample number of existing graph's neighbors to use for contextualization -- helps make denser graphs - return_graph: Whether to return a graph or just the embeddings - fit_umap_embedding: Whether to infer graph from the UMAP embedding on the new data - verbose: Whether to print information about the graph inference + :sample: Sample number of existing graph's neighbors to use for contextualization -- helps make denser graphs + :return_graph: Whether to return a graph or just the embeddings + :fit_umap_embedding: Whether to infer graph from the UMAP embedding on the new data + :verbose: Whether to print information about the graph inference """ X, y_ = self.transform(df, y, kind=kind, return_graph=False, verbose=verbose) emb = self._umap.transform(X) # type: ignore @@ -437,47 +437,47 @@ def umap( Parameters ---------- - X: either a dataframe ndarray of features, or column names to featurize - y: either an dataframe ndarray of targets, or column names to featurize + :X: either a dataframe ndarray of features, or column names to featurize + :y: either a dataframe ndarray of targets, or column names to featurize targets - kind: `nodes` or `edges` or None. + :kind: `nodes` or `edges` or None. If None, expects explicit X, y (optional) matrices, and will Not associate them to nodes or edges. If X, y (optional) is given, with kind = [nodes, edges], it will associate new matrices to nodes or edges attributes. - scale: multiplicative scale for pruning weighted edge DataFrame + :scale: multiplicative scale for pruning weighted edge DataFrame gotten from UMAP, between [0, ..) with high end meaning keep all edges - n_neighbors: UMAP number of nearest neighbors to include for + :n_neighbors: UMAP number of nearest neighbors to include for UMAP connectivity, lower makes more compact layouts. Minimum 2 - min_dist: UMAP float between 0 and 1, lower makes more compact + :min_dist: UMAP float between 0 and 1, lower makes more compact layouts. 
- spread: UMAP spread of values for relaxation - local_connectivity: UMAP connectivity parameter - repulsion_strength: UMAP repulsion strength - negative_sample_rate: UMAP negative sampling rate - n_components: number of components in the UMAP projection, + :spread: UMAP spread of values for relaxation + :local_connectivity: UMAP connectivity parameter + :repulsion_strength: UMAP repulsion strength + :negative_sample_rate: UMAP negative sampling rate + :n_components: number of components in the UMAP projection, default 2 - metric: UMAP metric, default 'euclidean'. + :metric: UMAP metric, default 'euclidean'. see (UMAP-LEARN)[https://umap-learn.readthedocs.io/ en/latest/parameters.html] documentation for more. - suffix: optional suffix to add to x, y attributes of umap. - play: Graphistry play parameter, default 0, how much to evolve + :suffix: optional suffix to add to x, y attributes of umap. + :play: Graphistry play parameter, default 0, how much to evolve the network during clustering. 0 preserves the original UMAP layout. - encode_weight: if True, will set new edges_df from + :encode_weight: if True, will set new edges_df from implicit UMAP, default True. - encode_position: whether to set default plotting bindings + :encode_position: whether to set default plotting bindings -- positions x,y from umap for .plot(), default True - dbscan: whether to run DBSCAN on the UMAP embedding, default False. - engine: selects which engine to use to calculate UMAP: + :dbscan: whether to run DBSCAN on the UMAP embedding, default False. + :engine: selects which engine to use to calculate UMAP: default "auto" will use cuML if available, otherwise UMAP-LEARN. - feature_engine: How to encode data + :feature_engine: How to encode data ("none", "auto", "pandas", "dirty_cat", "torch") - inplace: bool = False, whether to modify the current object, default False. + :inplace: bool = False, whether to modify the current object, default False. 
when False, returns a new object, useful for chaining in a functional paradigm. - memoize: whether to memoize the results of this method, + :memoize: whether to memoize the results of this method, default True. - verbose: whether to print out extra information, default False. + :verbose: whether to print out extra information, default False. :return: self, with attributes set with new data """ if engine == UMAP_LEARN: