diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b81fba6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,107 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# pycharm +.idea diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b01acfd --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2020, Guillaume Dubuisson Duplessis +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..fcf4985 --- /dev/null +++ b/README.md @@ -0,0 +1,348 @@ +# gowpy + +A very simple framework for exploiting graph-of-words in NLP. +Currently at version **0.1.0** (alpha). + +gowpy leverages graph-of-words representation in order to do: +- **document classification** in a [scikit-learn](https://scikit-learn.org)-like + way via useful vectorizers, and +- **keyword extraction** from a document. + +## Quick Start +### Requirements and Installation +This project is based on Python 3.6+, [scikit-learn](https://github.com/scikit-learn/scikit-learn) and +[NetworkX](https://github.com/networkx/networkx). 
+ +#### Installation from PyPI +```bash +pip install gowpy +``` + +#### Installation from the GitHub Source +First, clone the project: +```bash +git clone https://github.com/GuillaumeDD/gowpy.git +``` + +Then, `cd` to the project folder and run the install command: +```bash +cd gowpy/ +python setup.py install +``` + +### Example Usage + +#### Building a Graph-of-Words from a Document + +```python +from gowpy.gow.builder import GoWBuilder + +# Creation of a graph-of-words builder +# Here: +# - the graph-of-words will be directed, and +# - an edge will link every pair of tokens co-occurring in a sliding window of size 4 +# +builder = GoWBuilder(directed=True, window_size=4) + +text = """gowpy is a simple framework for exploiting graph-of-words in nlp gowpy +leverages graph-of-words representation for document classification and for keyword extraction +from a document""" + +# Here, a preprocessing step fitted to the need of the project should be carried out + +# Creation of the graph-of-words +gow = builder.compute_gow_from_document(text) +``` + +Then, it is possible to visualize the document as a graph-of-words: +```python +import matplotlib.pyplot as plt +import networkx as nx + +g = gow.to_labeled_graph() + +options = { + "font_weight" : 'normal', + "font_color" : 'darkblue', + # + "edge_color" : 'lightgray', + # + "node_size" : 200, + "node_color": 'white', + "with_labels": True, +} +nx.draw(g, **options) +``` + +![A graph-of-words example](./resources/gow.png) + +#### Unsupervised Keywords Extraction +Graph-of-words can be leveraged to extract an automatically adaptive number of +cohesive keywords from a text document in an unsupervised fashion [[2,3]](#references). 
+ +```python +from gowpy.summarization.unsupervised import GoWKeywordExtractor + +# Initialization of the keyword extractor +extractor_kw = GoWKeywordExtractor(directed=False, window_size=4) + +# +# Note that preprocessing is particularly important for keyword extraction +# in order to keep and normalize important terms such as adjectives and nouns. +# +# An already preprocessed text in which to extract keywords +preprocessed_text = """gowpy simple framework exploiting graph-of-words nlp gowpy +leverages graph-of-words representation document classification keyword extraction +document""" + +extractor_kw.extract(preprocessed_text) +``` + +Returns: +```text +[('gowpy', 4), + ('simple', 4), + ('framework', 4), + ('exploiting', 4), + ('graph-of-words', 4), + ('nlp', 4)] +``` + + +#### Classification with TW-IDF: a graph-based term weighting score +TW-IDF [[0]](#references) challenges the term independence assumption behind +the bag-of-words model by (i) exploiting a graph-of-words representation of a +document (here an unweighted directed graph of terms), and by (ii) leveraging +this new representation to replace the term frequency (TF) by graph-based term +weights (TW). 
+ +TW-IDF is accessible via a dedicated vectorizer: +```python +from gowpy.feature_extraction.gow import TwidfVectorizer + +corpus = [ + 'hello world !', + 'foo bar' +] + +vectorizer_gow = TwidfVectorizer( + # Graph-of-words specificities + directed=True, + window_size=4, + # Token frequency filtering + min_df=0.0, + max_df=1.0, + # Graph-based term weighting approach + term_weighting='degree' +) + +X = vectorizer_gow.fit_transform(corpus) +X +``` +Returns: +```text +<2x5 sparse matrix of type '' + with 3 stored elements in Compressed Sparse Row format> +``` + +TW-IDF vectorizer fits seamlessly in a grid search: +```python +from sklearn.pipeline import Pipeline +from sklearn.svm import SVC + +from sklearn.model_selection import GridSearchCV + +pipeline = Pipeline([ + ('gow', TwidfVectorizer()), + ('svm', SVC()), +]) + +parameters = { + 'gow__directed' : [True, False], + 'gow__window_size' : [2, 4, 8, 16], + 'gow__b' : [0.0, 0.003], + 'gow__term_weighting' : ['degree', 'pagerank'], + 'gow__min_df' : [0, 5, 10], + 'gow__max_df' : [0.8, 0.9, 1.0], +# + 'svm__C' : [0.1, 1, 10], + 'svm__kernel' : ['linear'] +} + +# find the best parameters for both the feature extraction and the +# classifier +grid_search = GridSearchCV(pipeline, + parameters, + cv=10, + n_jobs=-1) +``` + +#### Going further: classification based on frequent subgraphs +Frequent subgraphs corresponding to long range n-gram can be mined and +subsequently used for document classification [[1]](#references). + +Classification with frequent subgraphs happens in a 3-step process: +1. Conversion of the corpus of already preprocessed documents into a collection + of graph-of-words +1. Mining the frequent subgraphs +1. 
Loading the frequent subgraphs and exploiting them for classification + +##### Conversion of the corpus into a collection of graph-of-words +The first step consists in turning the corpus into a collection of graph-of-words +and then exporting that collection into a file format suited for frequent subgraph +mining. +```python +from gowpy.gow.miner import GoWMiner +import gowpy.gow.io + +corpus = [ + 'hello world !', + 'foo bar', + # and many more... +] + +# Conversion of the corpus into a collection of graph-of-words +gow_miner = GoWMiner(directed=False, window_size=4) +corpus_gows = gow_miner.compute_gow_from_corpus(corpus) + +# Export of the collection of graph-of-words into a file for +# interoperability with other languages such as C++ +with open("corpus_gows.data", "w") as f_output: + data = gowpy.gow.io.gow_to_data(corpus_gows) + f_output.write(data) +``` + +##### Mining the frequent subgraphs +Frequent subgraph mining can be performed via the [gSpan algorithm](https://www.cs.ucsb.edu/~xyan/software/gSpan.htm). +This step is not included in this project and has to be carried out by another +program. + +This project supports the reimplementation from [gBolt available at GitHub](https://github.com/Jokeren/gBolt). +Currently this implementation is limited to **undirected graphs**. +To mine frequent subgraphs (after having installed gBolt on your machine): +```bash +OMP_NUM_THREADS=1 ./gbolt --input corpus_gows.data --output gbolt-mining-corpus_gow --dfs --nodes --support 0.01 +``` +Notice the **support parameter** which defines the minimum frequency of a subgraph +to be considered as frequent. Here it is set to 1% (0.01). +This parameter is **corpus specific** and should be carefully tuned (see [[1]](#references)). 
+ +Mining produces two files: +- `gbolt-mining-corpus_gow.t0`: the frequent subgraphs with more than one node +- `gbolt-mining-corpus_gow.nodes`: the frequent single nodes + +These two files can be loaded by the same `gow_miner` used for the export: +```python +gow_miner.load_graphs('gbolt-mining-corpus_gow.t0', + 'gbolt-mining-corpus_gow.nodes') +gow_miner +``` +Returns: +```text +Graph-of-word miner: + - is_directed: False + - window_size: 4 + - edge_labeling: True + + - Number of tokens: 5 + - Number of links between tokens: 4 + + - Number of loaded subgraph: 13 +``` + +##### Classification with frequent subgraphs +Classification with frequent subgraphs is accessible via a dedicated vectorizer: +```python +from gowpy.feature_extraction.gow import GoWVectorizer + +vectorizer_gow = GoWVectorizer(gow_miner) +X = vectorizer_gow.fit_transform(corpus) +# X is a sparse matrix +``` + +Before tuning the `min_df` (the minimum being the support chosen during mining) +and the `max_df`, it is possible to have a look at the normalized frequency +distribution: +```python +import pandas as pd +s_freq_per_pattern = pd.Series(gow_miner.stat_relative_freq_per_pattern()) +s_freq_per_pattern.describe() +``` +For instance, it can return the following distribution: +```text +count 10369.000000 +mean 0.026639 +std 0.046551 +min 0.008333 +25% 0.010000 +50% 0.013333 +75% 0.022778 +max 0.865000 +dtype: float64 +``` + + +GoW vectorizer fits nicely in a grid search: +```python +from sklearn.pipeline import Pipeline +from sklearn.svm import SVC +from sklearn.feature_extraction.text import TfidfTransformer + +from sklearn.model_selection import GridSearchCV + +pipeline = Pipeline([ + ('gow', GoWVectorizer(gow_miner)), + ('tfidf', TfidfTransformer()), + ('svm', SVC()), +]) + +parameters = { + 'gow__subgraph_matching' : ['partial', 'induced'], + 'gow__min_df' : [0.00833, 0.01, 0.013333], + 'gow__max_df' : [0.022778, 0.5, 0.865], +# + 'svm__C' : [0.1, 1, 10], + 'svm__kernel' : ['linear'] +} +
+# find the best parameters for both the feature extraction and the +# classifier +grid_search = GridSearchCV(pipeline, + parameters, + cv=10, + n_jobs=-1) +``` + +## References + +Detailed explanations, evaluations and discussions can be found in these papers: +- Information retrieval (TW-IDF) + + [0] [Graph-of-word and TW-IDF: New Approach to Ad Hoc IR](https://dl.acm.org/doi/abs/10.1145/2505515.2505671). + *Rousseau, François, and Michalis Vazirgiannis*. + *Proceedings of the 22nd ACM international conference on Information & Knowledge Management*. (**CIKM 2013**) +- Document classification with frequent subgraphs + + [1] [Text Categorization as a Graph Classification Problem](http://www.aclweb.org/anthology/P15-1164). + *Rousseau, François, Emmanouil Kiagias, and Michalis Vazirgiannis*. + *Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International + Joint Conference on Natural Language Processing* (**ACL 2015**) +- Keyword extraction from graph-of-words + + [2] [Main Core Retention on Graph-of-words for Single-Document Keyword Extraction](https://link.springer.com/chapter/10.1007/978-3-319-16354-3_42). + *Rousseau, François, and Michalis Vazirgiannis*. + *Proceedings of the 37th European Conference on Information Retrieval*. + (**ECIR 2015**) + + [3] [A Graph Degeneracy-based Approach to Keyword Extraction](https://www.aclweb.org/anthology/D16-1191/). + *Tixier, Antoine, Fragkiskos Malliaros, and Michalis Vazirgiannis*. + *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*. 
+ (**EMNLP 2016**) + +This library involves the following algorithms: +- Frequent subgraph Mining (**currently not included in this library**) + + gSpan algorithm implementation for subgraph mining: [gBolt--very fast implementation for gSpan algorithm in data mining ](https://github.com/Jokeren/gBolt) +- Subgraph matching + + VF2 algorithm for subgraph isomorphism matching: [VF2 algorithm for subgraph isomorphism from NetworkX](https://networkx.github.io/documentation/stable/reference/algorithms/isomorphism.vf2.html) +- Graph degeneracy + + [k-core decomposition with NetworkX](https://networkx.github.io/documentation/stable/reference/algorithms/core.html) + + +## License +Released under the 3-Clause BSD license (see [LICENSE file](./LICENSE)) diff --git a/examples/README.ipynb b/examples/README.ipynb new file mode 100644 index 0000000..fc18ddc --- /dev/null +++ b/examples/README.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# gowpy: README.md examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building a Graph-of-Words from a Document" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.gow.builder import GoWBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "builder = GoWBuilder(directed=True, \n", + " window_size=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"\"\"gowpy is a simple framework for exploiting graph-of-words in nlp gowpy \n", + "leverages graph-of-words representation for document classification and for keyword extraction \n", + "from a document\"\"\"\n", + "# ...\n", + "preprocessed_text = \"\"\"gowpy simple framework exploiting graph-of-words nlp gowpy \n", + "leverages graph-of-words representation document classification keyword extraction 
document\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "gow = builder.compute_gow_from_document(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph-of-words\n", + "Nodes: ['gowpy', 'is', 'a', 'simple', 'framework', 'for', 'exploiting', 'graph-of-words', 'in', 'nlp', 'leverages', 'representation', 'document', 'classification', 'and', 'keyword', 'extraction', 'from']\n", + "Edges: ['framework__graph-of-words', 'is__simple', 'exploiting__nlp', 'leverages__representation', 'for__exploiting', 'gowpy__a', 'graph-of-words__document', 'in__gowpy', 'extraction__document', 'for__classification', 'gowpy__graph-of-words', 'extraction__a', 'in__nlp', 'document__for', 'keyword__extraction', 'gowpy__leverages', 'a__document', 'graph-of-words__representation', 'a__for', 'gowpy__simple', 'for__in', 'is__a', 'extraction__from', 'nlp__gowpy', 'exploiting__graph-of-words', 'and__for', 'representation__for', 'leverages__graph-of-words', 'document__classification', 'for__document', 'in__leverages', 'from__a', 'gowpy__representation', 'simple__exploiting', 'simple__framework', 'nlp__graph-of-words', 'representation__classification', 'document__and', 'framework__for', 'for__from', 'classification__keyword', 'is__framework', 'nlp__leverages', 'graph-of-words__for', 'a__simple', 'and__keyword', 'for__keyword', 'representation__document', 'simple__for', 'gowpy__is', 'graph-of-words__nlp', 'leverages__for', 'keyword__from', 'graph-of-words__gowpy', 'framework__exploiting', 'exploiting__in', 'and__extraction', 'classification__and', 'for__extraction', 'keyword__a', 'for__graph-of-words', 'classification__for', 'for__and', 'from__document', 'graph-of-words__in', 'a__framework']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gow" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import networkx as nx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "g = gow.to_labeled_graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "options = {\n", + " \"font_weight\" : 'normal',\n", + " \"font_color\" : 'darkblue',\n", + " #\n", + " \"edge_color\" : 'lightgray',\n", + " #\n", + " \"node_size\" : 200,\n", + " \"node_color\": 'white',\n", + " \"with_labels\": True,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nx.draw(g, **options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unsupervised Keywords Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.summarization.unsupervised import GoWKeywordExtractor" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "extractor_kw = GoWKeywordExtractor(directed=False, window_size=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "15" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessed_text = \"\"\"gowpy simple framework exploiting graph-of-words nlp gowpy \n", + "leverages graph-of-words representation document classification keyword extraction \n", + "document\"\"\"\n", + "len(preprocessed_text.split())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('gowpy', 4),\n", + " ('simple', 4),\n", + " ('framework', 4),\n", + " ('exploiting', 4),\n", + " ('graph-of-words', 4),\n", + " ('nlp', 4)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extractor_kw.extract(preprocessed_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification with TW-IDF: a graph-based term weighting score" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.feature_extraction.gow import TwidfVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " 'hello world !',\n", + " 'foo bar'\n", + "]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer_gow = TwidfVectorizer( \n", + " # Graph-of-words specificities\n", + " directed=True,\n", + " window_size=4,\n", + " # Token frequency filtering\n", + " min_df=0.0,\n", + " max_df=1.0,\n", + " # Graph-based term weighting approach\n", + " term_weighting='degree'\n", + ")\n", + "\n", + "X = vectorizer_gow.fit_transform(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.89442719, 0. , 0. , 0. , 0.4472136 ],\n", + " [0. , 1. , 0. , 0. , 0. ]])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<2x5 sparse matrix of type ''\n", + "\twith 3 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.svm import SVC\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "pipeline = Pipeline([\n", + " ('gow', TwidfVectorizer()),\n", + " ('svm', SVC()),\n", + "])\n", + "\n", + "parameters = {\n", + " 'gow__directed' : [True, False],\n", + " 'gow__window_size' : [2, 4, 8, 16],\n", + " 'gow__b' : [0.0, 0.003],\n", + " 'gow__term_weighting' : ['degree', 'pagerank'],\n", + " 'gow__min_df' : [0, 5, 10],\n", + " 'gow__max_df' : [0.8, 0.9, 1.0],\n", + "#\n", + " 'svm__C' : [0.1, 1, 10],\n", + " 'svm__kernel' : ['linear']\n", + "}\n", + "\n", + "# find the best parameters for both the feature extraction and the\n", + "# classifier\n", + "grid_search = GridSearchCV(pipeline, \n", + " 
parameters, \n", + " cv=10,\n", + " n_jobs=-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Going further: classification based on frequent subgraphs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conversion of the corpus into a collection of graph-of-words" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.gow.miner import GoWMiner\n", + "import gowpy.gow.io" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " 'hello world !',\n", + " 'foo bar',\n", + " # and many more...\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "gow_miner = GoWMiner(directed=False, window_size=4)\n", + "corpus_gows = gow_miner.compute_gow_from_corpus(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"corpus_gows.data\", \"w\") as f_output:\n", + " data = gowpy.gow.io.gow_to_data(corpus_gows)\n", + " f_output.write(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mining the frequent subgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph-of-word miner:\n", + " - is_directed: False\n", + " - window_size: 4\n", + " - edge_labeling: True\n", + "\n", + " - Number of tokens: 5\n", + " - Number of links between tokens: 4\n", + "\n", + " - Number of loaded subgraph: 13\n", + " " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gow_miner.load_graphs('gbolt-mining-corpus_gow.t0', \n", + " 'gbolt-mining-corpus_gow.nodes')\n", + "gow_miner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classification with 
frequent subgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 13.0\n", + "mean 0.5\n", + "std 0.0\n", + "min 0.5\n", + "25% 0.5\n", + "50% 0.5\n", + "75% 0.5\n", + "max 0.5\n", + "dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "s_freq_per_pattern = pd.Series(gow_miner.stat_relative_freq_per_pattern())\n", + "s_freq_per_pattern.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<2x13 sparse matrix of type ''\n", + "\twith 13 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from gowpy.feature_extraction.gow import GoWVectorizer\n", + "\n", + "vectorizer_gow = GoWVectorizer(gow_miner)\n", + "X = vectorizer_gow.fit_transform(corpus)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = vectorizer_gow.get_feature_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nombre de features: 10\n", + "\t- hello world hello__world\n", + "\t- hello world ! world__! hello__world\n", + "\t- hello world ! world__! hello__world hello__!\n", + "\t- hello world ! hello__world hello__!\n", + "\t- hello ! hello__!\n", + "\t- hello world ! world__! hello__!\n", + "\t- world ! 
world__!\n", + "\t- hello\n", + "\t- !\n", + "\t- world\n" + ] + } + ], + "source": [ + "features = [feature for presence, feature in zip(X.toarray()[0], feature_names) if presence > 0]\n", + "print(\"Nombre de features: {}\".format(len(features)))\n", + "for feature in features:\n", + " print(f'\\t- {feature}')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = Pipeline([\n", + " ('gow', GoWVectorizer(gow_miner)),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('svm', SVC()),\n", + "])\n", + "\n", + "parameters = {\n", + " 'gow__subgraph_matching' : ['partial', 'induced'],\n", + " 'gow__min_df' : [0.00833, 0.01, 0.013333],\n", + " 'gow__max_df' : [0.022778, 0.25, 0.5, 1.0],\n", + "#\n", + " 'svm__C' : [0.1, 1, 10],\n", + " 'svm__kernel' : ['linear']\n", + "}\n", + "\n", + "# find the best parameters for both the feature extraction and the\n", + "# classifier\n", + "grid_search = GridSearchCV(pipeline, \n", + " parameters, \n", + " cv=10,\n", + " n_jobs=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:gowpy36]", + "language": "python", + "name": "conda-env-gowpy36-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + 
"skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gowpy/__init__.py b/gowpy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gowpy/feature_extraction/__init__.py b/gowpy/feature_extraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gowpy/feature_extraction/gow/__init__.py b/gowpy/feature_extraction/gow/__init__.py new file mode 100644 index 0000000..bc2fa07 --- /dev/null +++ b/gowpy/feature_extraction/gow/__init__.py @@ -0,0 +1,2 @@ +from .gow_vectorizer import GoWVectorizer +from .tw_vectorizer import TwVectorizer, TwidfVectorizer diff --git a/gowpy/feature_extraction/gow/gow_vectorizer.py b/gowpy/feature_extraction/gow/gow_vectorizer.py new file mode 100644 index 0000000..1918667 --- /dev/null +++ b/gowpy/feature_extraction/gow/gow_vectorizer.py @@ -0,0 +1,225 @@ +import networkx.algorithms.isomorphism as iso +from networkx.algorithms import isomorphism + +import numbers + +from scipy.sparse import csr_matrix + +from typing import Sequence, Tuple, Generator + +from gowpy.gow.builder import GraphOfWords +from gowpy.gow.typing import Nodes + +from sklearn.base import BaseEstimator +from gowpy.gow.miner import GoWMiner + +SUBGRAPH_MATCHING_INDUCED = "induced" +SUBGRAPH_MATCHING_PARTIAL = "partial" + + +class GoWVectorizer(BaseEstimator): + """Convert a collection of text documents to a matrix of frequent subgraphs matching counts + + Frequent subgraphs have to be mined before using this vectorizer. + + This implementation produces a sparse representation of the counts using + scipy.sparse.csr_matrix. + + Parameters + ---------- + graph_of_words: GoWMiner + A graph-of-words miner containing the frequent subgraphs. 
+ max_df : float in range [0.0, 1.0] or int, default=1.0 + Ignore frequent subgraphs that have a document frequency strictly + higher than the given threshold (corpus-specific frequent subgraphs). + If float, the parameter represents a proportion of documents, integer + absolute counts. + min_df : float in range [0.0, 1.0] or int, default=0.0 + Ignore frequent subgraphs that have a document frequency strictly + lower than the given threshold. This value is also called support + in the literature. + Note that the smallest value is defined with the support used when + mining frequent subgraphs. + If float, the parameter represents a proportion of documents, integer + absolute counts. + in the mapping should not be repeated and should not have any gap + between 0 and the largest index. + indexing : boolean, True by default + Frequent subgraphs are indexed for faster retrieval when computing + document features. + subgraph_matching : string {'induced', 'partial'} + Frequent subgraph matching approach + 'partial' (default) : subgraph matching corresponding to node and + edge inclusion. 
+ 'induced' : slower approach, node-induced subgraph matching + """ + + def __init__(self, + graph_of_words: GoWMiner, + min_df: float = 0.0, + max_df: float = 1.0, + subgraph_matching: str = SUBGRAPH_MATCHING_PARTIAL, + indexing: bool = True): + self.graph_of_words = graph_of_words + + # Subgraph mining patterns + self.min_df: float = min_df + self.max_df: float = max_df + if self.min_df < 0.0: + raise ValueError("min_df is smaller than 0%") + + if self.max_df < 0.0: + raise ValueError("max_df is smaller than 0%") + + self.subgraph_matching: str = subgraph_matching + + if self.graph_of_words is None: + raise ValueError("No provided graph-of-words miner to compute features (graph_of_words is None)") + + self.indexing = indexing + + def __compute_subpatterns(self) -> Sequence[Tuple[int, GraphOfWords]]: + # Filtering patterns out by support + if self.graph_of_words is not None: + max_doc_count = (self.max_df / float(self.graph_of_words.corpus_size) + if isinstance(self.max_df, numbers.Integral) + else self.max_df) + min_doc_count = (self.min_df / float(self.graph_of_words.corpus_size) + if isinstance(self.min_df, numbers.Integral) + else self.min_df) + # Selecting subpatterns + subpatterns = [subgraph for subgraph in self.graph_of_words.frequent_subgraphs + if (float(subgraph.freq) / float(self.graph_of_words.corpus_size)) >= min_doc_count + if (float(subgraph.freq) / float(self.graph_of_words.corpus_size)) <= max_doc_count + ] + else: + subpatterns = [] + + if self.indexing: + # Indexing patterns by node codes + self.node_code_to_feature_i_s_ = {} + for feature_i, subgraph in enumerate(subpatterns): + for node_code in subgraph.nodes: + if node_code not in self.node_code_to_feature_i_s_: + self.node_code_to_feature_i_s_[node_code] = set() + + self.node_code_to_feature_i_s_[node_code].add(feature_i) + + return [(i, subgraph) for i, subgraph in enumerate(subpatterns)] + + def fit(self, raw_documents: Sequence[str], y=None): + self.selected_subpatterns_: 
Sequence[Tuple[int, GraphOfWords]] = self.__compute_subpatterns() + self.node_matcher_ = iso.categorical_node_match('label', -1) + + return self + + def fit_transform(self, raw_documents: Sequence[str], y=None): + self.fit(raw_documents, y) + return self.transform(raw_documents) + + def __get_probable_features_via_nodes(self, document_nodes: Nodes) -> Generator[ + Tuple[int, GraphOfWords], None, None]: + subpatterns = self.selected_subpatterns_ + + feature_i_s = set() + for node_code in document_nodes: + if node_code in self.node_code_to_feature_i_s_: + # Getting the feature indices in which the node code appears + temp_feature_i_s = self.node_code_to_feature_i_s_[node_code] + feature_i_s.update(temp_feature_i_s) + + for feature_i in sorted(feature_i_s): + _, subgraph = subpatterns[feature_i] + yield (feature_i, subgraph) + + def __iterate_over_features(self, document_nodes: Nodes) -> Generator[Tuple[int, GraphOfWords], None, None]: + if self.indexing: + return self.__get_probable_features_via_nodes(document_nodes) + else: + subpatterns = self.selected_subpatterns_ + return subpatterns + + def __is_iso_induced(self, + feature_gow: GraphOfWords, + document_gow: GraphOfWords) -> bool: + is_iso = False + document_nodes = document_gow.nodes + document_edges = document_gow.edges + + # optimisation: + # checking nodes and edges inclusion in document before running + # subgraph matching algorithms + # + if (feature_gow.nodes.issubset(document_nodes)) and \ + (feature_gow.edges.issubset(document_edges)): + if len(feature_gow.nodes) <= 2: + is_iso = True + else: + document_graph = document_gow.to_graph() + feature_graph = feature_gow.to_graph() + GM = isomorphism.GraphMatcher(document_graph, feature_graph, + node_match=self.node_matcher_) + is_iso = GM.subgraph_is_isomorphic() + + return is_iso + + @staticmethod + def __is_iso_partial(feature_gow: GraphOfWords, + document_gow: GraphOfWords) -> bool: + return (feature_gow.nodes.issubset(document_gow.nodes)) and \ + 
def transform(self, raw_documents: Sequence[str]):
    """Transform documents into a binary document x frequent-subgraph matrix.

    Entry ``(i, j)`` is 1 when the j-th selected subgraph is found in the
    graph-of-words of the i-th document, according to the configured
    ``subgraph_matching`` strategy.

    Parameters
    ----------
    raw_documents : documents to vectorize

    Returns
    -------
    scipy.sparse.csr_matrix with one row per document and one column per
    selected subgraph (zero columns when no subgraph was selected).
    """
    subpatterns = self.selected_subpatterns_
    num_features = len(subpatterns)

    if num_features == 0:
        return csr_matrix((len(raw_documents), 0))

    use_induced = self.subgraph_matching == SUBGRAPH_MATCHING_INDUCED

    # Building blocks of the sparse matrix (CSR layout)
    indptr = [0]
    indices = []

    for document in raw_documents:
        document_gow = self.graph_of_words.compute_gow_from_document(document)

        for i_feature, feature_gow in self.__iterate_over_features(document_gow.nodes):
            if use_induced:
                is_match = self.__is_iso_induced(feature_gow, document_gow)
            else:
                is_match = GoWVectorizer.__is_iso_partial(feature_gow, document_gow)

            if is_match:
                indices.append(i_feature)

        indptr.append(len(indices))

    data = [1] * len(indices)

    # The shape is given explicitly: otherwise the number of columns is
    # inferred from the largest matched feature index, so the matrix width
    # would depend on the documents rather than on the fitted features.
    return csr_matrix((data, indices, indptr),
                      shape=(len(indptr) - 1, num_features),
                      dtype=int)
betweenness_centrality + +from typing import Sequence, Dict + +from gowpy.gow.builder import GoWBuilder, Tokenized_document +from gowpy.gow.typing import Tokenizer +from gowpy.utils.defaults import default_tokenizer + +from sklearn.base import BaseEstimator +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.pipeline import Pipeline + +from operator import itemgetter +import numbers +from collections import defaultdict + +import numpy as np +import scipy.sparse as sp + + +TERM_WEIGHT_DEGREE = "degree" +TERM_WEIGHT_DEGREE_CENTRALITY = "degree_centrality" +TERM_WEIGHT_CLOSENESS_CENTRALITY = "closeness_centrality" +TERM_WEIGHT_BETWEENNESS_CENTRALITY = "betweenness_centrality" +TERM_WEIGHT_PAGERANK = "pagerank" + + +# +# From: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L820 +# +def _document_frequency(X): + """Count the number of non-zero values for each feature in sparse X.""" + if sp.isspmatrix_csr(X): + return np.bincount(X.indices, minlength=X.shape[1]) + else: + return np.diff(X.indptr) + + +class TwVectorizer(BaseEstimator): + """Convert a collection of text documents to a matrix of graph-based weight for each token + + This implementation produces a sparse representation of the counts using + scipy.sparse.csr_matrix. + + Parameters + ---------- + max_df : float in range [0.0, 1.0] or int, default=1.0 + Ignore tokens that have a document frequency strictly + higher than the given threshold (corpus-specific stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. + min_df : float in range [0.0, 1.0] or int, default=1 + Ignore frequent subgraphs that have a document frequency strictly + lower than the given threshold. This value is also called support + in the literature. + If float, the parameter represents a proportion of documents, integer + absolute counts. 
def __tw(self, tokens: "Tokenized_document") -> "Dict[str, float]":
    """Computes the graph-based weight for each token of the document

    The document is turned into a graph-of-words and every node is weighted
    according to ``self.term_weighting``:

    - 'degree': in-degree for a directed graph, degree otherwise;
    - any other supported scheme: the matching NetworkX function applied to
      the whole graph;
    - unknown scheme: every node gets the weight 1.

    Returns
    -------
    dict mapping each token of the document to its weight.
    """
    gow = self.gow_builder_.compute_gow_from_tokenized_document(tokens)
    graph = gow.to_graph()

    if self.term_weighting == TERM_WEIGHT_DEGREE:
        degree_view = graph.in_degree if graph.is_directed() else graph.degree
        node_to_weight = dict(degree_view(graph.nodes))
    else:
        # NOTE(review): pagerank_numpy is deprecated in recent NetworkX
        # releases - confirm the pinned version still provides it.
        weighting_fct = {
            TERM_WEIGHT_DEGREE_CENTRALITY: degree_centrality,
            TERM_WEIGHT_CLOSENESS_CENTRALITY: closeness_centrality,
            TERM_WEIGHT_BETWEENNESS_CENTRALITY: betweenness_centrality,
            TERM_WEIGHT_PAGERANK: pagerank_numpy,
        }.get(self.term_weighting)

        if weighting_fct is None:
            # Fallback for an unknown scheme: constant weight of 1 per node.
            # It has to be a node -> weight mapping like the other schemes.
            node_to_weight = dict.fromkeys(graph.nodes, 1)
        else:
            node_to_weight = weighting_fct(graph)

    # Graph nodes are token codes: translate them back to tokens
    get_token = self.gow_builder_.get_token_
    return {get_token(node): weight for node, weight in node_to_weight.items()}
feature_counter[feature_idx] = tw[feature] / denominator + + except KeyError: + # Ignore out-of-vocabulary items for fixed_vocab=True + continue + + j_indices.extend(feature_counter.keys()) + data.extend(feature_counter.values()) + indptr.append(len(j_indices)) + + # disable defaultdict behaviour + if not fixed_vocab: + # disable defaultdict behaviour + vocabulary = dict(vocabulary) + if not vocabulary: + raise ValueError("empty vocabulary; perhaps the documents only" + " contain stop words") + + X = sp.csr_matrix((data, j_indices, indptr), + shape=(len(indptr) - 1, len(vocabulary)), + dtype=float) + + X.sort_indices() + + return vocabulary, X + + # + # Inspired by: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L1058 + # + def __sort_features(self, X, vocabulary): + """Sort features by name + Returns a reordered matrix and modifies the vocabulary in place + """ + sorted_features = sorted(vocabulary.items()) + map_index = np.empty(len(sorted_features), dtype=X.indices.dtype) + for new_val, (term, old_val) in enumerate(sorted_features): + vocabulary[term] = new_val + map_index[old_val] = new_val + + X.indices = map_index.take(X.indices, mode='clip') + return X + + # + # Inspired by: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L1072 + # + def __limit_features(self, X, vocabulary, high=None, low=None): + """Remove too rare or too common features. + Prune features that are non zero in more samples than high or less + documents than low, modifying the vocabulary, and restricting it to + at most the limit most frequent. + This does not prune samples with zero features. 
def fit_transform(self, raw_documents: Sequence[str], y=None):
    """Learn the vocabulary and return the document x token weight matrix.

    Parameters
    ----------
    raw_documents : documents used to fit the vectorizer
    y : ignored

    Returns
    -------
    Sparse matrix of graph-based term weights, restricted to the tokens
    whose document frequency lies within ``[min_df, max_df]``.

    Raises
    ------
    ValueError
        If the corpus is empty or contains no token at all, or if no token
        remains after document-frequency pruning.
    """
    tokenized_documents = [self.tokenizer(document) for document in raw_documents]
    N = len(tokenized_documents)
    total_length = sum(len(tok_document) for tok_document in tokenized_documents)

    # Without any token the average document length is undefined (division
    # by zero below and during length normalisation): fail with the same
    # explicit error as an empty vocabulary.
    if total_length == 0:
        raise ValueError("empty vocabulary; perhaps the documents only"
                         " contain stop words")

    self.gow_builder_ = GoWBuilder(window_size=self.window_size,
                                   directed=self.directed,
                                   tokenizer=self.tokenizer)
    self.N_ = N
    # Average document length, used for the length normalisation
    self.avdl_ = total_length / float(N)

    vocabulary, X = self.__count_vocab(tokenized_documents, fixed_vocab=False)
    X = self.__sort_features(X, vocabulary)

    # Integer thresholds are absolute counts, floats are proportions of N
    max_df = self.max_df
    min_df = self.min_df
    max_doc_count = (max_df
                     if isinstance(max_df, numbers.Integral)
                     else max_df * N)
    min_doc_count = (min_df
                     if isinstance(min_df, numbers.Integral)
                     else min_df * N)

    X, self.stop_words_ = self.__limit_features(X, vocabulary, max_doc_count, min_doc_count)

    self.vocabulary_ = vocabulary

    return X
class TwidfVectorizer(BaseEstimator):
    """Convert a collection of text documents to a TW-IDF matrix

    Equivalent to :class:`TwVectorizer` followed by
    :class:`TfidfTransformer`.

    This implementation produces a sparse representation of the weights using
    scipy.sparse.csr_matrix.

    Parameters
    ----------
    min_df : float in range [0.0, 1.0] or int, default=0.0
        Ignore tokens that have a document frequency strictly
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore tokens that have a document frequency strictly
        higher than the given threshold (corpus-specific stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    b : float {0.0, 0.003}, default=0.0
        Slope parameter of the tilting.
    directed : boolean, True by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
        Must be at least 2.
    term_weighting : string {'degree', 'degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'pagerank'}
        Graph-based term weighting approach for the nodes in the graph-of-words
        'degree' (default) : degree (undirected) or indegree (directed) of the nodes.
        'degree_centrality' : normalized degree centrality of the nodes
        'closeness_centrality' : very slow, closeness centrality of the nodes
        'betweenness_centrality' : very slow, the shortest-path betweenness centrality of the nodes
        'pagerank' : slow, the PageRank of the nodes
    tokenizer : callable or None (default)
        Override the string tokenization step.
    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`
    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.
    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    Raises
    ------
    ValueError
        If ``min_df`` or ``max_df`` is negative, or ``window_size`` < 2.
    """

    def __init__(self,
                 min_df: float = 0.0,
                 max_df: float = 1.0,
                 b: float = 0.0,
                 directed: bool = True,
                 window_size: int = 4,
                 term_weighting: str = TERM_WEIGHT_DEGREE,
                 tokenizer: Tokenizer = None,
                 #
                 norm='l2',
                 use_idf=True,
                 smooth_idf=True):
        # Document-frequency bounds used to prune the vocabulary
        self.min_df: float = min_df
        self.max_df: float = max_df
        if self.min_df < 0.0:
            raise ValueError("min_df is smaller than 0%")

        if self.max_df < 0.0:
            raise ValueError("max_df is smaller than 0%")

        self.term_weighting = term_weighting

        self.b = b

        self.tokenizer: Tokenizer = tokenizer if tokenizer is not None else default_tokenizer

        self.window_size = window_size
        if self.window_size < 2:
            raise ValueError("window_size < 2")

        self.directed = directed

        # Parameters forwarded to the TfidfTransformer
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf

    def fit(self, raw_documents: Sequence[str], y=None):
        """Fit the TW then IDF pipeline on the documents and return self."""
        self.fit_transform(raw_documents, y)
        return self

    def fit_transform(self, raw_documents: Sequence[str], y=None):
        """Fit the TW then IDF pipeline and return the TW-IDF matrix of the documents."""
        self.tw_vectorizer_ = TwVectorizer(
            min_df=self.min_df,
            max_df=self.max_df,
            b=self.b,
            directed=self.directed,
            window_size=self.window_size,
            term_weighting=self.term_weighting,
            tokenizer=self.tokenizer)
        self.tfidf_transformer_ = TfidfTransformer(
            norm=self.norm,
            use_idf=self.use_idf,
            smooth_idf=self.smooth_idf)

        self.pipeline_ = Pipeline([
            ('tw', self.tw_vectorizer_),
            ('idf', self.tfidf_transformer_)
        ])
        return self.pipeline_.fit_transform(raw_documents, y)

    def transform(self, raw_documents: Sequence[str]):
        """Return the TW-IDF matrix of the documents using the fitted pipeline."""
        return self.pipeline_.transform(raw_documents)

    def get_feature_names(self) -> Sequence[str]:
        """Return the vocabulary tokens ordered by feature (column) index."""
        return [t for t, i in sorted(self.tw_vectorizer_.vocabulary_.items(), key=itemgetter(1))]

    def _more_tags(self):
        return {'X_types': ['string']}
def mk_undirected_edge(node_start_code: int,
                       node_end_code: int,
                       code: Optional[int] = None) -> "Union[Edge, Edge_with_code]":
    """Builds an unambiguous representation of an undirected edge

    The two node codes are ordered so that the smallest comes first, which
    gives the same tuple whatever the order of the arguments. The edge code,
    when provided, is appended as a third element.
    """
    low, high = node_start_code, node_end_code
    if not low < high:
        low, high = high, low

    return (low, high) if code is None else (low, high, code)
def edges_str(self) -> List[str]:
    """Return the textual form ('start__end') of every edge of the graph.

    With edge labeling, the pair of node codes is recovered from the edge
    code through ``get_label``; otherwise it is rebuilt from the two ends
    of the edge, honouring the direction of the graph.
    """
    to_text = self.__edges_to_str

    if self.is_edge_labeling():
        label_of = self.get_label
        return [to_text(label_of(label_code)) for _, _, label_code in self.edges]

    make_edge = mk_directed_edge if self.directed else mk_undirected_edge
    return [to_text(make_edge(start, end)) for start, end in self.edges]
def to_labeled_graph(self) -> "nx.Graph":
    """Computes a NetworkX representation suited for drawing

    Nodes are the tokens themselves (not their integer codes) and edges
    carry no attribute. A new graph is built on every call.
    """
    g = nx.DiGraph() if self.directed else nx.Graph()
    get_token = self.get_token

    g.add_nodes_from(get_token(node) for node in self.nodes)

    # Edges are (start, end) or (start, end, code) tuples: only the two
    # ends are needed here, whatever the edge labeling mode.
    g.add_edges_from((get_token(edge[0]), get_token(edge[1]))
                     for edge in self.edges)

    return g
+ """ + + def __init__(self, + directed: bool = False, + window_size: int = 4, + tokenizer: Tokenizer = None, + edge_labeling: bool = False): + # Graph parameters + self.directed: bool = directed + self.window_size: int = window_size + + self.corpus_size: Optional[int] = None + + self.tokenizer: Tokenizer = tokenizer if tokenizer is not None else default_tokenizer + + self.TOKEN_TO_INT_: Dict[Token, int] = {} + self.INT_TO_TOKEN_: Dict[int, Token] = {} + + self.edge_labeling = edge_labeling + if self.edge_labeling: + self.LABEL_TO_INT_: Dict[Edge_label, int] = {} + self.INT_TO_LABEL_: Dict[int, Edge_label] = {} + + # TODO generate a real formal python representation + def __repr__(self): + return f'''Graph-of-word builder: + - is_directed: {self.directed} + - window_size: {self.window_size} + - edge_labeling: {self.edge_labeling} + + - Number of tokens: {len(self.TOKEN_TO_INT_)} + - Number of links between tokens: {len(self.LABEL_TO_INT_)} + '''.lstrip() + + def __str__(self): + return self.__repr__() + + # Node + def get_code_(self, token: Token) -> int: + if token not in self.TOKEN_TO_INT_: + last_token_id_ = len(self.TOKEN_TO_INT_) + self.TOKEN_TO_INT_[token] = last_token_id_ + self.INT_TO_TOKEN_[last_token_id_] = token + + return self.TOKEN_TO_INT_[token] + + def get_token_(self, code: int) -> Token: + return self.INT_TO_TOKEN_[code] + + # Edge + def get_label_id_(self, label: Edge_label) -> int: + if label not in self.LABEL_TO_INT_: + last_label_id_ = len(self.LABEL_TO_INT_) + self.LABEL_TO_INT_[label] = last_label_id_ + self.INT_TO_LABEL_[last_label_id_] = label + + return self.LABEL_TO_INT_[label] + + def get_label_(self, code: int) -> Edge_label: + return self.INT_TO_LABEL_[code] + + def get_edge_code_(self, edge: Edge) -> int: + node_start_code, node_end_code = edge + # Computation of the edge label and edge label ID + if self.directed: + t1, t2 = (node_start_code, node_end_code) + else: + if node_start_code < node_end_code: + t1, t2 = (node_start_code, 
def compute_gow_from_tokenized_document(self, tokens: "Tokenized_document") -> "GraphOfWords":
    """Computes a graph-of-words representation from a tokenized document

    Every distinct token becomes a node. Each token is linked to the
    ``window_size - 1`` tokens that precede it, except when both tokens are
    identical. With edge labeling, each edge also carries its edge code.
    """
    token_ids = [self.get_code_(token) for token in tokens]
    nodes = set(token_ids)

    mk_edge = mk_directed_edge if self.directed else mk_undirected_edge

    edges = set()
    for j, end_id in enumerate(token_ids):
        window_start = max(j - self.window_size + 1, 0)
        for start_id in token_ids[window_start:j]:
            # Only keep edges between two *different* tokens
            if start_id == end_id:
                continue

            if self.edge_labeling:
                edge_code = self.get_edge_code_((start_id, end_id))
                edges.add(mk_edge(start_id, end_id, edge_code))
            else:
                edges.add(mk_edge(start_id, end_id))

    return GraphOfWords(nodes=nodes,
                        edges=edges,
                        get_label=self.get_label_ if self.edge_labeling else None,
                        get_token=self.get_token_,
                        directed=self.directed)
self.compute_gow_from_tokenized_document(tokens) diff --git a/gowpy/gow/io.py b/gowpy/gow/io.py new file mode 100644 index 0000000..7428d2a --- /dev/null +++ b/gowpy/gow/io.py @@ -0,0 +1,226 @@ +import re + +from typing import Tuple, List, Sequence, Callable + +from gowpy.gow.builder import mk_undirected_edge, mk_directed_edge +from gowpy.gow.builder import GraphOfWords +from gowpy.gow.typing import Edge_label + + +def gow_to_data(gows: Sequence[GraphOfWords]) -> str: + """ + Convert a sequence of graph-of-words into a text representation for interoperability with other programs + + Format: + - "t # N" means the Nth graph, + - "v M L" means that the Mth vertex in this graph has label L, + - "e P Q L" means that there is an edge connecting the Pth vertex with the Qth vertex. The edge has label L. + + :param gows: + :return: + """ + result_data = [] + + for i, gow in enumerate(gows): + nodes = gow.nodes + edges = gow.edges + + if len(nodes) > 0: + result_data.append(u"t # {}\n".format(i)) + + node_label_to_id = {} + + for node_label in nodes: + if not (node_label in node_label_to_id): + new_id = len(node_label_to_id) + node_label_to_id[node_label] = new_id + + node_id = node_label_to_id[node_label] + result_data.append(u"v {} {}\n".format(node_id, node_label)) + + edge_tuples = [] # TODO implementation with a heap to be more efficient? 
# Patterns of the lines found in the mined subgraph files
r_new_graph_ = re.compile(r't +# +(\d+) +\* +(\d+)')      # "t # <id> * <frequency>"
r_new_vertex_ = re.compile(r'v +(\d+) +(\d+)')            # "v <vertex id> <vertex label>"
r_new_edge_ = re.compile(r'e +(\d+) +(\d+) +(\d+)')       # "e <start id> <end id> <edge label>"
r_new_parent_graphs_ = re.compile(r'x: +([\d ]+)')        # "x: <parent graph ids>"


def _iter_mined_graphs(input_file, id_offset=0, with_edges=True):
    """Yield one (id, freq, vertices, edges, parent_graph_ids) record per graph of a file.

    ``id_offset`` is added to every graph id read from the file. Edge lines
    are only interpreted when ``with_edges`` is true.
    """
    graph_id = None
    freq = None
    vertices = None
    edges = None
    parent_graph_ids = None

    with open(input_file, 'r') as f_input_file:
        for line in f_input_file:
            m_new_graph = r_new_graph_.search(line)
            if m_new_graph:
                # A new header closes the graph being read, if any
                if graph_id is not None:
                    yield graph_id, freq, vertices, edges, parent_graph_ids

                graph_id = int(m_new_graph.group(1)) + id_offset
                freq = int(m_new_graph.group(2))
                vertices = []
                edges = []
                parent_graph_ids = None
                continue

            m_new_vertex = r_new_vertex_.search(line)
            if m_new_vertex:
                vertices.append((int(m_new_vertex.group(1)),
                                 int(m_new_vertex.group(2))))
                continue

            m_new_edge = r_new_edge_.search(line) if with_edges else None
            if m_new_edge:
                edges.append((int(m_new_edge.group(1)),
                              int(m_new_edge.group(2)),
                              int(m_new_edge.group(3))))
                continue

            m_new_parent_graphs = r_new_parent_graphs_.search(line)
            if m_new_parent_graphs:
                parent_graph_ids = [int(parent_id) for parent_id in
                                    m_new_parent_graphs.group(1).split()]
            # other lines (probably empty) are ignored

    # Last graph of the file. The test is against None so that a graph
    # whose id is 0 is not dropped.
    if graph_id is not None:
        yield graph_id, freq, vertices, edges, parent_graph_ids


def load_graphs(input_file_subgraph: str,
                input_file_frequent_nodes: str,
                get_token: Callable[[int], str],
                get_label: "Callable[[int], Edge_label]",
                is_directed: bool = False) -> "Sequence[GraphOfWords]":
    """Load mined frequent subgraphs and frequent single nodes as graph-of-words.

    Parameters
    ----------
    input_file_subgraph : path of the file listing the frequent subgraphs
    input_file_frequent_nodes : path of the file listing the frequent nodes
    get_token : maps a node code to its token
    get_label : maps an edge code to its edge label
    is_directed : whether the loaded graphs are directed

    Returns
    -------
    List of graph-of-words: the subgraphs first, then the frequent nodes.
    Ids read from the second file are shifted by the number of subgraphs
    loaded from the first one.
    """
    subgraphs = []

    for graph_id, freq, vertices, edges, parent_graph_ids in _iter_mined_graphs(input_file_subgraph):
        subgraphs.append(_to_gow(graph_id, freq, (vertices, edges), parent_graph_ids,
                                 get_token, get_label,
                                 is_directed))

    padding_id = len(subgraphs)

    for graph_id, freq, vertices, edges, parent_graph_ids in _iter_mined_graphs(
            input_file_frequent_nodes, id_offset=padding_id, with_edges=False):
        subgraphs.append(_to_gow(graph_id, freq, (vertices, edges), parent_graph_ids,
                                 get_token, get_label,
                                 is_directed))

    return subgraphs
Currently, the mining operation is delegated to a C++ program. This class makes it possible to load the + mined sub-graphs-of-words. + + Parameters + ---------- + directed : boolean, False by default + If True, the graph-of-words is directed, else undirected + window_size : int, default=4 + Size of the window (in token) to build the graph-of-words. + tokenizer : callable or None (default) + Override the string tokenization step. + """ + + def __init__(self, + directed: bool = False, + window_size: int = 4, + tokenizer: Tokenizer = None): + # /!\ Edge labeling is important for IO + super().__init__(directed, window_size, tokenizer, edge_labeling=True) + self.frequent_subgraphs: Optional[Sequence[GraphOfWords]] = None + + # TODO generate a real formal python representation + def __repr__(self): + if self.frequent_subgraphs is None: + len_frequent_subgraphs = "not loaded yet" + else: + len_frequent_subgraphs = len(self.frequent_subgraphs) + return f'''Graph-of-word miner: + - is_directed: {self.directed} + - window_size: {self.window_size} + - edge_labeling: {self.edge_labeling} + + - Number of tokens: {len(self.TOKEN_TO_INT_)} + - Number of links between tokens: {len(self.LABEL_TO_INT_)} + + - Number of loaded subgraph: {len_frequent_subgraphs} + '''.lstrip() + + def load_graphs(self, + input_file_subgraph: str, + input_file_frequent_nodes: str) -> None: + self.frequent_subgraphs = gowpy.gow.io.load_graphs(input_file_subgraph, input_file_frequent_nodes, + self.get_token_, self.get_label_, + self.directed) + + def stat_freq_per_pattern(self) -> np.array: + """Computes the subgraph frequency series""" + return np.array([pattern.freq for pattern in self.frequent_subgraphs]) + + def stat_relative_freq_per_pattern(self) -> np.array: + """Computes the subgraph normalised frequency series""" + return np.array([pattern.freq / float(self.corpus_size) for pattern in self.frequent_subgraphs]) + + def stat_num_nodes_per_pattern(self) -> np.array: + """Computes the number of 
nodes per subgraph series""" + return np.array([len(pattern.nodes) for pattern in self.frequent_subgraphs]) + + def stat_num_edges_per_pattern(self) -> np.array: + """Computes the number of edges per subgraph series""" + return np.array([len(pattern.edges) for pattern in self.frequent_subgraphs]) diff --git a/gowpy/gow/typing.py b/gowpy/gow/typing.py new file mode 100644 index 0000000..325512a --- /dev/null +++ b/gowpy/gow/typing.py @@ -0,0 +1,12 @@ +from typing import Tuple, Callable, Sequence, Set, Union + +Token = str +Tokenized_document = Sequence[Token] +Tokenizer = Callable[[str], Tokenized_document] +Node = int +Nodes = Set[Node] + +Edge_label = Tuple[int, int] +Edge = Tuple[Node, Node] +Edge_with_code = Tuple[Node, Node, int] +Edges = Union[Set[Edge], Set[Edge_with_code]] diff --git a/gowpy/summarization/__init__.py b/gowpy/summarization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gowpy/summarization/unsupervised/__init__.py b/gowpy/summarization/unsupervised/__init__.py new file mode 100644 index 0000000..50b2e0d --- /dev/null +++ b/gowpy/summarization/unsupervised/__init__.py @@ -0,0 +1 @@ +from .keyword_extractor_gow import GoWKeywordExtractor \ No newline at end of file diff --git a/gowpy/summarization/unsupervised/keyword_extractor_gow.py b/gowpy/summarization/unsupervised/keyword_extractor_gow.py new file mode 100644 index 0000000..27ca107 --- /dev/null +++ b/gowpy/summarization/unsupervised/keyword_extractor_gow.py @@ -0,0 +1,48 @@ +from typing import Sequence, Tuple + +from gowpy.gow.builder import GoWBuilder +from gowpy.gow.typing import Tokenizer + +from networkx.algorithms.core import core_number + + +class GoWKeywordExtractor(object): + """Extract keywords from a text document based on a graph-of-words representation + + Parameters + ---------- + directed : boolean, False by default + If True, the graph-of-words is directed, else undirected + window_size : int, default=4 + Size of the window (in token) to build the 
from setuptools import setup, find_packages

# Runtime dependencies are kept in requirements.txt so that they are declared
# in a single place. Blank lines and comment lines are dropped because they
# are not valid requirement specifiers for ``install_requires``.
with open("requirements.txt", encoding="utf-8") as f:
    required = [
        line.strip()
        for line in f
        if line.strip() and not line.lstrip().startswith("#")
    ]

# Read the long description inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("README.md", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="gowpy",
    version="0.1.0",
    description="A very simple graph-of-words framework for NLP",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Guillaume Dubuisson Duplessis",
    author_email="guillaume@dubuissonduplessis.fr",
    url="https://github.com/GuillaumeDD/gowpy.git",
    # ``exclude`` expects a sequence of patterns. A bare string would be
    # unpacked character by character ("t", "e", "s", ...) and the tests
    # package would still be shipped. "tests.*" also covers its sub-packages.
    packages=find_packages(exclude=["tests", "tests.*"]),
    license="new BSD",
    install_requires=required,
    include_package_data=True,
    python_requires=">=3.6",
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)