From 4bc30afad3aa8faaf172cf10d147575088d720ef Mon Sep 17 00:00:00 2001 From: Guillaume Dubuisson Duplessis Date: Sun, 3 May 2020 17:09:55 +0200 Subject: [PATCH] Initial commit --- .gitignore | 107 +++ LICENSE | 29 + README.md | 348 ++++++++++ examples/README.ipynb | 618 ++++++++++++++++++ gowpy/__init__.py | 0 gowpy/feature_extraction/__init__.py | 0 gowpy/feature_extraction/gow/__init__.py | 2 + .../feature_extraction/gow/gow_vectorizer.py | 225 +++++++ gowpy/feature_extraction/gow/tw_vectorizer.py | 427 ++++++++++++ gowpy/gow/__init__.py | 0 gowpy/gow/builder.py | 264 ++++++++ gowpy/gow/io.py | 226 +++++++ gowpy/gow/miner.py | 72 ++ gowpy/gow/typing.py | 12 + gowpy/summarization/__init__.py | 0 gowpy/summarization/unsupervised/__init__.py | 1 + .../unsupervised/keyword_extractor_gow.py | 48 ++ gowpy/utils/__init__.py | 0 gowpy/utils/defaults.py | 5 + requirements.txt | 3 + resources/gow.png | Bin 0 -> 47504 bytes setup.py | 32 + 22 files changed, 2419 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 examples/README.ipynb create mode 100644 gowpy/__init__.py create mode 100644 gowpy/feature_extraction/__init__.py create mode 100644 gowpy/feature_extraction/gow/__init__.py create mode 100644 gowpy/feature_extraction/gow/gow_vectorizer.py create mode 100644 gowpy/feature_extraction/gow/tw_vectorizer.py create mode 100644 gowpy/gow/__init__.py create mode 100644 gowpy/gow/builder.py create mode 100644 gowpy/gow/io.py create mode 100644 gowpy/gow/miner.py create mode 100644 gowpy/gow/typing.py create mode 100644 gowpy/summarization/__init__.py create mode 100644 gowpy/summarization/unsupervised/__init__.py create mode 100644 gowpy/summarization/unsupervised/keyword_extractor_gow.py create mode 100644 gowpy/utils/__init__.py create mode 100644 gowpy/utils/defaults.py create mode 100644 requirements.txt create mode 100644 resources/gow.png create mode 100644 setup.py diff --git a/.gitignore 
b/.gitignore new file mode 100644 index 0000000..b81fba6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,107 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# pycharm +.idea diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b01acfd --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2020, Guillaume Dubuisson Duplessis +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..fcf4985 --- /dev/null +++ b/README.md @@ -0,0 +1,348 @@ +# gowpy + +A very simple framework for exploiting graph-of-words in NLP. +Currently at version **0.1.0** (alpha). + +gowpy leverages graph-of-words representation in order to do: +- **document classification** in a [scikit-learn](https://scikit-learn.org)-like + way via useful vectorizers, and +- **keyword extraction** from a document. + +## Quick Start +### Requirements and Installation +This project is based on Python 3.6+, [scikit-learn](https://github.com/scikit-learn/scikit-learn) and +[NetworkX](https://github.com/networkx/networkx). 
+ +#### Installation from PyPI +```bash +pip install gowpy +``` + +#### Installation from the GitHub Source +First, clone the project: +```bash +git clone https://github.com/GuillaumeDD/gowpy.git +``` + +Then, `cd` to the project folder and run the install command: +```bash +cd gowpy/ +python setup.py install +``` + +### Example Usage + +#### Building a Graph-of-Words from a Document + +```python +from gowpy.gow.builder import GoWBuilder + +# Creation of a graph-of-words builder +# Here: +# - the graph-of-words will be directed, and +# - an edge will link all tokens co-occurring in a sliding window of size 4 +# +builder = GoWBuilder(directed=True, window_size=4) + +text = """gowpy is a simple framework for exploiting graph-of-words in nlp gowpy +leverages graph-of-words representation for document classification and for keyword extraction +from a document""" + +# Here, a preprocessing step fitted to the needs of the project should be carried out + +# Creation of the graph-of-words +gow = builder.compute_gow_from_document(text) +``` + +Then, it is possible to visualize the document as a graph-of-words: +```python +import matplotlib.pyplot as plt +import networkx as nx + +g = gow.to_labeled_graph() + +options = { + "font_weight" : 'normal', + "font_color" : 'darkblue', + # + "edge_color" : 'lightgray', + # + "node_size" : 200, + "node_color": 'white', + "with_labels": True, +} +nx.draw(g, **options) +``` + +![A graph-of-words example](./resources/gow.png) + +#### Unsupervised Keywords Extraction +Graph-of-words can be leveraged to extract an automatically adaptive number of +cohesive keywords from a text document in an unsupervised fashion [[2,3]](#references). 
+ +```python +from gowpy.summarization.unsupervised import GoWKeywordExtractor + +# Initialization of the keyword extractor +extractor_kw = GoWKeywordExtractor(directed=False, window_size=4) + +# +# Note that preprocessing is particularly important for keyword extraction +# in order to keep and normalize important terms such as adjectives and nouns. +# +# An already preprocessed text in which to extract keywords +preprocessed_text = """gowpy simple framework exploiting graph-of-words nlp gowpy +leverages graph-of-words representation document classification keyword extraction +document""" + +extractor_kw.extract(preprocessed_text) +``` + +Returns: +```text +[('gowpy', 4), + ('simple', 4), + ('framework', 4), + ('exploiting', 4), + ('graph-of-words', 4), + ('nlp', 4)] +``` + + +#### Classification with TW-IDF: a graph-based term weighting score +TW-IDF [[0]](#references) challenges the term independence assumption behind +the bag-of-words model by (i) exploiting a graph-of-words representation of a +document (here an unweighted directed graph of terms), and by (ii) leveraging +this new representation to replace the term frequency (TF) by graph-based term +weights (TW). 
+ +TW-IDF is accessible via a dedicated vectorizer: +```python +from gowpy.feature_extraction.gow import TwidfVectorizer + +corpus = [ + 'hello world !', + 'foo bar' +] + +vectorizer_gow = TwidfVectorizer( + # Graph-of-words specificities + directed=True, + window_size=4, + # Token frequency filtering + min_df=0.0, + max_df=1.0, + # Graph-based term weighting approach + term_weighting='degree' +) + +X = vectorizer_gow.fit_transform(corpus) +X +``` +Returns: +```text +<2x5 sparse matrix of type '' + with 3 stored elements in Compressed Sparse Row format> +``` + +TW-IDF vectorizer fits seamlessly in a grid search: +```python +from sklearn.pipeline import Pipeline +from sklearn.svm import SVC + +from sklearn.model_selection import GridSearchCV + +pipeline = Pipeline([ + ('gow', TwidfVectorizer()), + ('svm', SVC()), +]) + +parameters = { + 'gow__directed' : [True, False], + 'gow__window_size' : [2, 4, 8, 16], + 'gow__b' : [0.0, 0.003], + 'gow__term_weighting' : ['degree', 'pagerank'], + 'gow__min_df' : [0, 5, 10], + 'gow__max_df' : [0.8, 0.9, 1.0], +# + 'svm__C' : [0.1, 1, 10], + 'svm__kernel' : ['linear'] +} + +# find the best parameters for both the feature extraction and the +# classifier +grid_search = GridSearchCV(pipeline, + parameters, + cv=10, + n_jobs=-1) +``` + +#### Going further: classification based on frequent subgraphs +Frequent subgraphs corresponding to long range n-gram can be mined and +subsequently used for document classification [[1]](#references). + +Classification with frequent subgraphs happens in a 3-step process: +1. Conversion of the corpus of already preprocessed documents into a collection + of graph-of-words +1. Mining the frequent subgraphs +1. 
Loading the frequent subgraphs and exploiting them for classification + +##### Conversion of the corpus into a collection of graph-of-words +The first step consists in turning the corpus into a collection of graph-of-words +and then exporting that collection into a file format suited for frequent subgraph +mining. +```python +from gowpy.gow.miner import GoWMiner +import gowpy.gow.io + +corpus = [ + 'hello world !', + 'foo bar', + # and many more... +] + +# Conversion of the corpus into a collection of graph-of-words +gow_miner = GoWMiner(directed=False, window_size=4) +corpus_gows = gow_miner.compute_gow_from_corpus(corpus) + +# Export of the collection of graph-of-words into a file for +# interoperability with other languages such as C++ +with open("corpus_gows.data", "w") as f_output: + data = gowpy.gow.io.gow_to_data(corpus_gows) + f_output.write(data) +``` + +##### Mining the frequent subgraphs +Frequent subgraph mining can be performed via the [gSpan algorithm](https://www.cs.ucsb.edu/~xyan/software/gSpan.htm). +This step is not included in this project and has to be carried out by another +program. + +This project supports the reimplementation from [gBolt available at GitHub](https://github.com/Jokeren/gBolt). +Currently this implementation is limited to **undirected graphs**. +To mine frequent subgraphs (after having installed gBolt on your machine): +```bash +OMP_NUM_THREADS=1 ./gbolt --input corpus_gows.data --output gbolt-mining-corpus_gow --dfs --nodes --support 0.01 +``` +Notice the **support parameter** which defines the minimum frequency of a subgraph +to be considered as frequent. Here it is set to 1% (0.01). +This parameter is **corpus specific** and should be carefully tuned (see [[1]](#references)). 
+ +Mining produces two files: +- `gbolt-mining-corpus_gow.t0`: the frequent subgraphs with more than one node +- `gbolt-mining-corpus_gow.nodes`: the frequent single nodes + +These two files can be loaded by the same `gow_miner` used for exportation: +```python +gow_miner.load_graphs('gbolt-mining-corpus_gow.t0', + 'gbolt-mining-corpus_gow.nodes') +gow_miner +``` +Returns: +```text +Graph-of-word miner: + - is_directed: False + - window_size: 4 + - edge_labeling: True + + - Number of tokens: 5 + - Number of links between tokens: 4 + + - Number of loaded subgraph: 13 +``` + +##### Classification with frequent subgraphs +Classification with frequent subgraphs is accessible via a dedicated vectorizer: +```python +from gowpy.feature_extraction.gow import GoWVectorizer + +vectorizer_gow = GoWVectorizer(gow_miner) +X = vectorizer_gow.fit_transform(corpus) +# X is a sparse matrix +``` + +Before tuning the `min_df` (the minimum being the support chosen during mining) +and the `max_df`, it is possible to have a look at the normalized frequency +distribution: +```python +import pandas as pd +s_freq_per_pattern = pd.Series(gow_miner.stat_relative_freq_per_pattern()) +s_freq_per_pattern.describe() +``` +For instance, it can return the following distribution: +```text +count 10369.000000 +mean 0.026639 +std 0.046551 +min 0.008333 +25% 0.010000 +50% 0.013333 +75% 0.022778 +max 0.865000 +dtype: float64 +``` + + +GoW vectorizer fits nicely in a grid search: +```python +from sklearn.pipeline import Pipeline +from sklearn.svm import SVC +from sklearn.feature_extraction.text import TfidfTransformer + +from sklearn.model_selection import GridSearchCV + +pipeline = Pipeline([ + ('gow', GoWVectorizer(gow_miner)), + ('tfidf', TfidfTransformer()), + ('svm', SVC()), +]) + +parameters = { + 'gow__subgraph_matching' : ['partial', 'induced'], + 'gow__min_df' : [0.00833, 0.01, 0.013333], + 'gow__max_df' : [0.022778, 0.5, 0.865], +# + 'svm__C' : [0.1, 1, 10], + 'svm__kernel' : ['linear'] +} + 
+# find the best parameters for both the feature extraction and the +# classifier +grid_search = GridSearchCV(pipeline, + parameters, + cv=10, + n_jobs=-1) +``` + +## References + +Detailed explanations, evaluations and discussions can be found in these papers: +- Information retrieval (TW-IDF) + + [0] [Graph-of-word and TW-IDF: New Approach to Ad Hoc IR](https://dl.acm.org/doi/abs/10.1145/2505515.2505671). + *Rousseau, François, and Michalis Vazirgiannis*. + *Proceedings of the 22nd ACM international conference on Information & Knowledge Management*. (**CIKM 2013**) +- Document classification with frequent subgraphs + + [1] [Text Categorization as a Graph Classification Problem](http://www.aclweb.org/anthology/P15-1164). + *Rousseau, François, Emmanouil Kiagias, and Michalis Vazirgiannis*. + *Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International + Joint Conference on Natural Language Processing* (**ACL 2015**) +- Keyword extraction from graph-of-words + + [2] [Main Core Retention on Graph-of-words for Single-Document Keyword Extraction](https://link.springer.com/chapter/10.1007/978-3-319-16354-3_42). + *Rousseau, François, and Michalis Vazirgiannis*. + *Proceedings of the 37th European Conference on Information Retrieval*. + (**ECIR 2015**) + + [3] [A Graph Degeneracy-based Approach to Keyword Extraction](https://www.aclweb.org/anthology/D16-1191/). + *Tixier, Antoine, Fragkiskos Malliaros, and Michalis Vazirgiannis*. + *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*. 
+ (**EMNLP 2016**) + +This library involves the following algorithms: +- Frequent subgraph Mining (**currently not included in this library**) + + gSpan algorithm implementation for subgraph mining: [gBolt--very fast implementation for gSpan algorithm in data mining ](https://github.com/Jokeren/gBolt) +- Subgraph matching + + VF2 algorithm for subgraph isomorphism matching: [VF2 algorithm for subgraph isomorphism from NetworkX](https://networkx.github.io/documentation/stable/reference/algorithms/isomorphism.vf2.html) +- Graph degeneracy + + [k-core decomposition with NetworkX](https://networkx.github.io/documentation/stable/reference/algorithms/core.html) + + +## License +Released under the 3-Clause BSD license (see [LICENSE file](./LICENSE)) diff --git a/examples/README.ipynb b/examples/README.ipynb new file mode 100644 index 0000000..fc18ddc --- /dev/null +++ b/examples/README.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# gowpy: README.md examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building a Graph-of-Words from a Document" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.gow.builder import GoWBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "builder = GoWBuilder(directed=True, \n", + " window_size=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"\"\"gowpy is a simple framework for exploiting graph-of-words in nlp gowpy \n", + "leverages graph-of-words representation for document classification and for keyword extraction \n", + "from a document\"\"\"\n", + "# ...\n", + "preprocessed_text = \"\"\"gowpy simple framework exploiting graph-of-words nlp gowpy \n", + "leverages graph-of-words representation document classification keyword extraction 
document\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "gow = builder.compute_gow_from_document(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph-of-words\n", + "Nodes: ['gowpy', 'is', 'a', 'simple', 'framework', 'for', 'exploiting', 'graph-of-words', 'in', 'nlp', 'leverages', 'representation', 'document', 'classification', 'and', 'keyword', 'extraction', 'from']\n", + "Edges: ['framework__graph-of-words', 'is__simple', 'exploiting__nlp', 'leverages__representation', 'for__exploiting', 'gowpy__a', 'graph-of-words__document', 'in__gowpy', 'extraction__document', 'for__classification', 'gowpy__graph-of-words', 'extraction__a', 'in__nlp', 'document__for', 'keyword__extraction', 'gowpy__leverages', 'a__document', 'graph-of-words__representation', 'a__for', 'gowpy__simple', 'for__in', 'is__a', 'extraction__from', 'nlp__gowpy', 'exploiting__graph-of-words', 'and__for', 'representation__for', 'leverages__graph-of-words', 'document__classification', 'for__document', 'in__leverages', 'from__a', 'gowpy__representation', 'simple__exploiting', 'simple__framework', 'nlp__graph-of-words', 'representation__classification', 'document__and', 'framework__for', 'for__from', 'classification__keyword', 'is__framework', 'nlp__leverages', 'graph-of-words__for', 'a__simple', 'and__keyword', 'for__keyword', 'representation__document', 'simple__for', 'gowpy__is', 'graph-of-words__nlp', 'leverages__for', 'keyword__from', 'graph-of-words__gowpy', 'framework__exploiting', 'exploiting__in', 'and__extraction', 'classification__and', 'for__extraction', 'keyword__a', 'for__graph-of-words', 'classification__for', 'for__and', 'from__document', 'graph-of-words__in', 'a__framework']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gow" + ] + }, + { + "cell_type": "code", + 
"execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import networkx as nx" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "g = gow.to_labeled_graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "options = {\n", + " \"font_weight\" : 'normal',\n", + " \"font_color\" : 'darkblue',\n", + " #\n", + " \"edge_color\" : 'lightgray',\n", + " #\n", + " \"node_size\" : 200,\n", + " \"node_color\": 'white',\n", + " \"with_labels\": True,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nx.draw(g, **options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unsupervised Keywords Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.summarization.unsupervised import GoWKeywordExtractor" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "extractor_kw = GoWKeywordExtractor(directed=False, window_size=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "15" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessed_text = \"\"\"gowpy simple framework exploiting graph-of-words nlp gowpy \n", + "leverages graph-of-words representation document classification keyword extraction \n", + "document\"\"\"\n", + "len(preprocessed_text.split())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('gowpy', 4),\n", + " ('simple', 4),\n", + " ('framework', 4),\n", + " ('exploiting', 4),\n", + " ('graph-of-words', 4),\n", + " ('nlp', 4)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extractor_kw.extract(preprocessed_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification with TW-IDF: a graph-based term weighting score" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.feature_extraction.gow import TwidfVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " 'hello world !',\n", + " 'foo bar'\n", + "]" + ] + }, + { + 
"cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer_gow = TwidfVectorizer( \n", + " # Graph-of-words specificities\n", + " directed=True,\n", + " window_size=4,\n", + " # Token frequency filtering\n", + " min_df=0.0,\n", + " max_df=1.0,\n", + " # Graph-based term weighting approach\n", + " term_weighting='degree'\n", + ")\n", + "\n", + "X = vectorizer_gow.fit_transform(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.89442719, 0. , 0. , 0. , 0.4472136 ],\n", + " [0. , 1. , 0. , 0. , 0. ]])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<2x5 sparse matrix of type ''\n", + "\twith 3 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.svm import SVC\n", + "\n", + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "pipeline = Pipeline([\n", + " ('gow', TwidfVectorizer()),\n", + " ('svm', SVC()),\n", + "])\n", + "\n", + "parameters = {\n", + " 'gow__directed' : [True, False],\n", + " 'gow__window_size' : [2, 4, 8, 16],\n", + " 'gow__b' : [0.0, 0.003],\n", + " 'gow__term_weighting' : ['degree', 'pagerank'],\n", + " 'gow__min_df' : [0, 5, 10],\n", + " 'gow__max_df' : [0.8, 0.9, 1.0],\n", + "#\n", + " 'svm__C' : [0.1, 1, 10],\n", + " 'svm__kernel' : ['linear']\n", + "}\n", + "\n", + "# find the best parameters for both the feature extraction and the\n", + "# classifier\n", + "grid_search = GridSearchCV(pipeline, \n", + " 
parameters, \n", + " cv=10,\n", + " n_jobs=-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Going further: classification based on frequent subgraphs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conversion of the corpus into a collection of graph-of-words" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from gowpy.gow.miner import GoWMiner\n", + "import gowpy.gow.io" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " 'hello world !',\n", + " 'foo bar',\n", + " # and many more...\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "gow_miner = GoWMiner(directed=False, window_size=4)\n", + "corpus_gows = gow_miner.compute_gow_from_corpus(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"corpus_gows.data\", \"w\") as f_output:\n", + " data = gowpy.gow.io.gow_to_data(corpus_gows)\n", + " f_output.write(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mining the frequent subgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph-of-word miner:\n", + " - is_directed: False\n", + " - window_size: 4\n", + " - edge_labeling: True\n", + "\n", + " - Number of tokens: 5\n", + " - Number of links between tokens: 4\n", + "\n", + " - Number of loaded subgraph: 13\n", + " " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gow_miner.load_graphs('gbolt-mining-corpus_gow.t0', \n", + " 'gbolt-mining-corpus_gow.nodes')\n", + "gow_miner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classification with 
frequent subgraphs" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 13.0\n", + "mean 0.5\n", + "std 0.0\n", + "min 0.5\n", + "25% 0.5\n", + "50% 0.5\n", + "75% 0.5\n", + "max 0.5\n", + "dtype: float64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "s_freq_per_pattern = pd.Series(gow_miner.stat_relative_freq_per_pattern())\n", + "s_freq_per_pattern.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<2x13 sparse matrix of type ''\n", + "\twith 13 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from gowpy.feature_extraction.gow import GoWVectorizer\n", + "\n", + "vectorizer_gow = GoWVectorizer(gow_miner)\n", + "X = vectorizer_gow.fit_transform(corpus)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = vectorizer_gow.get_feature_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nombre de features: 10\n", + "\t- hello world hello__world\n", + "\t- hello world ! world__! hello__world\n", + "\t- hello world ! world__! hello__world hello__!\n", + "\t- hello world ! hello__world hello__!\n", + "\t- hello ! hello__!\n", + "\t- hello world ! world__! hello__!\n", + "\t- world ! 
world__!\n", + "\t- hello\n", + "\t- !\n", + "\t- world\n" + ] + } + ], + "source": [ + "features = [feature for presence, feature in zip(X.toarray()[0], feature_names) if presence > 0]\n", + "print(\"Nombre de features: {}\".format(len(features)))\n", + "for feature in features:\n", + " print(f'\\t- {feature}')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = Pipeline([\n", + " ('gow', GoWVectorizer(gow_miner)),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('svm', SVC()),\n", + "])\n", + "\n", + "parameters = {\n", + " 'gow__subgraph_matching' : ['partial', 'induced'],\n", + " 'gow__min_df' : [0.00833, 0.01, 0.013333],\n", + " 'gow__max_df' : [0.022778, 0.25, 0.5, 1.0],\n", + "#\n", + " 'svm__C' : [0.1, 1, 10],\n", + " 'svm__kernel' : ['linear']\n", + "}\n", + "\n", + "# find the best parameters for both the feature extraction and the\n", + "# classifier\n", + "grid_search = GridSearchCV(pipeline, \n", + " parameters, \n", + " cv=10,\n", + " n_jobs=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:gowpy36]", + "language": "python", + "name": "conda-env-gowpy36-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + 
"skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gowpy/__init__.py b/gowpy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gowpy/feature_extraction/__init__.py b/gowpy/feature_extraction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gowpy/feature_extraction/gow/__init__.py b/gowpy/feature_extraction/gow/__init__.py new file mode 100644 index 0000000..bc2fa07 --- /dev/null +++ b/gowpy/feature_extraction/gow/__init__.py @@ -0,0 +1,2 @@ +from .gow_vectorizer import GoWVectorizer +from .tw_vectorizer import TwVectorizer, TwidfVectorizer diff --git a/gowpy/feature_extraction/gow/gow_vectorizer.py b/gowpy/feature_extraction/gow/gow_vectorizer.py new file mode 100644 index 0000000..1918667 --- /dev/null +++ b/gowpy/feature_extraction/gow/gow_vectorizer.py @@ -0,0 +1,225 @@ +import networkx.algorithms.isomorphism as iso +from networkx.algorithms import isomorphism + +import numbers + +from scipy.sparse import csr_matrix + +from typing import Sequence, Tuple, Generator + +from gowpy.gow.builder import GraphOfWords +from gowpy.gow.typing import Nodes + +from sklearn.base import BaseEstimator +from gowpy.gow.miner import GoWMiner + +SUBGRAPH_MATCHING_INDUCED = "induced" +SUBGRAPH_MATCHING_PARTIAL = "partial" + + +class GoWVectorizer(BaseEstimator): + """Convert a collection of text documents to a matrix of frequent subgraphs matching counts + + Frequent subgraphs have to be mined before using this vectorizer. + + This implementation produces a sparse representation of the counts using + scipy.sparse.csr_matrix. + + Parameters + ---------- + graph_of_words: GoWMiner + A graph-of-words miner containing the frequent subgraphs. 
class GoWVectorizer(BaseEstimator):
    """Convert a collection of text documents to a matrix of frequent subgraph matches

    Frequent subgraphs have to be mined before using this vectorizer.

    This implementation produces a sparse representation of the matches using
    scipy.sparse.csr_matrix: cell (i, j) holds 1 when the j-th selected
    frequent subgraph is found in the graph-of-words of the i-th document.

    Parameters
    ----------
    graph_of_words : GoWMiner
        A graph-of-words miner containing the frequent subgraphs.
    min_df : float in range [0.0, 1.0] or int, default=0.0
        Ignore frequent subgraphs that have a document frequency strictly
        lower than the given threshold. This value is also called support
        in the literature.
        Note that the smallest value is defined with the support used when
        mining frequent subgraphs.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore frequent subgraphs that have a document frequency strictly
        higher than the given threshold (corpus-specific frequent subgraphs).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    subgraph_matching : string {'induced', 'partial'}
        Frequent subgraph matching approach
        'partial' (default) : subgraph matching corresponding to node and
        edge inclusion.
        'induced' : slower approach, node-induced subgraph matching
    indexing : boolean, True by default
        Frequent subgraphs are indexed for faster retrieval when computing
        document features.
    """

    def __init__(self,
                 graph_of_words: GoWMiner,
                 min_df: float = 0.0,
                 max_df: float = 1.0,
                 subgraph_matching: str = SUBGRAPH_MATCHING_PARTIAL,
                 indexing: bool = True):
        self.graph_of_words = graph_of_words

        # Support thresholds used to select the frequent subgraphs
        self.min_df: float = min_df
        self.max_df: float = max_df
        if self.min_df < 0.0:
            raise ValueError("min_df is smaller than 0%")

        if self.max_df < 0.0:
            raise ValueError("max_df is smaller than 0%")

        self.subgraph_matching: str = subgraph_matching

        if self.graph_of_words is None:
            raise ValueError("No provided graph-of-words miner to compute features (graph_of_words is None)")

        self.indexing = indexing

    def __compute_subpatterns(self) -> Sequence[Tuple[int, GraphOfWords]]:
        """Select the frequent subgraphs within [min_df, max_df] and index them.

        Returns the selected subgraphs paired with their feature index. When
        ``indexing`` is enabled, also (re)builds ``node_code_to_feature_i_s_``,
        a mapping from node code to the indices of the features using it.
        """
        miner = self.graph_of_words

        if miner is not None:
            def as_proportion(threshold):
                # Integer thresholds are absolute document counts
                if isinstance(threshold, numbers.Integral):
                    return threshold / float(miner.corpus_size)
                return threshold

            min_doc_count = as_proportion(self.min_df)
            max_doc_count = as_proportion(self.max_df)

            # Filtering patterns out by support
            subpatterns = [
                subgraph for subgraph in miner.frequent_subgraphs
                if min_doc_count
                <= (float(subgraph.freq) / float(miner.corpus_size))
                <= max_doc_count
            ]
        else:
            subpatterns = []

        if self.indexing:
            # Indexing patterns by node codes
            self.node_code_to_feature_i_s_ = {}
            for feature_i, subgraph in enumerate(subpatterns):
                for node_code in subgraph.nodes:
                    self.node_code_to_feature_i_s_.setdefault(node_code, set()).add(feature_i)

        return list(enumerate(subpatterns))

    def fit(self, raw_documents: Sequence[str], y=None):
        """Select the features from the mined subgraphs (documents are not read)."""
        self.selected_subpatterns_: Sequence[Tuple[int, GraphOfWords]] = self.__compute_subpatterns()
        self.node_matcher_ = iso.categorical_node_match('label', -1)

        return self

    def fit_transform(self, raw_documents: Sequence[str], y=None):
        """Fit the vectorizer, then return the feature matrix of the documents."""
        self.fit(raw_documents, y)
        return self.transform(raw_documents)

    def __get_probable_features_via_nodes(self, document_nodes: Nodes) -> Generator[
            Tuple[int, GraphOfWords], None, None]:
        """Yield, in index order, the features sharing a node with the document."""
        subpatterns = self.selected_subpatterns_

        feature_i_s = set()
        for node_code in document_nodes:
            # Feature indices in which the node code appears (if any)
            feature_i_s.update(self.node_code_to_feature_i_s_.get(node_code, ()))

        for feature_i in sorted(feature_i_s):
            _, subgraph = subpatterns[feature_i]
            yield (feature_i, subgraph)

    def __iterate_over_features(self, document_nodes: Nodes) -> Generator[Tuple[int, GraphOfWords], None, None]:
        """Yield the candidate (index, subgraph) pairs to test on a document."""
        if self.indexing:
            yield from self.__get_probable_features_via_nodes(document_nodes)
        else:
            yield from self.selected_subpatterns_

    def __is_iso_induced(self,
                         feature_gow: GraphOfWords,
                         document_gow: GraphOfWords) -> bool:
        """Tell whether the feature matches the document with NetworkX subgraph isomorphism."""
        # optimisation:
        # checking nodes and edges inclusion in document before running
        # subgraph matching algorithms
        if not (feature_gow.nodes.issubset(document_gow.nodes)
                and feature_gow.edges.issubset(document_gow.edges)):
            return False

        if len(feature_gow.nodes) <= 2:
            return True

        GM = isomorphism.GraphMatcher(document_gow.to_graph(),
                                      feature_gow.to_graph(),
                                      node_match=self.node_matcher_)
        return GM.subgraph_is_isomorphic()

    @staticmethod
    def __is_iso_partial(feature_gow: GraphOfWords,
                         document_gow: GraphOfWords) -> bool:
        """Tell whether every node and edge of the feature is in the document."""
        return (feature_gow.nodes.issubset(document_gow.nodes)) and \
               (feature_gow.edges.issubset(document_gow.edges))

    def transform(self, raw_documents: Sequence[str]):
        """Return the documents-by-features matrix of subgraph matches.

        The matrix always has one column per selected subgraph, whichever
        features actually match in the given documents.
        """
        num_features = len(self.selected_subpatterns_)

        if num_features == 0:
            return csr_matrix((len(raw_documents), 0))

        if self.subgraph_matching == SUBGRAPH_MATCHING_INDUCED:
            is_match = self.__is_iso_induced
        else:
            is_match = GoWVectorizer.__is_iso_partial

        # Building blocks of the sparse matrix
        indptr = [0]
        indices = []
        for document in raw_documents:
            document_gow = self.graph_of_words.compute_gow_from_document(document)
            indices.extend(
                i_feature
                for i_feature, feature_gow in self.__iterate_over_features(document_gow.nodes)
                if is_match(feature_gow, document_gow)
            )
            indptr.append(len(indices))

        data = [1] * len(indices)

        # The shape is given explicitly: otherwise the number of columns is
        # inferred from the largest matched index and may differ between calls.
        return csr_matrix((data, indices, indptr),
                          shape=(len(indptr) - 1, num_features),
                          dtype=int)

    def get_feature_names(self) -> Sequence[str]:
        """Return one name per feature: its node tokens then its edges, space-separated."""
        return [' '.join(list(subgraph.nodes_str()) + list(subgraph.edges_str()))
                for _, subgraph in self.selected_subpatterns_]

    def _more_tags(self):
        return {'X_types': ['string']}
# --- gowpy/feature_extraction/gow/tw_vectorizer.py ---
import networkx as nx
try:
    from networkx.algorithms.link_analysis.pagerank_alg import pagerank_numpy
except ImportError:
    # pagerank_numpy is not available in recent NetworkX releases
    from networkx.algorithms.link_analysis.pagerank_alg import pagerank as pagerank_numpy
from networkx.algorithms.centrality import degree_centrality, closeness_centrality, betweenness_centrality

from typing import Sequence, Dict

from gowpy.gow.builder import GoWBuilder, Tokenized_document
from gowpy.gow.typing import Tokenizer
from gowpy.utils.defaults import default_tokenizer

from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from operator import itemgetter
import numbers
from collections import defaultdict

import numpy as np
import scipy.sparse as sp


TERM_WEIGHT_DEGREE = "degree"
TERM_WEIGHT_DEGREE_CENTRALITY = "degree_centrality"
TERM_WEIGHT_CLOSENESS_CENTRALITY = "closeness_centrality"
TERM_WEIGHT_BETWEENNESS_CENTRALITY = "betweenness_centrality"
TERM_WEIGHT_PAGERANK = "pagerank"

# Graph-level weighting functions: each takes a graph and returns {node: weight}
_WEIGHTING_FUNCTIONS = {
    TERM_WEIGHT_DEGREE_CENTRALITY: degree_centrality,
    TERM_WEIGHT_CLOSENESS_CENTRALITY: closeness_centrality,
    TERM_WEIGHT_BETWEENNESS_CENTRALITY: betweenness_centrality,
    TERM_WEIGHT_PAGERANK: pagerank_numpy,
}


#
# From: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L820
#
def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return np.bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(X.indptr)


class TwVectorizer(BaseEstimator):
    """Convert a collection of text documents to a matrix of graph-based weight for each token

    This implementation produces a sparse representation of the weights using
    scipy.sparse.csr_matrix.

    Parameters
    ----------
    min_df : float in range [0.0, 1.0] or int, default=0.0
        Ignore tokens that have a document frequency strictly
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore tokens that have a document frequency strictly
        higher than the given threshold (corpus-specific stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    b : float, default=0.0
        Slope parameter of the tilting: each weight is divided by
        ``1 - b + b * (document length / average document length)``.
    directed : boolean, True by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
    term_weighting : string {'degree', 'degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'pagerank'}
        Graph-based term weighting approach for the nodes in the graph-of-words
        'degree' (default) : degree (undirected) or indegree (directed) of the nodes.
        'degree_centrality' : normalized degree centrality of the nodes
        'closeness_centrality' : very slow, closeness centrality of the nodes
        'betweenness_centrality' : very slow, the shortest-path betweenness centrality of the nodes
        'pagerank' : slow, the PageRank of the nodes
        Any other value gives every token a constant weight of 1.
    tokenizer : callable or None (default)
        Override the string tokenization step.
    """
    def __init__(self,
                 min_df: float = 0.0,
                 max_df: float = 1.0,
                 b: float = 0.0,
                 directed: bool = True,
                 window_size: int = 4,
                 term_weighting: str = TERM_WEIGHT_DEGREE,
                 tokenizer: Tokenizer = None):
        # Document-frequency thresholds used to prune the vocabulary
        self.min_df: float = min_df
        self.max_df: float = max_df
        if self.min_df < 0.0:
            raise ValueError("min_df is smaller than 0%")

        if self.max_df < 0.0:
            raise ValueError("max_df is smaller than 0%")

        self.term_weighting = term_weighting

        self.b = b

        self.tokenizer: Tokenizer = tokenizer if tokenizer is not None else default_tokenizer

        self.window_size = window_size
        if self.window_size < 2:
            raise ValueError("window_size < 2")

        self.directed = directed

    def __tw(self, tokens: Tokenized_document) -> Dict[str, float]:
        """Computes the graph-based weight for each token of the document"""
        gow = self.gow_builder_.compute_gow_from_tokenized_document(tokens)
        graph = gow.to_graph()

        if self.term_weighting == TERM_WEIGHT_DEGREE:
            degrees = graph.in_degree() if graph.is_directed() else graph.degree()
            node_to_weight = dict(degrees)
        else:
            weighting_fct = _WEIGHTING_FUNCTIONS.get(self.term_weighting)
            if weighting_fct is None:
                # Unknown weighting scheme: constant weight for every node
                node_to_weight = dict.fromkeys(graph.nodes, 1)
            else:
                node_to_weight = weighting_fct(graph)

        get_token = self.gow_builder_.get_token_
        return {get_token(node): weight for node, weight in node_to_weight.items()}

    #
    # Largely inspired by the CountVectorizer from scikit-learn
    # See: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L1113
    #
    def __count_vocab(self, tokenized_documents: Sequence[Tokenized_document], fixed_vocab: bool):
        """Build the documents-by-tokens weight matrix.

        With ``fixed_vocab`` the fitted vocabulary is used and unknown tokens
        are skipped; otherwise a new vocabulary is grown on the fly.
        Returns the (vocabulary, matrix) pair.
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Unknown tokens are given the next free index on first access
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        j_indices = []
        indptr = [0]
        data = []

        avdl = self.avdl_

        for tokens in tokenized_documents:
            feature_counter = {}

            tw = self.__tw(tokens)

            # Document-length normalisation; skipped when the average length
            # is zero (only empty documents were seen at fit time)
            length_ratio = (float(len(tokens)) / avdl) if avdl > 0 else 1.0
            denominator = 1.0 - self.b + self.b * length_ratio

            for feature in tokens:
                try:
                    feature_idx = vocabulary[feature]
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = tw[feature] / denominator

            j_indices.extend(feature_counter.keys())
            data.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        X = sp.csr_matrix((data, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=float)

        X.sort_indices()

        return vocabulary, X

    #
    # Inspired by: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L1058
    #
    def __sort_features(self, X, vocabulary):
        """Sort features by name
        Returns a reordered matrix and modifies the vocabulary in place
        """
        sorted_features = sorted(vocabulary.items())
        map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
        for new_val, (term, old_val) in enumerate(sorted_features):
            vocabulary[term] = new_val
            map_index[old_val] = new_val

        X.indices = map_index.take(X.indices, mode='clip')
        return X

    #
    # Inspired by: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/feature_extraction/text.py#L1072
    #
    def __limit_features(self, X, vocabulary, high=None, low=None):
        """Remove too rare or too common features.
        Prune features that are non zero in more samples than high or less
        documents than low, modifying the vocabulary in place.
        This does not prune samples with zero features.
        """
        if high is None and low is None:
            return X, set()

        # Calculate a mask based on document frequencies
        dfs = _document_frequency(X)
        mask = np.ones(len(dfs), dtype=bool)
        if high is not None:
            mask &= dfs <= high
        if low is not None:
            mask &= dfs >= low

        new_indices = np.cumsum(mask) - 1  # maps old indices to new
        removed_terms = set()
        for term, old_index in list(vocabulary.items()):
            if mask[old_index]:
                vocabulary[term] = new_indices[old_index]
            else:
                del vocabulary[term]
                removed_terms.add(term)
        kept_indices = np.where(mask)[0]
        if len(kept_indices) == 0:
            raise ValueError("After pruning, no terms remain. Try a lower"
                             " min_df or a higher max_df.")
        return X[:, kept_indices], removed_terms

    def fit(self, raw_documents: Sequence[str], y=None):
        """Learn the vocabulary and the average document length."""
        self.fit_transform(raw_documents, y)
        return self

    def fit_transform(self, raw_documents: Sequence[str], y=None):
        """Learn the vocabulary, then return the documents-by-tokens weight matrix."""
        max_df = self.max_df
        min_df = self.min_df

        self.gow_builder_ = GoWBuilder(window_size=self.window_size,
                                       directed=self.directed,
                                       tokenizer=self.tokenizer)

        tokenized_documents = [self.tokenizer(document) for document in raw_documents]

        N = len(tokenized_documents)
        self.N_ = N

        # Average document length (in tokens); 0.0 for an empty corpus so that
        # the "empty vocabulary" error is raised instead of a ZeroDivisionError
        total_length = sum(len(tok_document) for tok_document in tokenized_documents)
        self.avdl_ = (total_length / float(N)) if N else 0.0

        vocabulary, X = self.__count_vocab(tokenized_documents, fixed_vocab=False)
        X = self.__sort_features(X, vocabulary)

        # Float thresholds are proportions of the corpus, integers are counts
        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * N)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * N)

        X, self.stop_words_ = self.__limit_features(X, vocabulary, max_doc_count, min_doc_count)

        self.vocabulary_ = vocabulary

        return X

    def transform(self, raw_documents: Sequence[str]):
        """Return the weight matrix of the documents over the fitted vocabulary."""
        _, X = self.__count_vocab([self.tokenizer(doc) for doc in raw_documents], fixed_vocab=True)

        return X

    def get_feature_names(self) -> Sequence[str]:
        """Return the tokens of the vocabulary ordered by feature index."""
        return [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))]

    def _more_tags(self):
        return {'X_types': ['string']}


class TwidfVectorizer(BaseEstimator):
    """Convert a collection of text documents to a TW-IDF matrix

    Equivalent to :class:`TwVectorizer` followed by
    :class:`TfidfTransformer`.

    This implementation produces a sparse representation of the weights using
    scipy.sparse.csr_matrix.

    Parameters
    ----------
    min_df : float in range [0.0, 1.0] or int, default=0.0
        Ignore tokens that have a document frequency strictly
        lower than the given threshold.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    max_df : float in range [0.0, 1.0] or int, default=1.0
        Ignore tokens that have a document frequency strictly
        higher than the given threshold (corpus-specific stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
    b : float, default=0.0
        Slope parameter of the tilting (see :class:`TwVectorizer`).
    directed : boolean, True by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
    term_weighting : string {'degree', 'degree_centrality', 'closeness_centrality', 'betweenness_centrality', 'pagerank'}
        Graph-based term weighting approach for the nodes in the graph-of-words
        'degree' (default) : degree (undirected) or indegree (directed) of the nodes.
        'degree_centrality' : normalized degree centrality of the nodes
        'closeness_centrality' : very slow, closeness centrality of the nodes
        'betweenness_centrality' : very slow, the shortest-path betweenness centrality of the nodes
        'pagerank' : slow, the PageRank of the nodes
    tokenizer : callable or None (default)
        Override the string tokenization step.
    norm : 'l1', 'l2' or None, optional (default='l2')
        Each output row will have unit norm, either:
        * 'l2': Sum of squares of vector elements is 1. The cosine
        similarity between two vectors is their dot product when l2 norm has
        been applied.
        * 'l1': Sum of absolute values of vector elements is 1.
        See :func:`preprocessing.normalize`
    use_idf : boolean (default=True)
        Enable inverse-document-frequency reweighting.
    smooth_idf : boolean (default=True)
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.
    """

    def __init__(self,
                 min_df: float = 0.0,
                 max_df: float = 1.0,
                 b: float = 0.0,
                 directed: bool = True,
                 window_size: int = 4,
                 term_weighting: str = TERM_WEIGHT_DEGREE,
                 tokenizer: Tokenizer = None,
                 #
                 norm='l2',
                 use_idf=True,
                 smooth_idf=True):
        # Document-frequency thresholds used to prune the vocabulary
        self.min_df: float = min_df
        self.max_df: float = max_df
        if self.min_df < 0.0:
            raise ValueError("min_df is smaller than 0%")

        if self.max_df < 0.0:
            raise ValueError("max_df is smaller than 0%")

        self.term_weighting = term_weighting

        self.b = b

        self.tokenizer: Tokenizer = tokenizer if tokenizer is not None else default_tokenizer

        self.window_size = window_size
        if self.window_size < 2:
            raise ValueError("window_size < 2")

        self.directed = directed

        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf

    def fit(self, raw_documents: Sequence[str], y=None):
        """Fit the underlying TW vectorizer and IDF transformer."""
        self.fit_transform(raw_documents, y)
        return self

    def fit_transform(self, raw_documents: Sequence[str], y=None):
        """Fit the TW + IDF pipeline, then return the TW-IDF matrix of the documents."""
        self.tw_vectorizer_ = TwVectorizer(
            min_df=self.min_df,
            max_df=self.max_df,
            b=self.b,
            directed=self.directed,
            window_size=self.window_size,
            term_weighting=self.term_weighting,
            tokenizer=self.tokenizer)
        self.tfidf_transformer_ = TfidfTransformer(
            norm=self.norm,
            use_idf=self.use_idf,
            smooth_idf=self.smooth_idf)

        self.pipeline_ = Pipeline([
            ('tw', self.tw_vectorizer_),
            ('idf', self.tfidf_transformer_)
        ])
        return self.pipeline_.fit_transform(raw_documents, y)

    def transform(self, raw_documents: Sequence[str]):
        """Return the TW-IDF matrix of the documents using the fitted pipeline."""
        return self.pipeline_.transform(raw_documents)

    def get_feature_names(self) -> Sequence[str]:
        """Return the tokens of the vocabulary ordered by feature index."""
        return self.tw_vectorizer_.get_feature_names()

    def _more_tags(self):
        return {'X_types': ['string']}
# --- gowpy/gow/builder.py ---
from typing import Sequence, Dict, Optional, Union, List, Callable

from gowpy.gow.typing import Token, Tokenized_document, Tokenizer, \
    Edge, Edge_with_code, Edge_label, Edges, Nodes
import networkx as nx
from gowpy.utils.defaults import default_tokenizer


def mk_undirected_edge(node_start_code: int,
                       node_end_code: int,
                       code: Optional[int] = None) -> Union[Edge, Edge_with_code]:
    """Builds an unambiguous representation of an undirected edge

    The smallest node code always comes first; the edge code is appended
    when given.
    """
    if node_start_code < node_end_code:
        n1, n2 = node_start_code, node_end_code
    else:
        n1, n2 = node_end_code, node_start_code

    if code is None:
        return n1, n2
    return n1, n2, code


def mk_directed_edge(node_start_code: int,
                     node_end_code: int,
                     code: Optional[int] = None) -> Union[Edge, Edge_with_code]:
    """Builds an unambiguous representation of a directed edge

    Node order is kept as given; the edge code is appended when given.
    """
    if code is None:
        return node_start_code, node_end_code
    return node_start_code, node_end_code, code


class GraphOfWords(object):
    """
    Represents a graph-of-words

    Nodes are token codes; edges are pairs of node codes, or triples ending
    with an edge label code when ``get_label`` is provided.

    .. seealso:: gowpy.gow.builder.GoWBuilder
    .. note:: this class should not be used directly, see GoWBuilder
    """
    def __init__(self,
                 nodes: Nodes,
                 edges: Edges,
                 get_token: Callable[[int], str],
                 get_label: Optional[Callable[[int], Edge_label]],
                 freq: int = 1,
                 directed: bool = False):
        self.get_token = get_token
        self.get_label = get_label

        self.nodes = nodes
        self.edges = edges
        self.directed = directed
        self.freq = freq

        # Memoized NetworkX representation, see to_graph()
        self.graph_: Optional[nx.Graph] = None

    def is_edge_labeling(self):
        """Tell whether edges carry a label code (third element of each edge)."""
        return self.get_label is not None

    def __str__(self):
        nodes = self.nodes_str()
        edges = self.edges_str()
        return """Graph-of-words\nNodes: {}\nEdges: {}\n""".format(nodes, edges)

    def __repr__(self):
        return self.__str__()

    def nodes_str(self) -> List[str]:
        """Return the token of each node."""
        return [self.get_token(node_code) for node_code in self.nodes]

    def __edges_to_str(self, edge: Edge) -> str:
        start_node, end_node = edge
        return f'{self.get_token(start_node)}__{self.get_token(end_node)}'

    def edges_str(self) -> List[str]:
        """Return each edge as ``<start token>__<end token>``."""
        if self.is_edge_labeling():
            return [self.__edges_to_str(self.get_label(edge_label_code))
                    for _, _, edge_label_code in self.edges]

        mk_edge = mk_directed_edge if self.directed else mk_undirected_edge
        return [self.__edges_to_str(mk_edge(node_start, node_end))
                for node_start, node_end in self.edges]

    def to_graph(self) -> nx.Graph:
        """Computes and memoize a NetworkX representation

        This representation is suited for algorithms rather than visualisation.
        """
        if self.graph_ is None:
            g = nx.DiGraph() if self.directed else nx.Graph()

            for node in self.nodes:
                g.add_node(node, label=node)

            if self.is_edge_labeling():
                for node_start_code, node_end_code, edge_code in self.edges:
                    g.add_edge(node_start_code, node_end_code, label=edge_code)
            else:
                g.add_edges_from(self.edges)

            self.graph_ = g

        return self.graph_

    def to_labeled_graph(self) -> nx.Graph:
        """Computes a NetworkX representation suited for drawing"""
        g = nx.DiGraph() if self.directed else nx.Graph()

        g.add_nodes_from(self.get_token(node) for node in self.nodes)

        # Only the two endpoints matter here, any edge code is dropped
        g.add_edges_from((self.get_token(edge[0]), self.get_token(edge[1]))
                         for edge in self.edges)

        return g


class GoWBuilder(object):
    """Builder to construct graph-of-words from a single document or a corpus of documents

    Parameters
    ----------
    directed : boolean, False by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
    tokenizer : callable or None (default)
        Override the string tokenization step.
    edge_labeling : boolean, False by default
        If True, edges are labeled with a unique code, else edges are not labeled.
    """

    def __init__(self,
                 directed: bool = False,
                 window_size: int = 4,
                 tokenizer: Tokenizer = None,
                 edge_labeling: bool = False):
        # Graph parameters
        self.directed: bool = directed
        self.window_size: int = window_size

        # Set by compute_gow_from_corpus()
        self.corpus_size: Optional[int] = None

        self.tokenizer: Tokenizer = tokenizer if tokenizer is not None else default_tokenizer

        # Two-way mapping between tokens and node codes
        self.TOKEN_TO_INT_: Dict[Token, int] = {}
        self.INT_TO_TOKEN_: Dict[int, Token] = {}

        self.edge_labeling = edge_labeling
        if self.edge_labeling:
            # Two-way mapping between edge labels and edge codes
            self.LABEL_TO_INT_: Dict[Edge_label, int] = {}
            self.INT_TO_LABEL_: Dict[int, Edge_label] = {}

    # TODO generate a real formal python representation
    def __repr__(self):
        # The label mapping only exists when edge labeling is enabled
        number_of_links = len(getattr(self, 'LABEL_TO_INT_', ()))
        return f'''Graph-of-word builder:
        - is_directed: {self.directed}
        - window_size: {self.window_size}
        - edge_labeling: {self.edge_labeling}

        - Number of tokens: {len(self.TOKEN_TO_INT_)}
        - Number of links between tokens: {number_of_links}
        '''.lstrip()

    def __str__(self):
        return self.__repr__()

    # Node
    def get_code_(self, token: Token) -> int:
        """Return the code of the token, registering the token if it is new."""
        if token not in self.TOKEN_TO_INT_:
            last_token_id_ = len(self.TOKEN_TO_INT_)
            self.TOKEN_TO_INT_[token] = last_token_id_
            self.INT_TO_TOKEN_[last_token_id_] = token

        return self.TOKEN_TO_INT_[token]

    def get_token_(self, code: int) -> Token:
        """Return the token registered under the given code."""
        return self.INT_TO_TOKEN_[code]

    # Edge
    def get_label_id_(self, label: Edge_label) -> int:
        """Return the code of the edge label, registering the label if it is new."""
        if label not in self.LABEL_TO_INT_:
            last_label_id_ = len(self.LABEL_TO_INT_)
            self.LABEL_TO_INT_[label] = last_label_id_
            self.INT_TO_LABEL_[last_label_id_] = label

        return self.LABEL_TO_INT_[label]

    def get_label_(self, code: int) -> Edge_label:
        """Return the edge label registered under the given code."""
        return self.INT_TO_LABEL_[code]

    def get_edge_code_(self, edge: Edge) -> int:
        """Return the code of an edge; node order is ignored when undirected."""
        node_start_code, node_end_code = edge
        # Computation of the edge label and edge label ID
        if self.directed or node_start_code < node_end_code:
            edge_label = (node_start_code, node_end_code)
        else:
            edge_label = (node_end_code, node_start_code)

        return self.get_label_id_(edge_label)

    def compute_gow_from_corpus(self, raw_documents: Sequence[str]) -> Sequence[GraphOfWords]:
        """Computes a graph-of-words representation for each given documents"""
        result_graph_of_words = [self.compute_gow_from_document(raw_document)
                                 for raw_document in raw_documents]

        self.corpus_size = len(result_graph_of_words)

        return result_graph_of_words

    def compute_gow_from_tokenized_document(self, tokens: Tokenized_document) -> GraphOfWords:
        """Computes a graph-of-words representation from a tokenized document

        Two different tokens are linked when they co-occur within a sliding
        window of ``window_size`` tokens.
        """
        token_ids = [self.get_code_(token) for token in tokens]
        nodes = set(token_ids)

        mk_edge = mk_directed_edge if self.directed else mk_undirected_edge

        edges = set()
        for j, end_id in enumerate(token_ids):
            window_start = max(j - self.window_size + 1, 0)
            for start_id in token_ids[window_start:j]:
                # Only keep edges between two *different* tokens
                if start_id == end_id:
                    continue

                if self.edge_labeling:
                    edge_code = self.get_edge_code_((start_id, end_id))
                    edges.add(mk_edge(start_id, end_id, edge_code))
                else:
                    edges.add(mk_edge(start_id, end_id))

        return GraphOfWords(nodes=nodes,
                            edges=edges,
                            get_label=self.get_label_ if self.edge_labeling else None,
                            get_token=self.get_token_,
                            directed=self.directed)

    def compute_gow_from_document(self, raw_document: str) -> GraphOfWords:
        """Computes a graph-of-words representation from a document"""
        tokens = self.tokenizer(raw_document)
        return self.compute_gow_from_tokenized_document(tokens)


# --- gowpy/gow/io.py ---
import re

from typing import Tuple, List, Sequence, Callable

from gowpy.gow.builder import mk_undirected_edge, mk_directed_edge
from gowpy.gow.builder import GraphOfWords
from gowpy.gow.typing import Edge_label


def gow_to_data(gows: Sequence[GraphOfWords]) -> str:
    """
    Convert a sequence of graph-of-words into a text representation for interoperability with other programs

    Format:
    - "t # N" means the Nth graph,
    - "v M L" means that the Mth vertex in this graph has label L,
    - "e P Q L" means that there is an edge connecting the Pth vertex with the Qth vertex. The edge has label L.

    Graphs without any node are skipped, and the output ends with "t # -1".
    Edges must carry a label code, i.e. be (start, end, label) triples.

    :param gows: graph-of-words to serialize
    :return: the text representation
    """
    result_data = []

    for i, gow in enumerate(gows):
        nodes = gow.nodes

        if len(nodes) == 0:
            continue

        result_data.append(u"t # {}\n".format(i))

        # Node IDs are local to each graph and given in iteration order
        node_label_to_id = {}
        for node_label in nodes:
            node_id = node_label_to_id.setdefault(node_label, len(node_label_to_id))
            result_data.append(u"v {} {}\n".format(node_id, node_label))

        # Edges are expressed with the local node IDs and written sorted
        edge_tuples = sorted(
            (node_label_to_id[node_start_label], node_label_to_id[node_end_label], edge_label_id)
            for (node_start_label, node_end_label, edge_label_id) in gow.edges
        )
        for node_start_id, node_end_id, edge_label_id in edge_tuples:
            result_data.append(u"e {} {} {}\n".format(node_start_id,
                                                      node_end_id,
                                                      edge_label_id))

    result_data.append(u"t # {}".format(-1))
    return u"".join(result_data)


r_new_graph_ = re.compile(r't +# +(\d+) +\* +(\d+)')
r_new_vertex_ = re.compile(r'v +(\d+) +(\d+)')
r_new_edge_ = re.compile(r'e +(\d+) +(\d+) +(\d+)')
r_new_parent_graphs_ = re.compile(r'x: +([\d ]+)')


def _load_graph_file(input_file: str,
                     id_offset: int,
                     parse_edges: bool,
                     get_token: Callable[[int], str],
                     get_label: Callable[[int], Edge_label],
                     is_directed: bool) -> List[GraphOfWords]:
    """Parse one file of mined graphs ("t # ID * FREQ", "v", "e" and "x:" lines).

    ``id_offset`` is added to every graph ID read from the file; "e" lines
    are ignored unless ``parse_edges`` is set.
    """
    graphs = []

    current_id = None
    current_freq = None
    current_vertices = None
    current_edges = None
    current_parent_graph_ids = None

    def flush():
        # "is not None": a graph whose ID is 0 must be saved as well
        if current_id is not None:
            graphs.append(_to_gow(current_id,
                                  current_freq,
                                  (current_vertices, current_edges),
                                  current_parent_graph_ids,
                                  get_token, get_label,
                                  is_directed))

    with open(input_file, 'r') as f_input_file:
        for line in f_input_file:
            m_new_graph = r_new_graph_.search(line)
            if m_new_graph:
                # Saving the previous graph (if any)
                flush()

                # Initialisation of the new graph
                current_id = int(m_new_graph.group(1)) + id_offset
                current_freq = int(m_new_graph.group(2))
                current_vertices = []
                current_edges = []
                current_parent_graph_ids = None
                continue

            m_new_vertex = r_new_vertex_.search(line)
            if m_new_vertex:
                vertex_id = int(m_new_vertex.group(1))
                vertex_label = int(m_new_vertex.group(2))
                current_vertices.append((vertex_id, vertex_label))
                continue

            if parse_edges:
                m_new_edge = r_new_edge_.search(line)
                if m_new_edge:
                    node_start = int(m_new_edge.group(1))
                    node_end = int(m_new_edge.group(2))
                    edge_label = int(m_new_edge.group(3))
                    current_edges.append((node_start, node_end, edge_label))
                    continue

            m_new_parent_graphs = r_new_parent_graphs_.search(line)
            if m_new_parent_graphs:
                current_parent_graph_ids = [int(graph_id) for graph_id in
                                            m_new_parent_graphs.group(1).split()]
            # other lines (probably empty) are ignored

    # Last graph of the file
    flush()

    return graphs


def load_graphs(input_file_subgraph: str,
                input_file_frequent_nodes: str,
                get_token: Callable[[int], str],
                get_label: Callable[[int], Edge_label],
                is_directed: bool=False) -> Sequence[GraphOfWords]:
    """Load the frequent subgraphs, then the frequent single nodes, as graph-of-words.

    Graphs of the second file are numbered after those of the first one.

    :param input_file_subgraph: path to the file of frequent subgraphs
    :param input_file_frequent_nodes: path to the file of frequent nodes
    :param get_token: function giving the token of a node code
    :param get_label: function giving the edge label of an edge code
    :param is_directed: whether the graphs are directed
    :return: the loaded graph-of-words, subgraphs first
    """
    subgraphs = _load_graph_file(input_file_subgraph, 0, True,
                                 get_token, get_label, is_directed)

    # IDs of the frequent nodes are shifted by the number of loaded subgraphs
    subgraphs.extend(_load_graph_file(input_file_frequent_nodes, len(subgraphs), False,
                                      get_token, get_label, is_directed))

    return subgraphs
current_freq, (current_vertices, current_edges), current_parent_graph_ids, + get_token, get_label, is_directed)) + + return subgraphs + +IO_Nodes = List[Tuple[int, int]] # (node_id, node_code) +IO_Edges = List[Tuple[int, int, int]] # (node_start_id, node_end_id, edge_code) +IO_Subgraph = Tuple[IO_Nodes, IO_Edges] + +def _to_gow(subg_id: int, + subg_freq: int, + subgraph: IO_Subgraph, + subg_current_parent_graph_ids: Sequence[int], + get_token: Callable[[int], str], + get_label: Callable[[int], Edge_label], + is_directed: bool) -> GraphOfWords: + id_: int = subg_id + freq: int = subg_freq + + subg_vertices, subg_edges = subgraph + + size = len(subg_vertices) + parents = subg_current_parent_graph_ids + + # Recomputation of nodes + # Dealing with nodes: + # Node = (node id in *this* graph, node code) + node_id_to_node_code = {} + nodes = set() + for node_id, node_code in subg_vertices: + node_id_to_node_code[node_id] = node_code + nodes.add(node_code) + + # Dealing with edges + edges = set() + for node_start_id, node_end_id, edge_label_code in subg_edges: + node_start_code = node_id_to_node_code[node_start_id] + node_end_code = node_id_to_node_code[node_end_id] + if is_directed: + edges.add(mk_directed_edge(node_start_code, node_end_code, edge_label_code)) + else: + edges.add(mk_undirected_edge(node_start_code, node_end_code, edge_label_code)) + + return GraphOfWords(nodes=nodes, + edges=edges, + get_token=get_token, + get_label=get_label, + freq=freq, + directed=is_directed) \ No newline at end of file diff --git a/gowpy/gow/miner.py b/gowpy/gow/miner.py new file mode 100644 index 0000000..0fccce4 --- /dev/null +++ b/gowpy/gow/miner.py @@ -0,0 +1,72 @@ +import numpy as np +from typing import Sequence, Optional + +import gowpy.gow.io +from gowpy.gow.builder import GraphOfWords +from gowpy.gow.typing import Tokenizer +from gowpy.gow.builder import GoWBuilder + + +class GoWMiner(GoWBuilder): + """A miner of frequent subgraphs for a collection of graph-of-words + + 
class GoWMiner(GoWBuilder):
    """A miner of frequent subgraphs for a collection of graph-of-words

    Currently, the mining operation is delegated to a C++ program. This class makes it possible to load the
    mined sub-graphs-of-words.

    Parameters
    ----------
    directed : boolean, False by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
    tokenizer : callable or None (default)
        Override the string tokenization step.
    """

    def __init__(self,
                 directed: bool = False,
                 window_size: int = 4,
                 tokenizer: Optional[Tokenizer] = None):
        # /!\ Edge labeling is important for IO
        super().__init__(directed, window_size, tokenizer, edge_labeling=True)
        # Stays None until load_graphs() is called
        self.frequent_subgraphs: Optional[Sequence[GraphOfWords]] = None

    # TODO generate a real formal python representation
    def __repr__(self):
        if self.frequent_subgraphs is None:
            len_frequent_subgraphs = "not loaded yet"
        else:
            len_frequent_subgraphs = len(self.frequent_subgraphs)
        return f'''Graph-of-word miner:
        - is_directed: {self.directed}
        - window_size: {self.window_size}
        - edge_labeling: {self.edge_labeling}

        - Number of tokens: {len(self.TOKEN_TO_INT_)}
        - Number of links between tokens: {len(self.LABEL_TO_INT_)}

        - Number of loaded subgraph: {len_frequent_subgraphs}
        '''.lstrip()

    def load_graphs(self,
                    input_file_subgraph: str,
                    input_file_frequent_nodes: str) -> None:
        """Load the mined patterns from the two given files into ``frequent_subgraphs``

        Parsing is delegated to ``gowpy.gow.io.load_graphs``, using the token and
        label decoders and the orientation of this miner.
        """
        self.frequent_subgraphs = gowpy.gow.io.load_graphs(input_file_subgraph, input_file_frequent_nodes,
                                                           self.get_token_, self.get_label_,
                                                           self.directed)

    def _loaded_subgraphs(self) -> Sequence[GraphOfWords]:
        """Returns the loaded patterns, failing with a clear message if none were loaded"""
        if self.frequent_subgraphs is None:
            raise RuntimeError("No frequent subgraphs loaded yet: call load_graphs() first")
        return self.frequent_subgraphs

    def stat_freq_per_pattern(self) -> np.ndarray:
        """Computes the subgraph frequency series"""
        return np.array([pattern.freq for pattern in self._loaded_subgraphs()])

    def stat_relative_freq_per_pattern(self) -> np.ndarray:
        """Computes the subgraph normalised frequency series (frequency divided by ``self.corpus_size``)"""
        # NOTE(review): corpus_size is not set in this class; presumably maintained by GoWBuilder
        return np.array([pattern.freq / float(self.corpus_size) for pattern in self._loaded_subgraphs()])

    def stat_num_nodes_per_pattern(self) -> np.ndarray:
        """Computes the number of nodes per subgraph series"""
        return np.array([len(pattern.nodes) for pattern in self._loaded_subgraphs()])

    def stat_num_edges_per_pattern(self) -> np.ndarray:
        """Computes the number of edges per subgraph series"""
        return np.array([len(pattern.edges) for pattern in self._loaded_subgraphs()])
class GoWKeywordExtractor(object):
    """Keyword extraction for a text document, relying on its graph-of-words representation

    The keywords are the tokens whose vertex has the highest core number in the
    graph-of-words of the document.

    Parameters
    ----------
    directed : boolean, False by default
        If True, the graph-of-words is directed, else undirected
    window_size : int, default=4
        Size of the window (in token) to build the graph-of-words.
    tokenizer : callable or None (default)
        Override the string tokenization step.
    """
    def __init__(self,
                 directed: bool = False,
                 window_size: int = 4,
                 tokenizer: Tokenizer = None):
        # TODO is_weighted
        builder_options = dict(directed=directed,
                               window_size=window_size,
                               tokenizer=tokenizer)
        self.builder = GoWBuilder(**builder_options)

    def extract(self, document: str) -> Sequence[Tuple[str, float]]:
        """Return the (token, core number) pairs of the vertices with the highest core number

        Pairs are listed in the iteration order of the core numbers computed on
        the graph; the list is empty when the graph has no vertex.
        """
        graph = self.builder.compute_gow_from_document(document).to_graph()
        core_by_vertex = core_number(graph)

        # Highest core number found, never below 0
        top_core = max([0, *core_by_vertex.values()])

        return [(self.builder.get_token_(graph.nodes[vertex]['label']), core)
                for vertex, core in core_by_vertex.items()
                if core == top_core]
def default_tokenizer(document: str) -> Tokenized_document:
    """Tokenize a document by splitting it on runs of whitespace

    Leading and trailing whitespace never yields empty tokens; an empty or
    blank document gives an empty list.
    """
    tokens = document.split()
    return tokens
zN601AghbJyE%M`X)Yk^cDeD9Taq!IG3;F-lYLh(Cy~|a0en}#&H1Rk|%lG_LiXmRg zKB-!tOSRY{?_~5<0Y`{@z79tUFPdKwQP6*1HonAV2!pSh$n;M+$Z*5zi{JF#|L=D& z$)$1OCr+Gi_jTbXcIt9r@yNHB;&LJ27v*R5be0QsILJPxh5Y~IBNnk%wk)q%y!HbA zsxV+7@7>}5+GVYg^Wc{+ z&wRG)I5%HRPO5Hy_msg8&vN~Imxz&Ol%cFy7Um%fiwp6z`@<{hF{$UAdS(1eZChFc zyYunPbf3{sas#Df4i5Mf3-bnKE+qb+1J<1 zl6o%XA#odJiG|^ez$mTf-pTtt6?*Ca#{BdCIZ6|n_xzEBbIY=lkJJ+sSy5AY;nxuw zgKwG**`Jk8as9q>u6uiX937>34yQ||p$S_XIUsy4#I3hT>80jlT-WWFAmJR`9z{FEcgiVBM&8uP(}2R^H@Vkjt5 zV~;wAGOelhrQ#8t->YtCYS`G=pe?)lj4q-hr=RWZz9&_~EPnDZ>-=M?5-l85M$$;e zR5f0oYLlvPv#TcE&-FY!6zc0lui-R_-qDAv73(2y#G(dU$J76hlw~u)*r6wyVf3t1 z+7&+znysMmI$;{}jH19Vs}$=653L3d+x9LkF;*II8#P_AP-392xxX-87NUvOshCO_ zw#~@M2x7!}>UN4e(VY3cGfXhtnXNE`im-WSY!q=i!kE}tR1~r(M?XPbGF`s(;)_Gw ziYY28s_yRYpm&SI!>U{X@ZJe#xMC1T)c(aK%%H0{oo8m&fe#(b$6V91zNTk$MLX-J zA&0tLsjd<^a9s|HX!zAz{zS;@8pCz%6?GK_s9thJZS3vU+`K18OdLOC-7woNfS;mP zY`eqza9St2$0`Wcz@$oXL)`&pP-H3R3fP}!Z)aB+&x$m zl%Nc?rfMn2m82kI@K^<3jEfWw=kal>>2jVPRualGi(l;)PqkncVzMZ^>6@ z2U`v^TZ<)5y}^a3U`-xoB*V3Kb*<&|GE|q&yI*JYa@@gezuDeA zr_saP8*X&t$oIpVyU8jt78A&ZxsbckMaVJ+1qTP)W!+EywqSD`uc}4j2J1q)1{-oR zk&(JWHmpVoHw+CS&5*6N9@1jYD>cTFqg7Z@@pQba`S)*D_+*#}MP?Fw ze0-H5CAdV&089c}vEB{(cb^@Q|2KQsrl@D^9HDZxi8RL`g;bDcs)Q5~a zyoCEH(|4SEij1zx5xADpyDn;w;?z|F6?x#&v3yX}r@juh; zA{XX@3Hz3j81jA2=kWO?b@IxY`qwm_Yqcg-I$WtRaw>XytdV+;6q2wB`#6c;gxibS zLT|<3w4?&zD|dJ-#YVC^g1!Qfr~!Zk~fdx;J9q0P{w5>sD%#I|rGXpU>q{ z{PHIu?6|+yIwnE5i-Mm%yeJ#JY83T6ruq~-`v)?ey`vHzrwGZ|UwMIPL&U2iG>~e+UZ%RwCrSpi?is4sG ze{-wl>lh3zEj{yDxxsJY!xVEJVr9zs?r#+(tt zC`ns|t2!vu`QEJ^;Lb;QOU>%ySkiSR(tA~`D^qGB zLeUtN3*iEYX6eXx*+U||s>2jluhzL?mgak|k>Ae7_j=L-j_LWaHFpR+L3(9Hg)tl{ z4+TZ-d;Cvw;%8^amh!I8Sj{}fYZ6DsG1q*}YDUO2VqhyY zb2G`wVxY9}>(6Xex&}jO6rF0tXf-o)t}u)xaY9Ha10hEy)68&0glPz}wMsn#dCjWe zB<1UsK|W(jh3&-A?I2uGQ$u7@<<;XduzH4p^6ouVG2Pv!%KgNWNr{~5{F2F$aTUM% zv&z(p=e%Y57$0oE`f1G?L8f*RjkXnxgjm~Q5no-+M};5W6oX$_Dbw_`nJcP5|LgqR zTsv1qDWv!G5fSs7&2awEv2Cbm+6YeOy-~W(Mp@(T9cyKbimiTqus6V6^9dED^Du{U 
z|0M9bqHW!O8Bg10v2$^`tQ6rhBpJYVF96WnQvc1pwB0fWCZw!HS0|Gpc=P2X24b9wE$@S^LbmN!eT@5lIbi`PAFL-UxS%WR0#zW6sfZ$|4l zw}^Df5=YjE@v#nmWw|iZ#LCyqJVGGo(9zKaGBrF4*m2O3hFPWP$CB!#KMq=t++cVj zp?PjxJv~JUakb9qHJ`e})XK$(rM>HJYghNESYFu9T>Jh(yi6R!<;MK00qJP%fIkjq zu00AI6eHg{{MhLO5j4ZLs30boJCPFGmqseBOGZK%k>xDWK{iTJIWrM8aPmnA z6*;Dst?HANe~0&7=bqIpCFef@LX`#t$gdy})gac) zb-3(K=ellnpRLC)bzTsv40$Yv6^ie?3#jD@O}EU&9K$<}2o$Hs4a2827mmBBz5VRG zAz$bFjj_XwtgOEN{+D0h&ch*ga>9p1Gcr9N-e=vv>c+Wc%$Ke{cPIrFQClsUQX=j$ zmd$g_Oam)tXF|FmYX^t7yqy<5QiL5Ai%BE(sXx;guJ!|KpQb1To#Vow-t5fsbD=Pu+dGFQ*zH6K?_l>=o8D5SbyMs3aAOJwx00^9@ z!B)(gy0Boe;n7w?@t!Xo@8->$NvVce-D$(N31jOJ33YiWmQOD3h>A`hpU6LvF&$iC z(JaNr#-5wtEuMu)V6yZ5Pai`rdLDl$e?Z-LjF~mTrPoijlm*XTY!13z2N5Bxa5gE|!e(zo&L-$^wWd{+8zrXM4 zCAB`6%gpH+;>q8Wt>TMDN|FeL2rB{8=08Vg6Qlm(WNOh6GCU**^2Q=y@7{z9nVXwa zF)-W@43zon>py8hAGgYggXSS|3+3z7nL8c(y=sU~hfH@Y0Jq%Ix)UM z)=w@n{DsS13#eA|+&Nnc~`qe1YcPDs{YfvYdFI&29L0^vqR`yBfn!cZ$X9E6eUfa;eiLruX~uMVn(0 z@=sEVlk@>uV>QopFsc?)JDO4v5)$@qdZ@|Yw@4SRWQ;cYj$`wvcGElu@KmBoQE|zn zI#U!gz}{^yKlL$7O@h4fxBWGN{?ecCoiLF_({&!iK60IU*cCjJ6>7rUr^-I-_PeY- zVbtIe5v9dOKYt)~ch|p>C!QDMnY^V#pgtMlG-IcTi8L|xAjiy;a~Xba%jfr5%L zQ-e8GE!fUDW7Iw-Cg$Cam<4RK!?Q1oi~NzM&whz+x|v@10$MHBD=QuluzP0Ruj!Pv z?tb?aA>A^+n|AKP*Su&Y@T#t^E{)VyMGU9eZ?o6%zoG?__-|*xtPqeK8%^5E^tm|5 zqS_wa_1O4gTA?Gfa1hr{5{aNuE-X)NjyB_q*xY;q7zXx{1{->;GCCA#h2!52l;_BB zbu`(Sx^moGp=!f`Z#SCEPb=d?rRV5~3&$Sdq=`qCB6Bb#tMrlE_kJDte3_b>(r3Z@ zm}os|f;dCa1Ihx(S5<)h&Y;#}A@RA((OTMOVv zSJW)TtMkn+)Nh~$X_bb~I}bq#@xxgR33lN)YOq1j%%9n^7R+?$;V<6#%2xLC@GLD^ z|MiP|H8Xh81NB84sBfP?Cxsd+1w+Aj(MfHY8!9i~j*YFYP>UMNvm2VbS*tB=sTPH= z3UG4QAy4U)=`S6g!978(_C~J^`7)>gOb!1v6zRw}BKtCVh~6@=PvjG>{e-Df4xZBwU`x3#gsp}}@jgDu&^I%;SY2P;&7CKi$%?`={zYrR@r5;@MA z5@k3cD0ySpvC0iLVx?w$&5U)itSvgzGFhLeW?yC5hyZ~6U@Q3d?_ZvF^I|>YuTd;A zW{oDitxrDZ5k^~>~=xczxeE)oYRLKv;!&+K99`U?#W4FG{Vo^wz{+6rc- zQ4FLs+C4Bvq^c?XY4Dc+`*BS$Y1r1hDvl!xFf^WG-1>{*kOvi0FuEl2^#dQPMiW!| z;IFuJLrhfoHc+)`{f!!6hTAQW@qm34Ze6tU10fxsn@bO=0eLYfjy*%FbU5IR){Q1G z`uh3b{P%+Jp!AJbm>mECnq)j7X1 
zP&9V`MhHs$5X(TXgq$!9)b2_WIcXTxMm+Lt$*O3N1v9@)PBs)}pgfhy%m<2(+Od)P z@tQkza&q#wx<^9=V~?n?utHHcJsK(Eh5t4g@rZkD5SNsc2uMh@Z9Gp^yS?otx$V`S z61;wRRur8zY#X9B<@+PC-p!at)y~Rl=@oC3O&yhBrbb^ZujAHYD1Ij^n{xbnd~5@#X?1ngwS9hLb2E`|gS%DDRb%S}m1nMxTOqX8Jk2{b-F$p^aYTf-a1UEhUStSI_@T;qUlo2}(O-*Ze zcfE5B&a5n`IN;R6v8R(pP=(3MonMO6#gUKoX(g%^3y6tfH_y%Q+XzF&#R+>MZ2HeL zl6@`PFfjLAQKr4P4C~o!pw;{zJ7+g!@~mjP{_cGju5+kw$fvnX}W&cwqT7U3j;tW zLo3FEg14ylc{OAicn9&cm z^>H=MO8*f#W_*@^S`N?tMb+sfeMQrsPX{g_=`?fH{_oL|s`+kKWCf{Md5=|sBl?3f zG8hiz<4^WNyz$?H5{v{m`3#|k&C~!MY0L4-HeV+Qpv}mdWBF1x4@LU83-|jy*jHbC^NCsx9$tjbj7d6chQbX-AuST?!~jtcI}-< zels`M^q`pkp`RuNuHE*+B?r9IN^~*iruKVUA)&h{51(!i$pm0(ODzxFUH^xcuS4N` zsoAm;HMDe4)Ge^6)N1C**}wV%vHd7Mzh*D6&!&}{(&Cg6)|`zkM0~3Iy)TO&tYu3Fnab+N|37ct;26dvx}_b6Vo>eRT}p3qY0)923x`dmGm9(?C4 zw~akUr^UvLK%oKnCt}<|kvXv&&^LfL*cuK* z{hnHNM*VQg2yaQelx1XW?Td(rfG{+Zx&epYM~WWKK*>l>n)`E7hisQewJWnmWjNy3 z|04E*bPj6YTfobV8Hnar7PymGIkyW~+ij<@wA)s1xg_f8L?roBc` zrCSD&A&K^;Ig6fD!z;0n9&O&Iqg6^a>dc9g4zDZ0^#A|u3vRSTmJo7wQ8iX7A$ic6=7dJ={ga6x2RSu z(Q>q98A$o_4{n_F_ppi@X;kmU%eHs1bIMYHVV52l^rG!8IUt`a?#^P2VX#l5V&(E*uC}MgPAq=yTCyMQB+KJ0+cEenlHOj zogs2fLZ0HQ>K*t<=_N(YJIky<`V^ z{NB|KnX};voUnSJ+&4W)#vSNU_LK9nqo3AXvNXALp(`jTn|aTd!CoaX^P1IyqQT64 zm_>yx8d)88o{diC>j>xmSUP$9WvV!M;Ez{r=D&UMxFO1}Q!-@j9H~fxk}#a2Q-+f; zoE9hpA1}o(TeEI0=oxG`i`lT@>$m9j!lCiuiJhn^TJ1aTqU35V(q=~HcXX_U&qX#o zB*b&Hd^#-PINJF5P(={l>IGtVaB%QDO&+i;8@s#8-rmjn7i%Ymly{0Av+%}m;1_VH z<>;#hx^Iv3Cpq2b zJxfAepi~vvt(F?8-wM%IHcr zlT`6o8juBWxv=4psS9u+1RIR_p@!f1--;PUlD*n*0VsyE0T~V`TY;=m2>VYB`09-_ z^NJwCS&L@Pm?x>RyefF*DvmEl3=F49lM3b5*jN1fB2ulMaQ`PmgnDnRQXd4EKK~KiKz?dZkEeHH&N16y%W7*L6A9 z;5oLlId0>~)wmI0CrCkQwlrAPti_T?xpY+G+D8yLxH~VICE5MLqj#m^S>@cTjKq>} z(Z#mSXq?|qe58Q&?(6GY64$E#Ac4sl4`~irpt`v516pKbV*`Rd%tvZLw0X-Qn6(Tb ziIJKfQ}wT-w4w=)=g*%fNd|~e{MHNGYD}oq`W<4qSm{Fnho1h$PxOoo6&tH1QSls^ z-s>|N?5(D|CqLxguisPn*0jU!+U^2nUBt+dmt^%2LmraLLIwD>J$EAcBk=m{iv!s! 
zMfaAC0?R&!f6`vtE$C2-yFt(vxn$ezypZA`<7sla=G;CuJqTrZj|(_xzI~p9f?Iqw zW^vhn&yTJ&$6#+}v3bwu-wI`fRo#yme#G){Ti1<{ApXD?5E>~!j$Hjaxi@rQy8af! zwa@DGhRr+4*08@+PDOQmOnGApRR+ydns#+Y0Ed?ke2|1TP?vzfBqisXW{e0yYnj;^ zUtEkoIB@&sBnlk!@Hx~NP^Le1=b?>tZ;dhH2?s?ADiH-@3^;_KUG+FOyEss~LmX#L z44T`qcr!mAUbIte|2mGPP2dm6q)>r3OtHv zyW!|SDf;^oIfF|}R;KY=-NER9$(}_|-8T~-upt3hL(uLDH8s?h=jZ1J{`iNPIXOxI zU%z}I)+;0HT^7jJdTjcmyb21dBAOmEJ^*-(n(^N_4$8C`aYDZ8WUqP2IQpCnHGu8$-?^YXEpfIBN^9J7!J?qE=ht z2u8Mdcf%QR^m^3+kunko>6LA(I6c1&q&r|TctPDWu_;_I?l6y2tTpOo>heijZ``qw zmQS@#pI;EIQ7$~VhgP*_dhOIgWm&A(6O4`xSrO1Uairp#DM#AZg;O|m zLp`vB#d@j-a!@2zOtm?ZXp~Osz3F)P>gem@;^Ksqj395veBHBkno_$40DEj^ZEOfQ-y+);apuYHC1@<$XdwEmXbK#;D--5gbdO=Q~#>O;M>ZD z@REa|20H{W)X$$qU!7)c1*^Czmu*zJyxRRA+%6MF!{@1Ybnd)?kjZ+@Oaz=mL*CXF5dN(V?^>uNT6uemK{e@<>(b_Wk3OCyRjqL?VqU}EgdKS}DjM3`X(^(W>i>z_ zj3z_&ppZq@wOLu{I^^9()}YBU7p?=2;lUr#2RmlKOF{qIoJqe0aO4 zw^@2N!K;NM$}=q<3Rr`|s$G7H=tkl4A__1XG|UQ!iiYP%diOdv7Xp5PV^`C4B@s1X z^Puw@BJ}L6F_4u_sd98rf?*P==;->lw=2mMvR=RbzCVo8xXv(A5BI5Ou#f1x+*!XP z=q?F{qT1){+6P}0A}E~-CpX{db8U5=7+_yIE5qtTq4&et`#vT5C|eWC)K(-=mP2l! 
zF21wematVYM!Y=hEuGmS!40F3THt^U1KX@>ns}t%b2cBa1PnZMaqqqWRSHi}u|=|N zxu~GcTirirQk4czO^+LCK9G6G4~2pp04^5Ly<_6zUA|?gzznCT-PY%tidtX#Utg22 zpQq1KuvnenNg>jQL^w2;EQ2JCav=S1rG8A)Ff)#Ubp?JSePn7pj4t!f#3w44pxdn- zLYerAx>5za7aY+IOf`asm|A@43T66g#<`=UvC2@8N~T?D+JwMZCB3#Gq$%XF-ca~DtWi;3G61qtW8P9dA*@ZnTRsudZd*w< zxszrG1TCyIk_euD#C(%_dU7CH!;O93{9k~~zkus<+ppTtq-*Eeh!}Fa^b~d)|dLGEnKi$tlOm|kdHvM+UZ2zUf zLA{ArPA*m|pC8n>oFJv++mUPXdn-ZnHUf6JX6c3Z_wOu+qbi}l^|>%1pxZ?q{r!s+ z|A7vK-2iqANHGl!4M3Pdnc^Is+LRWyH7Z-uf{lg>6AY{-M5E>oIiCic5XMLva35W> zH-j3SW=E-Mq*ta7o*umVmmLda$GexjxrjeD*&tuo3Q`Qw`>mtlC4#MH-WiCZP36jC zgml-^!P!bnwvlr`-km3i>Y1&XPv(nJ`>0dLu3#NHLj*j$;hpli?#WB-im3u49?^$_ z5M4kwloZU30ACQqVvohWlfw3PzhK!BTR~uxbF?CW+XFx`>DDh3EgHgz~C6Rc4KW^VuM zR;!3k_&>K=?X5lm_!I*FMOG{$?(mJ zPXND9dV642>T~5oDCQtT(Z`IljW^&T z=S)inDQ;~2j<7HWoY{yz)QCPND>W5cBWmCV~E=F33UgX|~6D69JT7P%=D|-7pVVHa{FrGGb9OQSB z{daym>65IlSd-$+LMz%d>V!Y z2hGWzS))G2)Z77xC#S{Ej+_X^CoHtpsv;>W7dvbVGP%&KbpE0kaX^RyZ1_YMV#|RCnw%)EybdZ3>6p?T3cJg z`RHJ1<*C1gVQG4-bR0c27&kx5iiAT_PxHLbm8u3uC?8A^phUr8o9M@GHRFq1LL9Ij zbR4LQ77a35nLh<}DHyDH$@Lf_dyl?)Wro$`7EW(9auW&k>{v+{Lh z$p)I(_rI3>;b(Vhq)@0zwHKx-zKCvpui^ysciDwz;o%pWyIx7(QbyJwMFajRrmq0& z3TNW&!|y6-yTGIZpn(z|>G5m)0miU8qm5N@ea*&RAUV)X$Q39#sjs{yVt$pNDuXy$2`=*Ad>w=qLfl4KuD6 zJ2gu=d+%0v+ZTiS@`pf_rT8rI^k5<<@9OVE#JXglJZln2zIVZAvEIG2Ab#)Jj3lkz zX_~3$=8-QeX{6m;yi(!#!)A6jt z3nhE>epBV;=8|i?Ju3J#S+pZ zh4QPa|E(jP1ZkQ5zVtN$BZ9~>vNVryDFlzc*Ka}SC3|iQl@veB+6(5NRs0494bn>?5Gj_-G7N5@vdT4_xrE=+7U3se*si@@mceTV3IA$IIV_+Ji?kGDJ0EN>!brb;~c~ zYriT*SK{1_GxbQ-sO<2%*c`D#bw5koLJ^Zm%`Xo08uTc>~fySL!hg(4u(eiGZ(6$aPW^ zd;fe(H`{NQ5q$M7Rsue#Kf48DJosYcPgL>TVwK}^bDR6fX=3SSx92oBFD{Ne*Wb)N zQ{;G8+uAvkHQ4Fj1qpdCo&Kg0Bh@Fp<)=4kwN7bn@Fj~cB9^(qySe6WMRim9AkuPq zvlRWy7WwH4rFJ*+wErFVUJ1P3d%k~l63+-y%eO~+y^S-u(902%cl{5A3ajN{P-fNT zYA_5;H;h==F5qDE#}BwVL`S+)+*|!mcfGG)&bb6)HZ;%y9}HBv(0(vcjOx&T;Uk>Z zLXj~cc*L$7x(JA_w*oJzr7dap01RRV|!7FI&zsk3ww0M^H;V-;p{~gUyI1P3ghe z^VV#AX1S7YraTlzRnwl60D!>g|*xI0KL#R$IYg&tW$@eJ*kj3G==ioTHFt 
zphyQb5zGh5_wR4972Ibg0UkPK12rLPA-FtjVSc`2>7M}@*y-uRfwX8fQ`XiVYctMx zhptNo#Q``LO}MCB*!vIN2FQn&9x+H#fWDCT@2AT_{B;Y1zdu(k&tcm78((M4P&p%> z*hpQ0jH>nka3g_L?L)|p86Qh^>n~fx-$Kc`lR*wo@LRA<)~??KBcEXA!PUt%4W)&6 z$i=QCL$EB?ZZoYq;)za;v{E zK7d#>3(Ng(yVLq5HoebH%j@$7uqNWU_1@^n$kve0w*s%&UeBcVh3jf;DbTEb;Atu; z2|+0riN^I?j{`?tf>W0j1-|)ix6{G4o1O~m)Y^XJQtYi%h zDb4r{hw7!(i%Eg>1UeKHpx;Rej!G_IgxGupp#)A3sPY3gZmQlTdS(4Ukb+AdEf@u` zMEt4u1R=5o!06u01^hGPO|rzeA~R}NSI~@QAoUR4BebzY4zhY;r8h>EUhl^pgaxR| z7Y{*r!}SZunl&aw8l?6!$LOLX7;=H4bpv6dw`V5r{Og_5nZQf*xO>xag>m&vsaz|~ zT}MkR(L-~`0nHLwdZ%Dn1>S6aY34uIaiot~eCu7p84Euy6f2)^F&%#KgO?`^{kOc| zQzGMk!)ITV10Pgasn}VnK#is5Jxb#Q)TB2_k0dEDamR7GF zFTbJR^Dv82^yzKGz+|+YbefE#Gq$P?JCqdLOm@$OA}#~Egwr0g9M4zULFoWEQb9rC z7A5psKutr@aUjJ%@Z^O>KMX`h93f5sA`kTTEFaH}4LobU1RBk8MHd3P?CY@xf}BvK z6ElK{-Kyad<=#CBVErk(9nCZ9E+Aqch2A9f0Q4F$f@IaO=bzmkEI(y|*>)VE;YllR zVI{o*eGBi;9SK2Qw?7=6I8KcU68ii$4IQy^^Zsg@sYS10*Lk&)_Q$gpvm@Tdw5|7h zC#81;>ua@YXdWfVRT+e?ii7D#zg!|08Xp+x;tDI&2||~5tU5rc1WT*RiJ#x%=b%YP za2lwxC5B@Lu8j8c+M?8AG`Ry3)QKq^Ih8JAMQ>1=_=p4u9MoH z@wack#Z=`_Ij&m{xD!i}FpbN=X~Y0_^pV?rUlMKrm}y+P3sb9&SZPMufSo&z?7+7W zc-z=cy~B2=uJY@P#Wg`2cfr;TKW3IF@#;32%N_-Q8af~V z-II$Q@#k*)Vk8g0)aPonQDy<6e`~An$B%6Nt(j`;fv4bE8vNf@0#fHcrp6!LSz<9h zXjV-1I*F>UH2r7{{cC7m?L1*q$Cx#$*7bxm&>0I%!?DWXWdK!B^oMsd^|!CR_w_~| zl%?p4`%kFCWUKuf3Y3Zm9ZH-dRgg}cSV7uya5df!*n@!c$&)3St5wYH*+Z(_7r#hB zLG$(X$arnSVbTcGa$R1r5X|rYL`F-dgJI~_49mbN7FJB{oQTU5-a}5Z@@pByslZ=f zUM;CQT0_fA3@{xC-MN=D09I@+NT}!JYY5=IwY5*`=Y*rj9a8u{fyfCYCs@|t1(fY; zmX-{hX)hJyXYx@H4EOg1pguqvY)>u>=OdxuzzCHCm~#7~#7pgOQ2GAor%yavyQ~8E zEUhm}OH0M{^ZlTx7oak$7b18fQ2RADg)4!!?z>9wg9^R}ajU@XrvKZ%8;6`>k34HC z*rZ=?h;cTX>|U@e-g=_9;7=}AVng!#j9hymS3-vkw%K{0m^F{ILYFN8kF<_ zr+XCdf5qK0Ky0cH@z1`vhImU7y>^foFBN4a5Zxokf)o^6=Gj1{w_JUmcM%8-m)Jq1 zcFfW$gA6Ll5}1tn(p|TzN|za~53c*VyLSX=$WMQ)H(3u6-u)l34a3^H<^UnRIKB(z zw(9G0WcyzH5D7jOyl#`|S4<^6G@N59zIB5Wr~(Lb8LwZ<0}Q;&4HTgjq%mNd0T?P5 zx{W`tOfJBFpCb+%iKcpX>-X;@-1!`>^|riYa}KhiNLE|h8NGw1vozDH7SE7@^XEyA 
zav^@?>m-yEAU%&Zbzkk`q6=Pu@CewE_jKOeZNPFliu+2fZA<_k>6Dss1RHD7fkPIP z^%IubIl+j8UXu%IHiC^R*U7o-prT@@|hCe7w z!4C(QyCTvb2>QBmoAd_gnpIEG6rUYBP1G8LgAB9*$40OIs^pQiCD2Wym_c%UP_Yi+ zAEdw>t!Af38(PuyOhASMQ8}N_mOj$6ccQ*KdxYY4i&T!A3>To3-t|M$nM(4O?>=)T zLLwrV?elz*)SJOz_xmzGZ=U`wRjU+;%&j<~R5fexu7j&iQyWa=^1xQOwwgVomeMrN zC13|J4OBicvPs}Ea4^t7*ttdP$Q8v59S#7?z(@%55vQUi8tG3=0^N)*#vviXE?Q7@ zT$k&YH<_7HsH-ro`ht%HC}?Q2gBChdrB0e}PE8NJe`RR1B?phKC%2VU3g2xv<5QQP zhGrTk_1Z?#d+uhShm~{NxTFBdTr(40U0od~{3JtL=s*<6E+9LRqARhncH^rM#-*j6 zP^2M>BT8@5`;T$;-yB#y;97tzZnE{9vAB<xryLrt<_2l2ohgczt3B{Yrpi#~t2o zHH1R739V$v{)8`IzC_K1%QFFG%;`L-B_0Q?e*~a57Z(==b`E1EZ~2$&LU-=eIaCJA zI=*%ss0E71Gf)vfl9&QF4D$wOfBDnfSs<@~B&1##@xk`xR21W|`0eT2LvG+9#fFas zz!%cP$dwwEBGN`R>qG$*?&am>|8!=8WAo`RK?^4wN1=pcODNTWwi?)bTz1}MQ+H~R z^BNH>SwHSWyZI3f1uA&+s9Z)IV0oa<0nn&NSChHU0JcnckCL`R;RRlpJcO$)XSojJ zZ|{bAfw_jxHjvl2rYHDiAE|U%}q`|XMp zwJD2G{65(-{ay!Q`NDA(3L9>2J&fU#!Ci3|p@S(v->4xBpR%RB1;V>2_qHh8t#vVcmC_B&?2)(o*L3Y=t znSyKmX2ZbmIg?gs>%e@^ zzS}-au%AA>e+W{)5S0lpYQGptN&0QHi-r+&+O}6CCxBG5%&D&S$>MC`!PC)MBEihy@ipltA{mo!K zdI9Nlz()7uw+iyUfn!s=k^29!R*7NA$qLgbf#wWOS);U6o}M)|VHNGM*4De6lc}Eu zpZ=5tj`{7;)pNQT{TFiWFp)qoc~Y(_uRdd)2GS39LgJ`>q6+KuJEPbA&%qTrGc_Oq zsN<*uQU)*zXNb>-SdV19tOyW>@Jjv+>c4-j!V3sc$k$MxK~_T>sZUZe@grem$@uW~1xO(Oq0{SZ238KJHz?SE>+@`UUFC(em=dhhR$`^uA*o+m!5i^I3g&swtR|slO?!HB zlI32>%oc3@*A&tdB#ISN(8gHA%WYU?V45!c<1G30oo`tPjeCB-liQ z`}suoaPszUAP%!0TP=2m-J^QyCQhfv^rFyqoeUQTwCxqCm+XJzNxApF$I;y|Em#;gt8)h|NU^abro`b7ykgb6x z;y?GU|4s*PY%Nz{Fi!Waw}GL`e@IFK`>HkW-A!h6vAIUe6*LxA&RHh|npuU9_|u0( zFUfw6@qer#|6*dzx>pSJ08K2i$z}bl&Hwo11hB)>M{b6Qo;UpjV%e?btOR7bvNYJh0`+81gkJ9!`<^Hj{DJp?UtDm#ojfveka#|FABu#TEFL_XGQ46^N6iRBt30 zg;UjfR#lfL)V+myeb?$yTC+hZ4H;WUO0)1>09(!~GMD6GTfB2sld9>GbJe8Qq$+!F zzHozzRJOMCku@dk#U}axYXm{CX;DTFxlNEcnZOaq!`@zss zwB-+`+wv$6$E~RBhkZHFH!NmaoiWqiT@W_Nl%ShRw#wiUL}eb+TR3W@ zPj}xR>Tl0qJcioGu;q7oVcp}gK%ho!93A;>>ZG83&9Zy>%j!q&MN|}IlNw*Et)#uZ zG>#B@b?a4R{)Uyg^=~b*n3I0ygH}gK-iB>Me`ymMsh-UXm 
zoe^}qpaZH-^S>eMIlY2pZ$|j{f_vV)gky#jI@|XG%t^uIbx->GY!w9sE0pGco>Ktf zlo(^VYG}smJQ&iC0M)jNUud%QCNQ}a?XKNQLyE8s5LK=)6EM#=5K z7M>CLiYex-(Oz3YRdd=`6Ht6o}?eHHhL(vTi&r3 zzjC4VQ0sQt*984uXBFf`){I_rV6%T)hDXUo=tuxW^23Z^76YV_`A|$FxY~0RFtv%K(oLC>u?lZAOAtzoOYDR*_EIUd*2D#j{sFsXF%0)BN&0;VJzY0 z^+4#rbn9?n1j~L&)S#lKHP&X3p(A|2{Wc&J++_M(#g*D{>Un2EoZnnmFVHyH|5356 zfhSj#kKH~*fGYK$InIfLyCOFMgj}Tk2%2A@dkrikd^zyk58j-esX1N}EY?hG7HUeJ zN4sc^N~Vm&?{j8%whj}21;*MQV4-w7q{y+Z^zCRpn&D$Xv3kNezhu*eILOV;E*g^8 zcf?r^x26KhSpXlMK7NK6$MC#)e&Q`S-K6OERF}{lpOn!G2LnkkfI{<9xF+X(Xg zuT;kvsfRQSw{;8i2l!8_&pJRp)nIF|d056EtCk160JPh}Uc8q#_ao{XMKFJ+hUip| z8DX(rqzbEiD)ONte&0*W*o%leROD6eP~pGhAFsO7Y!7YeQ0`##@|Ug9=Xz^UVP)LJ zlnmWyDQ7BJ6CyMOQ#j8k6oF<16C6C41niut;Vd=W#QSbNm=8|B&sYrJJIM|Mx+lx| z9Ix|qpzt;Qa|q@mYiK%nMSw#c8+T`9*2Lkl1fm%7f**mR*jxqn%P-+6=7+WD7yKhR zM}VNdj{AdabwGm)=pi~nc$R^!f*?HmBuIzjC1~)_DhW_275{l7y`Rv_|A(pf0LQxT z!^W+UL?|n>G_8`Ay+RZvo9t0G37N?#2}ww@5)!gPWhYcfl)Xu^NjCYP*Zut8_jr%v zdhX{w?z-gK!-IafjFDRJJKMS-iW{YS`$XIAgeo^(}#& za!B&*N~jWBI%5Ww?1J|vwv=*mpUcp?+?wx%O5ks^X90_ToFnsDF5-yC9Ou`gVbUXF zbsZW(HHQu#GH57(pP)&i-n+Nthu#@x>Ti?oh$7%~(2~IYVz2yir;%p8NS9PlTbm?T zEY)JRCi?@RH^Z6vdVGurWscp>Or)ZJVdQ!Ok92dF>Y-&_POcDz5r5*#hoXV^$>(={ zck>Wr6w1@xlXgk>#3n10t}8`2Qc_4Vc{Pg5hrgA=%Sn>U_373 z?^pg2s640Re!a6RReWtujKY0=zRag_ z_Hf@hvr>N1xgJ8M2_}e2+MFSqw<=HhnEZpEeqaSV?~V8P@0InnwGA z!A3QTftwQS9bq#%-wyZ2TlrdIiY=?ipDN^eMrwTPM0U?#-H(6TZL<_M`-qJM7Kk@h zDPMflHH|#aLr;J;^!bpfah8}_Skl57xJwK8`r7XdT;PU7&Lz@bjo7Q7^cAzh&kMu{ zDIJoRNXiV1;JZD|FZpP=laEHJ6KKY{95{)K817fg55)H<=q$O=}c|gQWx{CiQUZ1h@w@4S2W9|H7RFqT;aAme(*GNa?`ls)VwPj{2zXNw=ch}Z?fTV_iWYG zf^nIf+ia%7td7k3$4*ES844Ucc+mc>d58ixuzJAWLG?7XhQW_Za&LC-iB;PL{m+!y zKkqM*5HN73(i!A$$QV}l&!DJA43^?>zB<$E&8}Q&qG_GY9D4n2r&hLmOLMgN`CRv_ zx15DmbofuQP6ClraZmr>!KwYwwM460T^$#Fl$Y<}bbhB$=F}sf$bNgi zCN0umHVKy7)p;wN(-CIlZtI~!xi>K5R=EsOeW&MWL5$r)~Z&$ql;gLXy{{ zFWz0yo}Ts$ZM*h{n-UoyGhe=eRh9Fabns)RJJtppoGfG}c{8dupFe%AsG9k`w7#LB zkbU5X;PdH(G-Hm1DRpxtUxWYDx8~QcU%z;SE^=%EE*>N=qa}K|1yHdWlBj2BA*ARV 
z8xxdc@Xh*cHPx$(LikeRb0-s9p-B0;6Y%1}xEN+{aq&?l9WO5WXL2=J`gKRZeo8W& zB>elJ@g4W3c#;%dH_lPISX629g~^Hho^1v~C``Oe^5>u0OzX;bedgCQUW6YGlw>Uo z@7Ig$3DWNQ!AhbPIy!_N*oD=7jVzE6M47)n&M{bjOWu{Yif%8`DEI& z{o{o+{W#wWEbh^Qc4fQdiuAVax&9V7n3d4G%;{Y$efkmNWgW0^I$-B@ELNthC5tE7 zk>?WPw^gwL1_YjtM9Z+Vc!0I#HJPce->)bQ@Zg4*w^~E+(I2ehVN5YSeS_Hm?1GOQOTZ1syv&^u14pM#$5O6 z51+Y1d5j^f#7im?I(Hm5nCKih*P%!9PS5c3wLV}aFAf&kbbT>9C3k$kg{v!JtR+X_ zF|IQn3;qg0n_*rY`6G{wsGmb9S{y|ChT0k@dSanqu0q;=WhK zJqC?}v6Vi}yDhq#k1#g7-*oN64e(-j;|+ zV98IN8r*H|$H)y{59GhEs#6p&T)- zImaX&%4AwWnQ(z6D{1A1?_#*hNSjJ4MO}f{be?|G{ARdm^XO!j?0vP6`jNXsL0zV_ zV>{c`tCkZQq@ac<7G-zF8f;}HBQL$Y7qF34J!~{2L3S)z{5Brv$dwdt7ES4!5*IQJ zv&TLktAl2t`1EP>j})?Nr#O80g79)&;8i2~;6Z<&_Ec}EmTkUV18e8OHT}3*Nfq*T zmA&LaDBXa0+0*=qNq(Z6b2a)!oh#ds`rM=09Vfxw%Z!dHsii<^3)txIM2~}qk##lut}5AoZM320i;x1q z67O!5uN9Tpz+Wb!8qI(WMw#``o+I}T+cm#G=qyDiF+f}R_Q?m%f5Ug{pVwdl{2&om zEJ-6<@%x|p(hz>QJZUL8H~*1={ZztF@vVj4P2b;&3{aAl5fMd6|7r|7Mo^{EAeDWr zr685vxkcu)QXsx@?bwuL4%D0z3KF1IJ9k#OY5sM>(> zV}eAtWYSk1rZh+$kai)jfcM5p-V(#S57WzpM1r0*Cs#R1Lq){6Twd<_0TdZ1IPk%t z-cLoSTHs#wAKxT$)d5Zak&8FO|CtKJ!(j}A51l1xsIc9?NyG3<`{c6kS_K1s52E1# zA77MYkQ+dPdno#)yjr9=Gjefgbd>H!)Bydh?&BB^frtUZtlq)haFDpX^qF|YHfG-hPpQy5B}+eqrp8D^GD;F&J8c}5O_YJ zcek^WsVp33vW6G!#5n4-UB8?V*?5O?Qn$(3%Sp89jh}JSZL<7j8)rbdf<@BL6EO7K zsU51%v4A;U-C4Z@;ph1H`QZw zdyKic?4|<>$FVVGW4EwbWe9J>SbL*XrN3Mi6v}=wBW#sPWK0ROLYn2qo{g zYr?p4>=ZI_);JRowRedyn>v%J7s=qihCz~yj3by>TuBO-FEIpSXt=nklcx&~zl-e# z*UR`NqdegXt(|DAsGp~sinLi>N{D)R1OIq;&wAjN?hii)bDQmZW;r8coI@c|0lucy z&l)#P6l4}&IM@g!DeY9SEOOKY{X|L(N4V?+x)2S&SpRZ#;XgD z#~W-7Qv?`voyLfugXn+eQjKhfKemXqj`rV365pIT0mB|dgQ!JUKqv>+e2KqUXI^)< zHIsw7_3rvQ&(UK(V?m{CpvY*NOYeiaW_YB?3qlDY1tbt2ZeXDvZBc6Ov0>Ebm{W6? 
zsI>X~<{Q(RBHu*YiFjpG`qFv0vrt9G)j;JqFkPQkQeSLk@WHX63cF%k-zIx(dM7;(2(5uy5hO@Gd79+=ri>h`$ouU?xwEsN# zY9Io`0_j4bgmJXGb$z4!uBhKHIk*}3EPPX(7=d~?*kng>YsU6F#C}JKh_aGIA2|Dt zc@g~XYV0c6Z`?SKVd5SlqFA*DSt57}bEJjGky1nDNLrMT%kf!N46B2erN&IYcqt)8oVgoB-050{boj%>Ut>L3@wn0lSRtexXj1 zMprz;|A>Oh3~GRN7XJPm(W>~qyFh|(E`I@M3)`k2b~o$vFUy*V_|VXIm8@F`&@Rm1J1z#Rw}uEZH@iyMDkPbwgY-nk*&VJu0L~a2>LedTnF#L zppX!&@0J%c)ZWe%4Z)m8g_+}@CMc4C)aj`K&mHz-&s;c(hAZbQIf-0;r^$wDtPPL&ie#>;J(|xH{utgvDZlYamLlWz@hEAoC3ogQqjHu82Ri_ zam@R;yrXANn&!7vf5V8{g^`Krpucji6%14T%TrZ(NJ7m!pH251C&bYVtrG3dY4?Rn zc7TV3!d{5weT~RwA_heC`N(6{LeNRX1>L|sX&g&)GdTj@lktPIo@L zL0U5dntA_KO?Yx!zMXqtH|&vxzl|D|96Ad)ynaG+V4#qt^G&1@@iB3-&lBKWX466)P{n}KEi@&;Z@P*cqexG>?y6^)o4J@>y z>ZTH`dM39{LEJL`cIj*N$auUUc|kySanHS{pLrSu1J-FDs_hd0x3;;k-pUz>H+M>E z@goVWI9;Wdns+g?v<-hx@u!mAo=lGOj>t(1nA&&s4$4zZoXOZJr$?(jhU&DpiHr3a zz$K0-+n6YCPC&MBUQ!RtzjWN;rG@%`cGR~KmnH6*a4x`k>ixO9V~gSl7Vv!ge!JJ6 z5~02)pqctqK2@9F}1$n!_zYV{$_CK#9NJrje(eE+eCrqqu6X!Up=mAPz828sXq)Z2SwwuC`^?(NWMhhElV)hhdW(`rL& z*PU%23=>*k%KnHCtmM++@KF)QoA2N>Q^EMUa)nZ2%nz@$yHb*b2$P4`Asn3lAWzZp z_&&abG(@$lZ~UAnOMQnA|Fd5A{&V~ux)kzn9B~vauHOy8 z+z@Y%V|1Ia(1#`lQ!;=CPz-^fcRwt@aRY&ZcV%d*z9vE$z>i8FAHk;#trdKwZro{y zH47C|=tWXLpykTxJ@j1_2i^{xSlc$ZR#MiUqLaf1m01HTo-u7LZNQoYQkSKfU7Rla z|Iq>ncZ!m{5EGX$?0ZHSv!y=y+g3 z1P__|@K?)*r$_d71#qyhmdEV=8wa z@(;V+fP+LKAbkLm$_c!IV{enyW9j;y0JX0)sU9NvRK&N}>jg!L$(2sHHTP7KCwcPIDd`5=w%*tGG zQ=V>pU!!(VpARcduiM1$Mnlz-@j zqv;aGO{kRNq)8YUh7{%7{C$Yhr8u&JnH3bCCM7{|LJ%s|TTD(Za~-BtYO_tx-o@+4 zbC}?lH4s;@WOqP?(=kx(NP}n95*5`h@{W_?)jQI6WVcV;Mi+blhhb|qAbBB8`i%baF~5xGPuFxQpq+h`p%~l=v)8p8xeS z#%||te3mUXh+JRSHcA=4vnSJ~>uNw_vh?YnOVsd;lzp=-(Ns82%4zW6Y$V1ELs}>7 zCvSmp08bB(Ie?{)xp37W(tJr@@7?1zAjcb+ueU92A$>+x7Gok2ZEcrXzumjCf&nVv zrtK*`Dkt1f2OCH z0vehpcr8Gxe*eDKqi#5?Lw!~^?t80 zZrtqdcQ+hM?JYcW1GB;2@Rgu^_fsS|9?z})sYu)@Rye&T16t}I~7%&2zlXw89 zj8gU74!Z;Y5o2c1;m>cJR99C={=~T&DwrNPaj*1+KA)3>R#8C#QHkNj?2V6g7|UJg*YqqK8flV1Gew(gT^Mk%@aig`Q_lWdEzy2y@Q<_%C|dkFAZi#Dti-Ek 
zYZDC3oXRi~d&TWG_%2j%A3l&E7u~${dwf*R`%%vFQ~0@e-6P>^Lhb^`T5C}#m)-_c zY?f(3A|oSS?FgaP5Qz6wdh+`ST)N;{;`XjTZnrP*t+r_CJWz;j*Ipdh+ujB8AoG1n zt=T@5aj3p==rHVQdtVL(HGnU4FsPVopy1Y2k7)U|`su@vpyh>1vPM^ol_FUEh=06~ z1hPM5*FHP>J>RB95F0@KW;>=5dJ|Kf|d7fd}W#L7B@qT zbsM91?db^B*2Adri-s5kRp9w(p4@dKspIL@s_`WOzgp4_go5*cUX^uCRWs}0o*R%p zf*djyXk1$p4^}W&tcg+O!6?$y+uWxbSwJX}*Cq?RWbYJk>Pf6_-;14-J(PX!MHc!4 z{kRZpO;F2t-ik34{41zufAAA9KK!(Z$~MLCrVUuoo1w}$43S;2t-~P5$rN&4*Rt-J zDz|*O{idn~FrkF|)GPRAp2YkO)H19eKUL$3qrup$JnEW@`%PHCP?qXnz6N zBazXN_#D{hB$+qX)&D0j>UQe{-tJ)-+_<~D4=@G4%>2<5AN?;3Y#3>gG44idB5x@-Hs5(m+> zh311eh|hkFpwGiVFih1!26qm!{3XM=w<#mDebc$x#2U+9J*uA!|k4i z9ZM3v2Xo{wL4X<lt{*1lj?lo?JvBSo4=Zb0b-@Z+??KG-_U#W z!s9M~J_ix%xeZTvycxlcik+Fm!>sG|ReUFKCx|dWCCkw4zW?HTG&#bltbF=`;RpZM zf*JJE{r-F@sK%U`lL1{igkSq8Q;h{IuU(^r?Gyy7hB@_E9v5Z#OW@r-!*y_&Z~)vS6jON zydBGZl5~-ak$1SBsS@M9L9Ts{<Lv4+x(5B8LmXREdK2F#kRUObBGEt})d)jBQaqFNHcBwW zuKQpSgP)?Iy`3Ht>|I|*n)10nhl<)ith%~Sf$H;bKoO9S;KMHSKjTyAS3soL2SVIX zGqx!FdxpMkCa;~sG}7_F+U?$W;I4-_qVf|nbG1)mCxPKf*XVyLek_m4E<&jNpKtOm zWCr}n->w+!dGX-94gJ~3^zq$~M(Ey^=u%R?2xg(VYkrCuhn`SVEPI~_Do#`ig?p>n zDN2Y0oC3=HZzG7p)Z}PP&%TLpo2odaGz$?^^{gvFm;f6NVUqG%JST`jH_JjFPv+{) zBwE3W;aZU{wzdVCnjQYKxfB*qVMGrc*_X@(HU3f-|1aN~W}Xw&TxNJ|8f25QOF?~|P!6>+LOkBK|% z(vup;klPdy0L_^pAVS>(ZUNAk-TSH4%}M=_hWY&asW8xcAJ9eb4W1Nu7~tHam&N)R zmzIOD)A}&6gpELOVIZrwh@Lu2UpDJBP=RC$R}Vb7jL~*ME`M8Shp$JK^j-U9Z*a!m zWT1DYNrRX%nRbw2B5J%;XZ8{0 z<}RbMPnA?4S&;V+MSpsqtkOWKJGGRz3nv*{eBkVZyoI?@98@$QU!EqWq%>}A!S3+q zE7A%mW75;piM|yqJW4&kA;25>;5Z^TT-C6up>Avu_5hdFSu(VGN>Uq7QRBtD-2SG! 
z*PARjubl3AM+{uoK1MqVL?twuZZA1GYttJHfpdanKI65u*yS>j&l?H;E_t(Ru03=!erZHhE`n9}VS&C)SKc{!1^x`!vmn(gLey467+ zM!z|qK}!(tX+7A6K(3l`XS+x`^_Q>TmlK+Itj`+zzhjuc6%|cW$r(_=?khv9!e-vE z;NrtVm#sjQYDwo*Mi3++4xDG8$hn&<$ z%I(qvAe=zw5}xr@epgJMpNxbD_)CpzKyPqqE$LdMNR0@BT4=BuUBQN!F^AIod(*+A zR1t*%EWQVvhP3+BA4rH0Wk=U}2{F5-WLxm@<& zT||7pCMG6r?tglNDh^Qi(%YYpf)*fq>MdziaF@d#GLefM%*?5WcH0Mzi5tpr;{Bt` z7vAXj8Ula6r;dNTq#i^@I;j%z{1k3ahs_E^3;1@xK=3!((jls^=uU69XbX>u)9l%^ z2fuPr$MMtCad!!^f5Y#C9TOLcMFgvQdU|fYV@7Rcs5$>j1E5TuT?yom+zA(|Dq~tk zHl|G*M=i*A{s_*)Wg!gd_NDEx_tOrHYat6hG6m2G8CV_#Mff+o_V#CWkYsJMO7` z?0VF{`6A#|+cj@HA1I=(D?{82?IXlfnEC=IhBLrlE<~*e>m~UYNkE(0Sek6Qr>r&1 z5-DT?<*=rU7;q?-N5zp6vDp7U7pBg&57!T1+gRd|FSQD1ULz?j8$5n#@xcRd5G@~l zIO(?U^7RkOc@)u1Q=+Q#ICthKsMX=0Ne+y~?cIfQd*1I2H79KIP|H{t4L`Hi7G-n! zxb|s@SM`Gat^%2vE4I4y`yyJFpZP?NfnT^*N^ob4RdZb*udWpgnEb$B_UyhQZIS|X z6tK-i5o9co)nk(-ePq;0^W#?liZ-^k+^DOtdf*hZ?v@n5NyGcZCFWBdU#6_m)s^^h zpMi6Ah4BYJRJg+)O?2h>e(hcNt#0B+w(0uMuA^eU7pkAoUg11 z>w(7{^cO}Z6^JGzfpV?V?=>W6iLpkB21XkD5nmuc1Lz4TT|X{J?pyEkh9W-e+T9@v z-Z)6T+b^C$Rp-QjHxqVr<1377zrmDS{2@{+W6bA*4YIa>rHfAdm*WGMWkT7)Ww^_bieT2@Ch!`zwA9G{^krPH~^zc=0VD8F97~ zy(niWFX=hg-#)bM$cehhf}JWvccA_U=9&qI6u60W%-F4=P5)3PlLZByQN`mw$NFzH zrs{^DmQAUd_h+g(9QBO0{Aw5otLrm^?12NXKSqcp-@DP8w`&Y;B5YU%wr9_YiocI& z?{Y}oM*kw^yV;jN3}zo>m}EPzYeZq_Auq`3no zXoPf}x~_&7iyUS2zM8^eN$VKwtrTG1^R?YRkovA|anEbp3uLV_KHM5I>;ABU767Zt zO>lH~9rE*8#6@F95?2&x9A2oKuRSKaE+Do1LuK(#lMGx0>d@W^*8kAH*fAPNVFNc{ z&IWl37S~RL>KfO^?fqbA8pk$=WLaRyFb`+t*0FaJ0yAs+zz3E!p`q*3%}mM(Bn!r^ zES}VruNVX<0Xx-_T~kVz0R>;#BS5s*&%Ug0FNDo@f{8C_bWR???h#Z5Gb!m&&v#XR&SovIBQK=?rA#MZacQWh=1`8y%je@AMXBVPzwTkloW(sAM<~i68Vgn!dFrqd zp><|TWRNZ}s&Nbu%cb3Ux{Ms{Z1Xz9_MwELc(D-&ZC6G89GI+INtldWFO7my|v?Rmy<5g|AgAtz&$OwK(65$L>vcrosURn zGdY4jL*Mc^bbiqb)$VCArp88@PG61!Lu)<$F@|-i)Xp1Jck2KyKR3wiaTs(?eb0dZ zc95}XB9Iz5>=Al}0jI2k$!ZyYJa*26Se%Zsiw(h+5tlPL^ zbwrqW{kd{_Tim~7E8l|#yI;pY{R(o^0 zB6=p8k}swPTIB2p9o`7efQbn;8|)K1YpT%iKnDUl?W4zzVNsCaNJjh?MzL+vF7-b^ 
zCCcr)@m&)Dls|FED23PDSz79cJ9Ytol;oNEcpYUn|K6;o%|)4M?9y`6qtZQk{I zr}nbSHo0JxenUHb_!Z4<5$aG*f`>!fsqTMp1QDC*UmyB5=~NG5j;nv}>80(i>VJ~P zH(JgR5rt+eq^%LZuKsrxudca;U;fa$-2wlWjL@KXjf5PR`I|O0_QWz7vaJg0>S|>& zNKahEo$7YhWuwNV=Wp$k^ppv1YkL|J5Ba3hq1wSp*9Q-_nC8lzTi>@1*${BS`PDKj zO5;3TjD?r|-QuW~Ji0(E63UdOkWbJlU|t}`?Tgw=Q!Yo*BT~uK`W~jkR}Iy!WWTa$ zJu>$j>rv^6dvCgc#)Tvin%$mnxn)C0(AOI%W*GK>W<+^iKRS8{6c3nO%;CT1n4v_z zM?r{E7S*oDOt<`*OG1zU>qrYjuDfg1j|Jl5`dfB3$erKI6L(*bvG%E1>ey$z#` zWq((Bru(SbZajeV7A1ijHhXe1IlzB+l#Ik(i0}%hmXQ1a8xuBB5pTV3Y9y@%$eFnP{Qc@sK4jwS!4MFtV@J}L_!Qf~RP#eUC zFZ6R{SfR7B{Z%%A9u7L#(i-m4O`DiUlwNqpSLSXIt?1}zxF_2DioLBn@d0NUwX*lw zJ{v(`{D7bB_U9Kh@WCN~Q`P@UbSvL&uMDhx(TlA8nnVua2*7KRLO@D*xa0AD;06@w zqV!Iq8#?H01z<`MqX27*QA0H+vh+`h>&r=T)OYA{QiqOkv+C5?6r;HwLz1f*PGuG9 zsp42w7rf;<_l}l8D}3y+bNi&g%O*XuGG5nKY1_-uBPCq>JHIeuB^!j^Kb$q;0+#h%q(u!N zOk7+X%CYk252|D3g(OF3RaI$&<%*&wLiNvI)X)&!_Dj3;yVsYMB^sqSUIswEs|mL3 zY}y3q6%+?15i{cZ!CVA{OZ(89F*8!vjaDovAN&&Eq`q!bTrqO`#CZ9@b>DwWt)m;$ zDOuGM;x!XZ*l!R(5nlaI$vr-{>E$JoeT-ay9!6yXP>&0+aBCXS*;S8lQ$3h!uXe)h2T)3)Z9 zoFuZ%jR|x+;FLAT2pUT4+hbnyWrWe55m^w+%s@HQqSx%a-G$zg1A*ie9hjnttZ$SZ ze0Jf=yqW8d%QiE;N9u!U|Lw(k>-x&oD$cq(-IRNuM@c9H3eth1EJLZI9e!RlT0~V`u%pBe?Gxr+Y z5X~M40$_j&HlWWs!(;F>QGm3}Qi393U(Ilfn0-d8_N(BO9p9AnEjNnr8((`u?sEJw zch?S&j%L058^qzlv}x;Df^9#7Mb*x>tl}ZMVd?q8eAmd2UZY{OV?Z<4{=}Qk^sY}H zKrF_QhB>=Q(B!RbCjp@w|M_Jw` z`S>UuNwM<4Q$a5eI%9I;b@j^hqTS5?@bGY=0y}HK94qcj0`y^U8Nu{DVZ9S5Z!^_i z0jKg=VS;<#+l?t(0?PGv_|6HQm=Z1xlK`~p$788bWJ1oUZ|rVq#s!G!iRtn7S1GTh$_@zBd?NV%F`N_^TrHs~XJ&;Io-av4*` zyX-a}aK~v2y=M1@=5vRKRR#$e*-9eryc7HH-<1639KSAo$1~VX z{S0ZntUAE8(Gn2meP4z(q52G$zb0n7;f4CbW?p0j2)wx2z7b z^00bo4BQP%7WfP+Jm*cGp7yjb=s5Yj?Km1Gg6wgx1N^Kd(pOp0_L}6!e{Em>Q^l3Y z@Z9T)sm)Oy!aBJ*OD+yC5DhEJdPTeXoRD^KgB>TQzqYhoj*mw`vqDClwuDZAAp0zq z!4aL%D-9yW)Lgh!j95ru5jDNV5=J_05yEF28s{rdVXfR>TJq|O%{PGINW&o3wmABa~W z=Ql1Yq!n13R@-nX6Bs0rYj{wI(mzgK<5i0iA)TCtvb#f6z$)~(~f-z^yBL;*2(~Xeh&&Cf7Iu}L|ar4<5m!mh##H6Gy%)V94jv*`esGR=f zMOJr;_sjk0mOK!Y;N*l7Fn&eU==x@&QfK)(0jr<%P%$YD+8K1KNGIjI#A}X;3Z?Y6 
z1Is&T+watxX?ku{g4u-4J%kMgo>vmh)<+?gfd@T;UH01rWdRzV?&xNXld)Qk%_eLV zDXm#M*2O)^WHOJIv0kBUfAhWCKQ;p~ACqMEQDTj%lTH=|{mduZpW=8qHQ zJ`|&$*LPEwMw^EEov=u7^!Zuru`VB|%9!f^Z@1r}Bn?nGFQ>yr44PpVf}ZZD+bvBd zilS3k8_Mb{%z9Dpf|gM&0A^H?A1z$F91ZSSKZGpnRy4X13}_05-ajw zg$dG!;UWYzj@|U*M)7;D#IXM7PphVU^?v2|vnK}y5V~e}!6joVN#%>f^7G!r*{dqv zp#b49aF4Bo5-s^2rgXn<031>}G->qfOhQ`;tbQUatcwkhc^^bL@}Dg&ap791jDQ#t z^#HP&Fn^ar#zFX!0#UzoxHUqch0l}?D=#6OYzimc?E&cOD{XYM)0y)kfEIjK=Rz>> zyYXxQbHYl(toV9A-+TPRnjr9ie$lrcHVY(NxMR-TE88{c?*2HKOp+r?h4iZI-+S@S zuaW=v-r-F$iW~z+ga!=O+86^SWWA&12~ct(6Z{Z;#ei=Lb4Vtg5T{Z0p)oLYWZ*2Q;dA7!q@8R(ZMyNR!7=5*L$ zU0T)IAu#$<98oLl@9EZ1l_9w1>Q2AEV*+@%#BksTU846+E&Q_~wdf-b=As9H3e<3S zadHjF7pM7U4K_Suz#}w2fZm4s+nxD!V_W~DjZjJ$Wq{bLD-e`ld2`<?pK3T?jwGjT!9 zH$4;^BCQPoaU+!G`iR2u$3e{l{nho>$S1RgdAexc3}r9_<35)I$VDLD5`o28#cb;G zys)XPoqTsG4<&upt<>r#t^ebd2`*OE$C`?OuNmF%`gW4$u|z!J2h2i?R=VOjfXcb= zSJU!8+Xn&~G2k@cmrDHO+Bhg0oO8r&P!>=mkO!Ho^F4q%qfkGW*RQI+o}l|KH95u_ z@!VLz`QAYfFyju*xT@cZdG}Mjxj>nk7K|Sbq8LyQps~B2aft2#Y!}&#N@~L(arx-( zrL)nh4$Mml=FA|UZq74J{MeR&dIo+o-Oi`Vz>#5xvzuFyvziEgDkMRz zg3a-_Q&24OTQXAZ`jQD(3seExA?bts&rMO9!RUAo9z9`|N*F2h@{q}+IB|@dJSRqR-0m!|L%*X{G>-#uZun7*dT_&j$HnO@{x?zjm`Li^9*5R zNGyPQtYti7_9?#c&^f8or53aOk#$a#hO7#!Gm3$)J$drv+S{{mcl-O--RP0t06Frj zayrtn-@BNjwF0qfZ3BhM>DSfRj~zKuB#v75_wq)vc!DRXbJhA$gyC81*G#}EQOlw4 zC@{=(WUs@3-!l0O<5a!@aK0`ef|mnt!vg@53_%0lqM(OEdVSBbMLKd5Q~ubmTyc=cb+d7-?yfVS;&vj9RXK4 zC?J*QAZF?BM22XHS`>u{kZCbL|9cPF5%KxM;IDskS9z?Qni2?oIEo9XCA;1a5(5&)vq4I)7^#p zPZF4#_DlzBGqH~Egc^D9RpsW(VtZbE)_mgkV&^;W3hQNuu*NZ5&cr#ECr#r81qE{I zMR`TO*uDWM1DuMa1$pCtW;cjqaVg*Vx{;dm3 z+5TvN3N#hYJ3b(g-#D{UlKuXM^MHveC4#V?Lq(hos;d`cvdcpyP!kmD_jg=8WniQ# z^YK)3oS9KR6NONf2xdZmN51)IdoaUB?nNV!!*4$A%C8n6N=ghvl+z7#hCL6+ty65?rWI5Tlv^9MaiO*3!}vX#V8p`A2!dC<);Mk~!bQ*Bw@G@%|g;aL9FM zZA?|Ln}HJcZ(wP$4vK$MjW^Bz8nFGYGfLBqhg3@rr?dh)#;*C?=?Ob;@29WkafO`9 zW%yjU#*7#sy9A)1y_@spI}tLHz~pMrv`eYp51=oZ2-iNd%kOzZn`SuAZStNRJS+@> zVOUcu>?Om)XvR=;z%tuCf4G!XX+S3lOmr$|UwFc`N5#@_uU%|Tn~@q+g0g`*>G?`n!1T003rKGYoTXPNak 
zQp7S5Zq#o=3IAI@gcVpX55eDI(tup~#~gZc%~*@6$~N3Hoo~mq*ZrwHcp)0FFf+Ry zcMosW#>PgMkM1VE;+vEu&*JCjp%w&yhr|!-AUL|}c|StYG`E0o9drc~!304e*jo6I z9OgDU;)Bsu03d|Mz}Pq#w(=(VL-ea=WpbTNXnn%#6*ea!&PED(WC1w=C`TZDT|R?6 zM1r<&oR6^`zgbEYIYx;5IUQ=YhPXZ`_*CY;w}6g7-wDEJ`Q)_d(|0B@gI8ql&3KSn z@j>!(Wt=)sw=ecV7U0Zrx3=3^9d@gSpFU)lTu+4cdZcG7meH9w;}qeQz&Hg-E*yh; zQ{h_fSzH>lN)gv2R}wn+JpcCb6g9v(fcOv)kI&jjJ4cS8#7X-;S$gC~=~XsLU;aBY z?;9EkOT~IEI`ejgnisrz^MhgX^ysyVlqapycaQ;v3&c!HIp8$K|bym5D_epsWgeeNEy=Q}tuKo(J0LSR$6eAMv>s(n;Bq6bosZCfcs z65AbFhw}gi*dxc#Kf`C&tgLS+dcyfEj^s_0r@lEcXyXAs}JCKaH_E7>-I7q?^B5ToWlV7GOC<*GI zrlQTBQ3BoX>syh(e8GnpioXZ3L+4Yf4f8DEjdV2TdP}F8v^;;3M!NP%ku4KgHC_9i z6gwbN7g$rKL6EeI#BBr>b$d6;4t~ob zzCG|zitwCn`bJ$>11t%shdoE%Ua>720X_QF!OdFH)(bmngOh1jY!~wgrUn=}+fUUs z`w@W}+B{bOhOtFFg6o{!m?G-GHL(c}&7@*kml_x>t*lgd8n*}*8k_+_)~A`uDqb^X zEC!_$qVow$0j$tEuy*e+gn9q%wA;LB)y7lFi3;~W=(n&cBX{+%DiI3@2YI6o?Wd}LDzIebvYll33g#e$ zS*zpwfks2UcTnh8aRQfe)vU{IoG9yzX?KZULFgkdw)W)vi`)fiO6xhWOuV%uTm2v> z-=92SYpk(+@$?1_1<8;EY_C8cfVaVZ((K+XZxX%AzfUx>?AOE9znH&>6T~mWc1S2; zu(0;Emia-F3&*&6%G<6v^j-P^Um-WGE|&`^l%S>@ng8^yQa8uZpzDo)+#8MoDGDQE zyjjFtd`r}&^So&*PKOZgAznnd&)5V%{N>OuaT-+1_@Lkd{jj2{cSt|UNqy*i~Xl6U{hBQq5tNkB9dy=6C2#!9*r zASFCD^8aXN74AYa_}{R7m+=j8NMZ%Wx8099y&Lz#A2Kuy{^6zezW^Uw4i24LD?9UN zi*0SbUJ&|x;ppdw!9?cYxeoap$bQSECa-tn{y0mF2b@#lgf~dlyTQXg-(_fkm+FCJ737JXUcdb$Ng7ENSt5emK6FsTPCK6EmW8SUTTj^0`0VWL zI>Q+yg)~Y={@w@*jwg2j@Y1i$@z=L!Y+@$#3NaDQU8a^6Vr}jIz4Bam)|kCIHaZ_r zO&(2IJ6jeItdm1ywmO%hy4hb|B>Sc4|I^ zTyA>NW!if8JB!k}`m2|+t=8rPra_-fNu7D(v{pHgKj0K^xLHr1sLJzi@ZU|nChsDr znfX!5Jh9CPlQ9}#Puce>j&0Iq=jENdawSH4Ke9c*PRVD9kzy|wmgylYee{SP72%o8 z*zmkYrA7MiiQ{OTF^ATU)?1#sId!5= zH_p1iypHdscH0u#Z5M2Lu9)q7ofHXo3!_B8*ZQY97>+5YxTknoFdn_w8`))At!7x| zG;jePi$BK`qT}Q5%vaghymifz7z`H+7p%?=Y-IRwIO+mvlh6}MPclZnPhiPH^sD`K z@21?fHb7U2Pjj;4Idk)UmJPv(hA;wW!5;hLhK5=r?&J?2@BbKY$~dyrnp0@qv|pS} z=M*z#9A6U@Z+Z>fgw7IqiYZpWWWPd9(Fx@eo-u z)58=~niN5asofz@FPRMfReVPr)vhIZ|NiaIF|v>UX~@@VUcNFG*pf-nlq^x&e1WE; 
zBl){bW9CIui|?6-iEh`2pIKZRi+LEI)fh;Msd#DxTyVBomR+y4e!H{c$WjFbDxN>T zFD&e2w}Z_j8x|RiY|bYJ{N}Jy(a<0hP27vrKH|c4`f|w9S4o{K60R1~uU_>ti#f4z z_=_{$SvbJq_hw&So;;ta497vSub~R3g$)yKGPO;n<>pc&7+_XKQDY0(;?u;$=$M%A z6P9AE5rEm?{)S1M=x1l2X&l2yy0NEQ+J`QR_cunM^U41G=G{A%vY{$Yn_{XQy(sz1 zm+3&PL(YhC=({mr^D8~51*?7c-=!6d8SHz1)1=xzGVsnm68U{EQdaWoIU|lzmk*o} zAz>mTK5`^NRm0|z(X<>7eM1<-u@`<73ks+IZ4N0r4EbX>#@_qk1Nv`%pV8LqlFNRu z@hnNIhZ>MWob_dc$xwyb4;F1{nGF;~c(<2Mj$`;9IO+NGM*%L@%$f|?JT@=$F)J(Q zrc|P@{_^FTg1S#sPOo$v%YRS$>kl0c z>3^)oINjOu@%_y+mrdqAZmrLAiHYq-z8N&w`AR_#72o5>!NI{~5PIWJf02=~H^C%B zsuhR!mFCU+vo}aS-o1{v(a@m$rnK~fgUqs=j)}h*5xz|?tG%r?i`=LDjg>(1(~4?q z8#kWcOg-vYe=r;7i$LoVLmN(AU=wVTOpL{t z9ePo4SGSq4^5#ApD=XE$m)a$&8oy3E_O36id*vDwa}xPKQMXRh z4G38s%=z$2=X-Rlb2!-_O3_N6RxNLdt@t6^hWHNpBeWdElnY~YhjvSco^b5S0=pUS4^{Rfb zGUM$^FB>M5gJM-RI?cAc7{oQe3WSbq6$na5|LE+bVacL!P1fRznc7{LaS?C@CwCzh9X9OibO4 z)~hF%6Q)o4>aX0;@ErdvI!Lu*?WZSEqfd}c8MsX5ZPp9WJcL5Np>pV=XrnriEoeU zhvGiB`+mJ%&v9MP>tYrWC1s8H7Dg!Boy3HXz+>+lT!H=;n9g3uZ?~a+IK|xa_1qdp z4#HL9Rl(#wA2(4oOi!+=>(?sFw|{3sU4S6s=OlEO>Ab0ROeJmhmHOFEIBGC>=iEH( z!ML`B-_OfHYolyA7(1?bb=+=0es{Tt#dZ`z1;2C0$Bbv8>-Za9BMO2Jqhqg$AR}QL z8~2989)+jy;m$t~k`dW6(%if!*|PKDa(`^O_%D;?G5-FeYf(M!2+lYd0H9(_0aFQw7RYwf;ma{q37IHQ|o_$^cBK#ss~ zVew#^b(WI<=i=Cy7j`8;-sN)oAr83=gDotWiVC_c%co%i7S-QBA6DG@SmV3P{^!s7 zc~_bqvqC@<*<9mD?{qC%$_6Y_QaRL$>g$nUwhAww5+C0h!U!`|==Ouc%PSf;Um7dC zI1K!=X9=1ivW7IQ=l&7y;4tB>OsRIsAqrv!?T}Flpqy|@4SFwu&EF|hMdhXV*ns2n*MROT^ z?4(zIK*->Dmp+5h@? 
zR?105+1JaGm0D6VRMNVjgDoC53hfzySabp<}ODBg`x4JW~f77 z9CDXBnOi-mm15@o8D4+Ce=Mv*s)FPc?t+u>!R*>66pDRnENIR&ZdYyrfe?1Y#cMo{ zoHuSfQbe!;6;#M$=*Bs#j*je-2Y!Dz{%(pIv!pV7?6-Cz_O!nR5ctlyF#6do=ONcd z9sjB4<_49#e*Ki+@`UT%cC~NcXtA1xyZiO$m6aTG4NPobgNzh;`F2)L*U709*UWyc zW}TLlltjSdlj~pUf6N97hn#Y9&lqI_(y^B~of%oU3jelJKSh?-)wSMNAal;9qH=X* z(&Fn?qw-eO7nPM7E-nHW3fT77b~sB*Kd+qa^2PSo<+SV%Ux%tLW@McMc*TGuckD~Y zDzht7M0oxbc$V|r=qu~1mS4fI8s4uwMX!`*wQjF-Y%#%?-ZC}@d9<;`U(gkTNCvXu zB`;ql&>)eicYBjki`9~!V*hk>D0=)_(J-{I&;nliiC)xD4;Z*5+x+{JYM8p(;%|mMu2_QHs2Ueglpk{598bP_cBx7n<@(xtyYb}k zRk_AY;KNC6NCwpO4X}i!Cv!}S1_q*Ky7ijd6pA%2bG|44&e&%ggK%(YUu!5U+?p#v z)j;rCF3USw?B^CDz~P}00kI41zE%FU5td76-LR!4`;Eh4ag45eNxhqGrWkM2>Dnc~ zz3ycmzIdCZu3Ma3D2$q}^I(qZ7LhYc7UO3C zoWuAmS`da(%&Qs_sl&37`MJ47ucr9k>4=GoUyMoOK@#CZ8xB-dfXe;FVW%4ifvIS8 z;_u%9>6w{iyyaN7*w&U4VAf>Rvxr#p7)i9Q1X}m`@uM^Q&k+<57>in4nZX7IW)BYN zH^SN2g6vf9s6-Ms`ylV`E*L(b8&PxHENvK{#4nohkrpbf9sRoaD~{+&$mzi2#dbou zj0Fz7is~bqA73a<>$fg~<_AtF&9H4quSs=*@ECBIUXV33pmmR6fFymb_vLe1T zaa)v3q0m%ZY4o2zGi#j8UMUNZ4ss|D_@r<7C4u45H1Vsf-o8|VmcK_gYRu0jKLi$T zJm7vN3TQ@2ru~jiERcBNv}>^z0!{}k?X9gXoNv)xm$Bo}b>=QvJD;MY;LN&l0|^LU zu9no4E_paB$h$%n_%4xx+T$am`N4n~ElHo#)K5#xspUP$71cezK2ez<6jfSklW&o5 z5hvp_PUzJVvKrlN@@cmCZ8f-{r8}v<+o+SeUIQiem!SO=mp0c305sG;HmB1)~1hS&SZK=9hwkHgC>TXYh82B-9 z=>*7>Q7cWI>1d5(iaM^oKMGKCbRuGv33iN8cK8zP%lxBN>eZd2!_9K7v-@%m;S}L8 zlh@RQ0-i-kp3&afU$pFdz6?$1DP?PyuhHTwgaYYcY8wO!($>Pe9%{lk9WWSR>H~m_ z=KHg5Fm}Pg%}sdw_HF3CtWRuWs;hOKQg8|rrzsU-c9c_eD+zBFkVeR#sbYaL2VYM< zD>>NSlhQPsCBzc_K2WRn@nbr6$_rugw9i7!V_&>5vOAt?yIfdUNGZ#b+{=8%sFj%z zraigWYxz!uTu;^?@APyuC3T#xtJ=10ulB-3K`o zLW4WdWw>S13?E^Z)eE+k26yg|qse&qJi{pdQyBealkMn2kJIvd%K(}TJT4Bkjs~|^ zRe5MO-P#5+6FVzkF?Tk8orD4GyF8~_-Y znVCUnjYmKrL;5=b)%9!L&f>)w%mr@Ue0hI=N%($MRf3)bx+fD86J%D3T?EG&85vMd z(B5#Okw_{on!=6hiqY$`U#@W&VV6Gm#Kv!NBzsSUz^J^;ZXnah+eM~-!J^Jl40cVX z;CwV}P!a;<+R#AP;naG^D$(Mw)cfBYOn1a;cX%wf_WS?`gzXA{OOa&7%|}Ejy-l~J1v1pNgKPH6 zWL}FVAVhYn)Wgr1>ON-Vqzb_5=K(ejCT`rkZ-%%d(&aq!n762&X)Z@ 
zp20j!iU@_k8M?^V2x99>*gglVCd21mv>302%o{*-3LE;A zPX7S4*dx^j%o2MImc=$^#vcz>{h2w`lD-=rAlQI}Qc+on)gdUoe|}Q9L7Zspn{v8Pp(`WFL0MF0uHP~o#i;-1eKJPHJk>* zEra!q^axCBe&Qc`LxOB&Q-6tQvJ|{_cfv+kpOO&|@)3x}nlh+BK69u~OG@w{mr9J! zV;$LdiQ7}cq^z(Din~+9?|GxFMgvz|Smf z-5;7uKoW$kZenVx6SQEQ#h-E2q`$$1k0wL5xA#Nwk45B#X`w=%s0;>kSXhwkQh^(d zPTK-@Z^gu$)7SkU;9X7YFR=@9|5qcY7QNI5zF=TEiEyenCzA2ye)GDv78X5je}6wD z@2Q$}p^nqzJ3iR?R7pvH{RsbMeBPi=gd!qH_L&$r4_rx7fj1mC{y*btke%Fqebd1N zZcm9KV!XpH{YRvh3ZVyDmk^jfF18Qk8U;}-;K~GzCo(j%vPtHP7a8R{@qfPgb?mpS za9mPu{sFJ(X(m@b?gkx;D!Ad?kZ3TkdWy)*$k1T#b7vf)V8DSk3B+?CxUMKo8(j?h zkfQBGhX7L}5zO67(Vd2G#Xwg8ehBR17q^3#%1zZ-T1%c-r1TD8aDbrm2g5dwof_w0 zkP_JSBPZ|Uj4dN5(%}<|2$hwVK!$+mt7K@wo~@=D`-aCmeMoIp+;HF&2jd?=dE0Gj zOx7hS5G1JLrS>EcXR@J)-5gS|og7)ZpHO2iIU6L$GPX z>ZQ5&g$CCVHOTBttaIjjj+dN!K7QQ#$r#;f3q2UI8#mf_ zDPNUoJNXjrw^M(YR_f>guZ&5G2#j^99P5c-MyB~>$F&X1 zk3W0)*!W`(rsE0>l5=>|J`f5oBBVvOFBDI08yFdJM2-b`keyXt_m-vx7sFa=Jm~qd zac~qX^*%9$^bsE3M5e&exiUiVh?`9v$(1-h~-4B8gGC!-zKU+rEh~_}CK>7|lABNi@E^c@=g}fuX#P-eK+Nos+ zyhUE~BiLLtshXA|ue5fD<4dZmr$8@*112SEgJlIZ&*6t2a{=%JM!QstC3WyFsC$Or z!qa23r-GU8=St@}pwtaixVkhoDR{T)y1G#yGnVDq4L(~QFzXvZ@mwai+b#^I<+Ob& z#*gmpyxS~u2Pr2jcqVp>9gNbyHx6W3I7%NEqAhrWxx`hzta@mec{28*ucxvySG;%; zKDDg^#%+yFJB@X905TsmEf2UUddWi60P;~?6GfiCAIQ7VQQoQBQ zFCx5H0}Gb=LgjK z;kO~u4ZPIo`8g>Th%pD|-=U`i$#F*(1S;6UaB+y=?%t$^vx-XS^a~1uaq$A|N!*mu z{=YaOqVyy&&8;TpQ-njQoIodPKYi>*!=Mw-<^eKyDue<1Gnd$4=4vcv7@%wLN91aR zuDaJk`o++s6CPa1`VHrJ-pugF+IV||WyN71 z@Lmgo+bx;2m6$4KK!+yLRqnqZs41EE_d zJWq(+bs-Fs@lkR{07?ipK6ZoI*U{A`u<3E;bBP%TV{Fw7zAGD9CoKm|_({Qq*w^=z zrAlI8AZIf`TC+qShkmPsM8vKWQ-qkDn1F%-R-uvsNN=2HW?lvryj98E5!l{2b9vHlO3@+`Af;Bb|6ENQcxi4 zoR-J)gc!Cin;L`)Hwl9G2p|l=n;yDd-@AKEzQw+klutg)X$%D=xQlY2+?Dx$D>#YK zo7>6LxjA|e-=BsYv$TsAQc1TB8>;rQGA?j3NMM%vU6;@k6-5BgMDUF)VJ=*y301x% z{Y*k_eg71uI_Dm^O)xcO5$RQf`i9)Kb_Gkx8x$!ndLWw*(I%9LB{Fwn@bK~RvHiyxXI1Dm+S;;(I`(G%K-K$R8C~n$^y*mN zQy(S-qe`wc?*~)GpPdN3%dSO^F;x6C;Mt*2fkUQV`C6G<<+3<>06EEUGTML9( z?oLz~{ACx}aS5?QlY=DtM73J^_^3JRV@)*Nz!<(A>hViNoM(72CH8ef1KqA8;u1Us 
z$QMBbNKYqIW&O$Iq%kCC{ijeZYlVbPRD(fbhZ@F<0lN+y@2KcgK<42g}tp&>v; zDS3HwWi^&{Yv85wRF0`0kXcmpznl6z@3Hf3z@Gt+Ep{-Bj{)c;Pv8W5`N7w^4J$~yfU0-0`uza9mi zGCO4rKi^?X$yEXx&0~z{ER7~N;$pjz9+QIZu$FL6K`-gemgqHmIUJZ$Tj<#zc|30a z7aG2`zOHmIRvNBAR%yc0dAi;fuaUOU)!?F5LOJKs7#tJ$;}mayy$!eONKeA=Q_}HQ zp>F&u&V;>xg`69%)Q0?B$0;h7!SUOD?_?QrTAnR)hSTs!p#8SVUxx1Pi%@TXL(<;C z_#JP*d}u@rWVMl}ZTizD=DClmxE59qOUXN=DAMJ_k)n^X-CuL93G}^0#$+A`sJOcJ zEf`WF$HB=IV2DOb#g}*!6VV$OEOr146fwon(cl*%D+-+;4wP*;6o^RFzu|Ns5lk*qT7(ft+WdC{z zN*4&fsFayU4gU{TkC^`_Dt2Q&zRF@r$1y}~|@bFuR zGx_^d@0H70MNU`PoAf@{uYvwkVMz&6gf5;~NE^S`dxR9NNLT$yO^&FPnwQrS?ugv< zC><>4Nw&5CC7y!uLA9l-#<-t9JI8B09Vo2>Ha#9R1Es)L>``U|aWwu*)9cm&E^q2x z6oXgP1(^L;M4@*Rqt6o#K+pE85qzA93BS@qx~Q5n$ zfvn_Scr+OOPaXuCe-yEg3olP6*~I!C)f@X#-?u?Cuid9>7Ktr`YDr-&WLcvWXMa4^ zaz0r}+m@R5JyTB4do&w;yHfB&G>bJ9E=gxU&aTii6z_W(S6=S!F7GG_kxMS)dfb@o zIp36H+W3g;;{ZBa?wo05jcYMUYIu7=qfkGJO>9B~;{_;^ziIQ)S3`+6nD%IyK+TfX z{dI@pqK-&xCQ8aa)#E~LhKm8Pi>2EG~#Y&~1 zgCEV$UQ5RynDz{KltSU>TioCK8K4BV4P=Wg`-T7zN=!woU`R9|W=4wOO}{C}<1-Gz zQACt!Yjj(<3KXKvdD5*RA%kv%Vl1Cy1*k~s^SI-~DGB0TT}X&8@IO>0*!|5zqiLxC gi{k(N7wny1aK9i!9!5EjivzzjRCJZA6>Y-*54J33(f|Me literal 0 HcmV?d00001 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f577190 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup, find_packages + +with open("requirements.txt") as f: + required = f.read().splitlines() + +setup( + name="gowpy", + version="0.1.0", + description="A very simple graph-of-words framework for NLP", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + author="Guillaume Dubuisson Duplessis", + author_email="guillaume@dubuissonduplessis.fr", + url="https://github.com/GuillaumeDD/gowpy.git", + packages=find_packages(exclude="tests"), + license="new BSD", + install_requires=required, + include_package_data=True, + python_requires=">=3.6", + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 
'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], +)