-
Notifications
You must be signed in to change notification settings - Fork 1
/
similar_articles.py
83 lines (61 loc) · 2.85 KB
/
similar_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""
Ce script prend l'id d'un article en entrée, et renvoie 5 articles proches
"""
import argparse
import logging
import sys
from gensim import corpora, similarities, models
import utils
def find_similar_articles(corpus_name, method, id=None, content=None):
corpus_file = corpus_name + '_' + method + '.mm'
index_file = corpus_name + '_' + method + '_index'
docid_file = corpus_name + '_docid.txt'
try:
corpus = corpora.mmcorpus.MmCorpus(corpus_file)
except Exception:
raise IOError('Impossible de charger le fichier %s' % (corpus_file))
try:
index = similarities.docsim.Similarity.load(index_file)
except Exception:
raise IOError('Impossible de charger le fichier %s' % (index_file))
if id is not None:
corpus_id = utils.get_article_by_id(id, docid_file)
tokens = corpus[corpus_id]
elif content is not None:
dico_file = corpus_name + '_wordids.txt'
try:
id2word = corpora.dictionary.Dictionary.load_from_text(dico_file)
except Exception:
raise IOError("Impossible de charger le fichier %s" % (dico_file))
if method == 'tfidf':
model_file = corpus_name + '_tfidf_model'
model = models.tfidfmodel.TfidfModel.load(model_file)
elif method.startswith('lsi'):
model_file = corpus_name + '_' + args.method + '_model'
model = models.lsimodel.LsiModel.load(model_file)
elif method.startswith('lda'):
model_file = corpus_name + '_' + args.method + '_model'
model = models.ldamodel.LdaModel.load(model_file)
tokens = model[id2word.doc2bow(utils.tokenize(content))]
else:
raise Exception("Il faut fournir un id ou un contenu")
sims = index[tokens]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
return [(utils.get_article_by_corpus_number(x[0], docid_file), x[1]) for x in sims[:5]]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="""Ce script prend l'id d'un article en entrée, et renvoie 5 articles proches""")
parser.add_argument('corpus_name', type=str, help='Le nom du corpus')
parser.add_argument('method', type=str, help="La méthode utilisée (lda, lsi, tfidf)")
parser.add_argument('--id', type=int, help="L'id de l'article")
parser.add_argument('-v', '--verbose', action='store_true',
help="Afficher les messages d'information")
args = parser.parse_args()
if args.verbose:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
if args.id:
print find_similar_articles(args.corpus_name, args.method, id=args.id)
else:
content = unicode(sys.stdin.read(), 'utf8')
print content
print find_similar_articles(args.corpus_name, args.method, content=content)