-
Notifications
You must be signed in to change notification settings - Fork 10
/
create_dictionary.py
executable file
·56 lines (48 loc) · 1.74 KB
/
create_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from __future__ import print_function
import os
import sys
import json
import numpy as np
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from graph_qa_img_dataset import Dictionary
def create_dictionary(dataroot):
    """Build a Dictionary over every question string in the VQA v2 splits.

    Args:
        dataroot: directory containing the v2 OpenEnded question JSON files.

    Returns:
        A Dictionary whose vocabulary covers the train/val/test/test-dev
        question texts (tokenize is called with add_word=True).
    """
    dictionary = Dictionary()
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        # Use a context manager so the file handle is closed promptly;
        # the original `json.load(open(...))` leaked the handle.
        with open(question_path) as f:
            qs = json.load(f)['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary
def create_glove_embedding_init(idx2word, glove_file):
    """Initialize an embedding matrix from a GloVe text file.

    Args:
        idx2word: sequence of vocabulary words, row i of the result
            corresponds to idx2word[i].
        glove_file: path to a GloVe file, one "word v1 v2 ... vD" per line.

    Returns:
        (weights, word2emb): weights is a float32 array of shape
        (len(idx2word), emb_dim); rows for words missing from GloVe stay
        zero (each such word is printed). word2emb maps every GloVe word
        to its vector.
    """
    word2emb = {}
    with open(glove_file, 'r') as f:
        entries = f.readlines()
    # Embedding dim = token count of the first line minus the word itself.
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)
    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        # Parse the float fields directly. The original used
        # `np.array(map(float, vals[1:]))`, which on Python 3 wraps the
        # lazy map iterator in a 0-d object array and breaks the row
        # assignment below.
        word2emb[word] = np.asarray(vals[1:], dtype=np.float32)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            print(word)  # report out-of-vocabulary words; row stays zero
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb
if __name__ == '__main__':
    # Build the question vocabulary from the VQA v2 JSON files under data/
    # and persist it for later runs.
    d = create_dictionary('data')
    d.dump_to_file('data/dictionary.pkl')
    # Reload from disk (round-trip check / same code path as consumers).
    d = Dictionary.load_from_file('data/dictionary.pkl')
    emb_dim = 300
    glove_file = 'data/glove/glove.6B.%dd.txt' % emb_dim
    # Initialize embedding weights from the 300-d GloVe 6B vectors and
    # save them as the pretrained word-embedding init for the model.
    weights, word2emb = create_glove_embedding_init(d.idx2word, glove_file)
    np.save('data/glove6b_init_%dd.npy' % emb_dim, weights)