-
Notifications
You must be signed in to change notification settings - Fork 6
/
processing_pubtabnet.py
74 lines (60 loc) · 2.83 KB
/
processing_pubtabnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import json
from PIL import Image
from tqdm import tqdm
def _build_token_sequence(html):
    """Merge cell content tokens into the structure token stream.

    Args:
        html: The ``'html'`` entry of one annotation record. It is read as
            ``html['structure']['tokens']`` (list of structure tokens) and
            ``html['cells']`` (list of dicts, each with a ``'tokens'`` list).

    Returns:
        A list starting with ``'<START>'`` followed by every structure
        token in order, where the tokens of the i-th cell are inserted
        immediately before the i-th ``'</td>'`` token.

    Raises:
        AssertionError: If the number of ``'</td>'`` tokens differs from
            the number of cells.
    """
    structure_tokens = html['structure']['tokens']
    cells = html['cells']

    nd = structure_tokens.count('</td>')
    nc = len(cells)
    if nd != nc:
        # Raised explicitly (instead of via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        alert_msg = "The number of td (%d) is note equal to the number of cells (%d)." % (nd, nc)
        raise AssertionError(alert_msg)

    text = ['<START>']
    remaining_cells = iter(cells)
    for struct_tok in structure_tokens:
        if struct_tok == '</td>':
            # Cell content goes right before its closing tag.
            text.extend(next(remaining_cells)['tokens'])
        text.append(struct_tok)
    return text


def preprocess_json(dataset_dir, output_dir):
    """Convert the PubTabNet JSONL annotations into one JSON file per split.

    Reads ``<dataset_dir>/PubTabNet_2.0.0.jsonl`` line by line. For every
    record it opens the image at ``<dataset_dir>/<split>/<filename>`` to
    read its size and builds a flat token sequence from the HTML
    annotation. The records are grouped by their ``'split'`` value and
    written to ``<output_dir>/train.json``, ``val.json`` and ``test.json``.

    Each output record has the keys ``'image_path'``, ``'image_size'``,
    ``'text'`` and ``'num_tokens'``.

    Args:
        dataset_dir: Directory holding the JSONL file and the per-split
            image sub-directories.
        output_dir: Directory the per-split JSON files are written to.
            It is created if it does not exist.

    Raises:
        AssertionError: If a record's ``'</td>'`` count does not match its
            number of cells.
        KeyError: If a record's split is not one of train/val/test.
    """
    json_path = os.path.join(dataset_dir, 'PubTabNet_2.0.0.jsonl')

    # Create the destination first so a bad output path fails before the
    # long pass over the dataset rather than after it. An empty
    # ``output_dir`` means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dicts = {split: [] for split in ['train', 'val', 'test']}
    with open(json_path, 'r', encoding='utf-8') as f:
        # ``total`` is only a hard-coded hint for the progress bar.
        for line in tqdm(f, total=509892):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            data = json.loads(line)

            image_path = os.path.join(dataset_dir, data['split'], data['filename'])
            # Close each image right after reading its size; otherwise
            # file handles pile up over the whole dataset.
            with Image.open(image_path) as image:
                image_size = image.size

            data_new = dict()
            data_new['image_path'] = image_path
            data_new['image_size'] = image_size
            data_new['text'] = _build_token_sequence(data['html'])
            data_new['num_tokens'] = len(data_new['text'])
            dicts[data['split']].append(data_new)

    for k, v in dicts.items():
        output_path = os.path.join(output_dir, k + ".json")
        with open(output_path, 'w', encoding='utf-8') as out:
            json.dump(v, out)
def generate_vocab(dataset_dir, output_dir):
    """Collect all annotation tokens and write them to ``vocab.txt``.

    Reads ``<dataset_dir>/PubTabNet_2.0.0.jsonl`` line by line and gathers
    two sets: the structure tokens (``html['structure']['tokens']``) and
    the cell tokens (``cell['tokens']`` for each entry of
    ``html['cells']``). Both set sizes are printed. Cell tokens that also
    occur as structure tokens are then dropped from the cell set.

    The vocabulary file ``<output_dir>/vocab.txt`` contains one token per
    line: the four special tokens ``<PAD>``, ``<START>``, ``<END>``,
    ``<CLS>``, then the sorted structure tokens, then the sorted remaining
    cell tokens.

    Args:
        dataset_dir: Directory holding the JSONL annotation file.
        output_dir: Directory ``vocab.txt`` is written to. It is created
            if it does not exist.
    """
    json_path = os.path.join(dataset_dir, "PubTabNet_2.0.0.jsonl")

    # Create the destination first so a missing output directory fails
    # before the long pass over the dataset rather than after it. An empty
    # ``output_dir`` means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    structure_tokens = set()
    cell_tokens = set()
    with open(json_path, 'r', encoding='utf-8') as f:
        # ``total`` is only a hard-coded hint for the progress bar.
        for line in tqdm(f, total=509892):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            html = json.loads(line)['html']
            structure_tokens.update(html['structure']['tokens'])
            for cell in html['cells']:
                cell_tokens.update(cell['tokens'])

    # Sizes are reported before overlapping tokens are removed.
    print('\nsize of structure_tokens: ', len(structure_tokens))
    print('size of cell_tokens: ', len(cell_tokens))

    # A token that is both a structure and a cell token is listed once,
    # in the structure section.
    cell_tokens -= structure_tokens

    vocab_path = os.path.join(output_dir, 'vocab.txt')
    with open(vocab_path, 'w', encoding='utf-8') as out:
        out.write('<PAD>\n<START>\n<END>\n<CLS>\n')
        out.writelines(token + '\n' for token in sorted(structure_tokens))
        out.writelines(token + '\n' for token in sorted(cell_tokens))
if __name__ == '__main__':
    # Script entry point: build the vocabulary first, then the per-split
    # annotation files. Both paths can be overridden on the command line;
    # with no arguments the original hard-coded locations are used.
    import argparse

    parser = argparse.ArgumentParser(
        description='Generate vocab.txt and per-split JSON annotations from PubTabNet.')
    parser.add_argument(
        '--dataset_dir',
        default='/data/private/datasets/pubtabnet',
        help='directory containing PubTabNet_2.0.0.jsonl and the image folders')
    parser.add_argument(
        '--output_dir',
        default='/data/private/datasets/pubtabnet/annotations',
        help='directory the generated files are written to')
    args = parser.parse_args()

    dataset_dir = args.dataset_dir
    output_dir = args.output_dir
    generate_vocab(dataset_dir, output_dir)
    preprocess_json(dataset_dir, output_dir)