-
Notifications
You must be signed in to change notification settings - Fork 1
/
readera_pdf_highlighter.py
269 lines (239 loc) · 10.3 KB
/
readera_pdf_highlighter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
from dataclasses import dataclass
from functools import cache
import json
import os
from pathlib import Path
import re
import subprocess
import sys
from zipfile import ZipFile
import fitz
@dataclass
class BookInfo:
    """Title, file name, and collected citations of one book from a ReadEra backup."""
    # Title of the book.
    title: str
    # File name of the book; get_all_citations stores None here when the
    # backup lists no link for the document.
    filename: str
    # Set of (note_body, note_page + note_index, note_extra) tuples, as built
    # by get_all_citations.
    citations: set
def get_all_citations(readera_backup_filename):
    """
    Load every book and its citations from a ReadEra backup archive.

    Reads `library.json` from the zip file and returns a dict mapping each
    document URI to a BookInfo. Every citation is stored as the tuple
    (note_body, note_page + note_index, note_extra); note_extra is None when
    the citation has no such key.
    """
    print(f"Reading {readera_backup_filename}")
    with ZipFile(readera_backup_filename) as archive, archive.open('library.json') as library_file:
        library = json.load(library_file)

    books_by_uri = {}
    for entry in library['docs']:
        uri = entry['uri']
        metadata = entry['data']
        # Fall back to the file-name-derived title when no real title is stored.
        if 'doc_title' in metadata:
            title = metadata['doc_title']
        else:
            title = metadata['doc_file_name_title']

        links = entry['links']
        assert len(links) <= 1, links
        try:
            filename = links[0]['file_name']
        except IndexError:
            # No link entry: the book file name is unknown.
            filename = None

        info = BookInfo(title, filename, set())
        books_by_uri[uri] = info
        for note in entry['citations']:
            info.citations.add((
                note['note_body'],
                note['note_page'] + note['note_index'],
                note.get('note_extra'),
            ))
    return books_by_uri
def verify_citations_complete():
    """
    Check all ReadEra backup files in the current dir. Collect all citations.
    For each backup file, show whether it has all citations or which are missing.

    Every backup is reported in full: a title that is missing from a backup is
    printed and the check continues with the next title.
    """
    # Local import: the file's import block only brings in `dataclass`.
    from dataclasses import replace

    all_uris_to_book_infos = {}
    readera_backups_to_map = {}
    for readera_backup_filename in sorted(Path('.').glob('ReadEra*bak'), reverse=True):
        uris_to_book_infos = get_all_citations(readera_backup_filename)
        readera_backups_to_map[readera_backup_filename] = uris_to_book_infos
        for uri, book_info in uris_to_book_infos.items():
            try:
                all_uris_to_book_infos[uri].citations.update(book_info.citations)
            except KeyError:
                # Store a copy with its own citation set. Sharing the backup's
                # BookInfo would let the update above modify that backup's data,
                # making it always look complete.
                all_uris_to_book_infos[uri] = replace(book_info, citations=set(book_info.citations))

    for readera_backup_filename, uris_to_book_infos in readera_backups_to_map.items():
        print(f"Checking {readera_backup_filename}")
        if uris_to_book_infos == all_uris_to_book_infos:
            print(f" Contains all citations")
            continue
        for all_uri, all_book_info in all_uris_to_book_infos.items():
            try:
                book_citations = uris_to_book_infos[all_uri].citations
            except KeyError:
                print(f" Missing title {all_book_info.title}")
                continue
            # Sort by the position key so the report order is stable between runs.
            missing = sorted(all_book_info.citations - book_citations, key=lambda item: item[1])
            for citation in missing:
                print(f" Missing citation: {all_book_info.title} / {citation}")
def show_titles(readera_backup_filename):
    """Print title and citation count of every book that has citations, ordered by title."""
    books = get_all_citations(readera_backup_filename).values()
    for info in sorted(books, key=lambda info: info.title):
        citation_count = len(info.citations)
        if citation_count == 0:
            continue
        print(f"{info.title}, Citations: {citation_count}")
def get_citations_by_file(readera_backup_filename, book_filename):
    """Return the first BookInfo whose file name equals `book_filename`, or None if there is none."""
    candidates = get_all_citations(readera_backup_filename).values()
    return next((info for info in candidates if info.filename == book_filename), None)
def show_citations_for_book(readera_backup_filename, book):
    """
    Print the citations of one book, ordered by their position key.

    `book` is looked up as a book file name when it names an existing file.
    Otherwise it is matched case-insensitively as a substring of the book
    titles, and the first matching book is used. Exits with status 1 when no
    book matches.
    """
    if not Path(book).is_file():
        print(f"Searching for title '{book}'")
        needle = book.lower()
        all_books = get_all_citations(readera_backup_filename).values()
        book_info = next((info for info in all_books if needle in info.title.lower()), None)
    else:
        print(f"Searching for file '{book}'")
        book_info = get_citations_by_file(readera_backup_filename, book)

    if not book_info:
        print(f"Book not found")
        sys.exit(1)

    print(f"Book: {book_info.title}")
    ordered = sorted(book_info.citations, key=lambda entry: entry[1])
    for entry in ordered:
        print(f"- {entry[0]}")
        print()
def tokenize(text):
    """
    Return the searchable characters of `text` as a list, in order.

    Only ASCII letters, digits, and the punctuation in the pattern below are
    kept; whitespace and every other character are dropped.
    """
    token_pattern = re.compile(r'''[A-Za-z0-9!?.,;:'"]''')
    return token_pattern.findall(text)
def find_in(text: str, span: str) -> "tuple[int | None, int | None, str | None]":
    """
    Locate `span` in `text`, allowing the match to run off the end of `text`.

    Returns (start_index, end_index, remainder) with inclusive indexes into `text`:
    - complete match: indexes of the first occurrence; remainder is None.
    - partial match: `text` ends with a proper prefix of `span`; the indexes
      cover that prefix and remainder is the part of `span` not matched yet.
    - no match, or an empty `span`: (None, None, span).
    """
    if not span:
        # An empty span would "match" at index 0 with the meaningless inclusive
        # range (0, -1); report that there is nothing to locate instead.
        return None, None, span

    match_index = text.find(span)
    if match_index != -1:
        return match_index, match_index + len(span) - 1, None

    # Try ever shorter proper prefixes of span against the end of text, longest first.
    for prefix_length in range(len(span) - 1, 0, -1):
        if text.endswith(span[:prefix_length]):
            return len(text) - prefix_length, len(text) - 1, span[prefix_length:]
    return None, None, span
def add_citations_to_pdf(pdf_filename, citations):
    """
    Highlight the given citations in a PDF and save it with doc.saveIncr().

    `citations` is an iterable of (text, position_key, note) tuples. They are
    processed in order of position_key, and each line of a citation text is
    searched separately. The search starts at the page where the previous
    citation part was found and moves forward. A part may continue from the
    end of one page onto the start of the next. When `note` is not None, a
    text annotation is added next to each highlight.

    Page text and citation text are both reduced with tokenize() before
    comparing, so whitespace and characters outside the token set are ignored.

    Returns True when every citation part was found, False otherwise. With
    `--debug` in sys.argv and at least one part not found, the extracted page
    texts are written to `book.txt`.
    """
    doc = fitz.open(pdf_filename)
    pages = [(page, page.get_textpage(flags=fitz.TEXT_MEDIABOX_CLIP)) for page in doc.pages()]
    page_index = page_index_last_found = 0
    num_found = num_not_found = 0

    @cache
    def get_textpage_words(page_index) -> tuple[str, list[tuple[int, int, int, int]]]:
        # Returns the tokenized page text plus, for every character of that text,
        # the first four fields of the word entry it came from (used below as the
        # word's corner coordinates).
        words = []
        coords = []
        for word_info in pages[page_index][1].extractWORDS():
            word = word_info[4]
            word_tokenized = tokenize(word)
            words += word_tokenized
            coords += [word_info[:4]] * len(word_tokenized)
        return ''.join(words), coords

    for citation, _, note in sorted(citations, key=lambda item: item[1]):
        citation_parts = citation.split('\n')
        for citation_part in citation_parts:
            citation_string = ''.join(tokenize(citation_part))
            if not citation_string:
                # Nothing searchable is left. Searching for an empty string would
                # "match" at the start of any page and highlight the whole page.
                # Blank lines are skipped silently; other text is reported.
                if citation_part.strip():
                    print(f"Citation not found: {citation_part=}")
                    num_not_found += 1
                continue
            remainder = citation_string
            highlights = []
            while page_index < len(pages):
                page = pages[page_index][0]
                textpage_string, char_coords = get_textpage_words(page_index)
                match_start, match_end, remainder = find_in(textpage_string, remainder)
                if match_start is None:
                    had_partial_match = len(highlights) > 0
                    remainder = citation_string
                    highlights = []
                    if had_partial_match:
                        # The partial match on the previous page was not continued
                        # here. This page has only been searched for the remainder,
                        # so search it again for the full citation before moving on.
                        continue
                else:
                    if len(highlights) > 0 and match_start > 0:
                        # This is a partial match which had a partial match on the previous page. But it
                        # does not start at the beginning of the page, so it is not a true continuation.
                        # - Ignore match from previous page
                        # - Check current page for the full citation
                        remainder = citation_string
                        highlights = []
                        continue
                    else:
                        highlights.append((page, fitz.Point(char_coords[match_start][:2]),
                                           fitz.Point(char_coords[match_end][2:])))
                if remainder:
                    # Not matched yet, or only partially: go on with the next page.
                    page_index += 1
                else:
                    for page, start_coords, stop_coords in highlights:
                        page.add_highlight_annot(start=start_coords, stop=stop_coords)
                        if note is not None:
                            page.add_text_annot(start_coords - fitz.Point(8, 16), note)
                    page_index_last_found = page_index
                    num_found += 1
                    break
            else:
                # Ran past the last page without a complete match.
                print(f"Citation not found: {citation_part=}")
                num_not_found += 1
                page_index = page_index_last_found

    doc.saveIncr()
    print(f"Citations found: {num_found}")
    if num_not_found > 0:
        print(f"Citations not found: {num_not_found}")
        if '--debug' in sys.argv:
            with open('book.txt', 'w') as f:
                for _, textpage in pages:
                    print(textpage.extractText().encode(), file=f)
                    print(file=f)
        return False
    return True
def book_to_pdf(book_filename, pdf_filename):
    """
    Convert `book_filename` to `pdf_filename` by running the external `ebook-convert` command.

    Raises subprocess.CalledProcessError when the command exits with a non-zero status.
    """
    # Only really tested with epub
    # Possibly add --embed-all-fonts --subset-embedded-font
    command = ['ebook-convert', book_filename, pdf_filename]
    subprocess.check_call(command)
def make_highlighted_pdf(readera_backup_filename, book_filename):
    """
    Convert a book to PDF and add the highlights stored for it in the backup.

    The PDF is written next to the book, using the same name with a `.pdf`
    suffix. Exits with status 1 when the book is not in the backup or when
    add_citations_to_pdf reports that not every citation was found.
    """
    book_info = get_citations_by_file(readera_backup_filename, book_filename)
    if not book_info:
        print(f"Book {book_filename} not found")
        sys.exit(1)

    pdf_filename = Path(book_filename).with_suffix('.pdf')
    book_to_pdf(book_filename, pdf_filename)
    all_found = add_citations_to_pdf(pdf_filename, book_info.citations)
    print(f"Produced highlighted PDF file: {pdf_filename}")

    if not all_found:
        print('ERROR')
        sys.exit(1)
    print('OK')
def help():
    """Print the command-line usage summary to standard output."""
    usage_lines = (
        f"Usage: {sys.argv[0]} <cmd> [args]",
        "Commands:",
        " verify-citations-complete",
        " Checks all backup files in the current directory and shows which have all citations",
        " show-titles",
        " Show all books with citations",
        " show-citations <file or book title substring>",
        " make-highlighted-pdf <book_filename>",
        " Convert book to PDF and add highlights from ReadEra backup",
    )
    for line in usage_lines:
        print(line)
# Command-line entry point: dispatch on the first argument (see help()).
sys.stdout.reconfigure(encoding='utf-8')

try:
    cmd = sys.argv[1]
except IndexError:
    help()
    sys.exit()

if cmd == 'verify-citations-complete':
    verify_citations_complete()
elif cmd not in ('show-titles', 'show-citations', 'make-highlighted-pdf'):
    # Reject unknown commands before looking for a backup file.
    help()
    sys.exit(1)
elif cmd != 'show-titles' and len(sys.argv) < 3:
    # show-citations and make-highlighted-pdf need a book argument; show the
    # usage instead of failing with an IndexError traceback.
    help()
    sys.exit(1)
else:
    # The backup comes from $READERA_BACKUP; otherwise the most recently
    # modified ReadEra*.bak file in the current directory is used.
    readera_backup_filename = os.getenv('READERA_BACKUP')
    if readera_backup_filename is None:
        readera_backup_filenames = tuple(Path('.').glob('ReadEra*.bak'))
        if not readera_backup_filenames:
            print('Did not find any ReadEra backup in current directory')
            sys.exit(1)
        readera_backup_filename = max(readera_backup_filenames, key=lambda f: f.stat().st_mtime)
    print(f"Using {readera_backup_filename}")

    if cmd == 'show-titles':
        show_titles(readera_backup_filename)
    elif cmd == 'show-citations':
        book_title = sys.argv[2]
        show_citations_for_book(readera_backup_filename, book_title)
    else:
        book_filename = sys.argv[2]
        make_highlighted_pdf(readera_backup_filename, book_filename)