-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
73 lines (53 loc) · 2.21 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging, json, os
from datetime import datetime
def getJSON(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path of the JSON file to read (anything ``open`` accepts).

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when json.load raises;
    # an explicit close() after the load would be skipped on a parse error.
    with open(file) as f:
        return json.load(f)
def numberOfDocumentsProcessedByThisThread(file):
    """Return the first integer field of the last line of ``file``.

    The last line is split on single spaces and its first token is parsed
    as an int.

    Args:
        file: Path of the file whose last line is inspected.

    Returns:
        The parsed integer, or 0 when the file cannot be read or its last
        line does not start with an integer.
    """
    try:
        return int(getLastLineOfFile(file).split(' ')[0])
    # Only the expected failures are treated as "nothing recorded yet":
    # unreadable/missing file (OSError) or an empty/malformed line
    # (ValueError, IndexError). A bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (OSError, ValueError, IndexError):
        return 0
def lastAbsoluteDocumentNumberProcessedByThisThread(file):
    """Return the second integer field of the last line of ``file``.

    The last line is split on single spaces and its second token is parsed
    as an int.

    Args:
        file: Path of the file whose last line is inspected.

    Returns:
        The parsed integer, or 0 when the file cannot be read or its last
        line has no integer in second position.
    """
    try:
        return int(getLastLineOfFile(file).split(' ')[1])
    # Only the expected failures are treated as "nothing recorded yet":
    # unreadable/missing file (OSError), a line with fewer than two tokens
    # (IndexError) or a non-numeric token (ValueError). A bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    except (OSError, ValueError, IndexError):
        return 0
def getLastLineOfFile(file):
    """Return the last line of a file without reading the whole file.

    The file is opened in binary mode and scanned backwards from the end
    until the newline that precedes the final line is found.

    Args:
        file: Path of the file to inspect.

    Returns:
        The final line decoded to ``str`` (its trailing newline included
        when the file has one), or ``''`` for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file, 'rb') as f:
        try:
            # Start one byte before the final byte so a trailing newline is
            # not mistaken for the separator in front of the last line.
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                # read(1) advanced by one byte; step back two to move left.
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            # Seeking before offset 0 fails: the file is empty or holds a
            # single line, so that line starts at the beginning of the file.
            f.seek(0)
        return f.readline().decode()
def documentShouldBeProcessedByThisThread(thread_number, document_number, number_of_threads):
    """Tell whether a document is assigned to the given thread.

    Documents are dealt out round-robin: with 3 threads, thread 1 gets
    documents 1, 4, 7, ..., thread 2 gets 2, 5, 8, ... and thread 3 gets
    3, 6, 9, ....

    NOTE(review): assumes thread and document numbers are 1-based - confirm
    against the caller. The wrap-around for ``document_number`` greater than
    ``number_of_threads`` differs from the earlier ``document_number %
    number_of_threads + 1`` rule, so progress files written under that rule
    do not line up with this assignment.

    Args:
        thread_number: Number of the thread asking.
        document_number: Absolute number of the document.
        number_of_threads: Total number of worker threads.

    Returns:
        True if ``thread_number`` owns ``document_number``, else False.
    """
    if document_number <= number_of_threads:
        # The first round maps document k straight to thread k.
        return thread_number == document_number
    # Shift to 0-based before the modulo so that document N+1 wraps back to
    # thread 1; without the shift it would land on thread 2 and the
    # per-thread counts would be uneven in the wrong places.
    return thread_number == (document_number - 1) % number_of_threads + 1
def totalNumberOfDocumentsThisThreadMustProcess(thread_number, total_documents, number_of_threads):
    """Return how many documents fall to one thread.

    Every thread receives ``total_documents // number_of_threads``
    documents; the remainder is handed out one apiece to the threads whose
    number does not exceed that remainder.

    Args:
        thread_number: Number of the thread being asked about.
        total_documents: Total count of documents to distribute.
        number_of_threads: Total number of worker threads.

    Returns:
        The number of documents assigned to ``thread_number``.
    """
    base_share, remainder = divmod(total_documents, number_of_threads)
    # Low-numbered threads absorb the remainder, one extra document each.
    return base_share + 1 if thread_number <= remainder else base_share
def getTimestamp():
    """Return the current local time as ``[MM/DD/YYYY HH:MM:SS]``.

    Returns:
        The bracketed timestamp string, built from ``datetime.now()``.
    """
    stamp = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    return f'[{stamp}]'
def logProgress(documents_processed_by_this_thread, total_documents_for_this_thread, thread_number, document_number, progress_file):
    """Print a progress line and append a record to the progress file.

    The printed line carries a timestamp, the thread number, the
    percentage done (rounded to 5 decimals), the processed/total counts
    and the document number. The appended record is
    ``"<documents_processed_by_this_thread> <document_number>"`` followed
    by a newline.

    Args:
        documents_processed_by_this_thread: Documents handled so far.
        total_documents_for_this_thread: Documents this thread must handle;
            used as the divisor for the percentage.
        thread_number: Number of the reporting thread.
        document_number: Number of the document just processed.
        progress_file: Path of the file the record is appended to.
    """
    fraction_done = documents_processed_by_this_thread / total_documents_for_this_thread
    percent_done = round(fraction_done * 100, 5)

    status_line = f'{getTimestamp()} [Thread-{thread_number}] {percent_done}% {documents_processed_by_this_thread}/{total_documents_for_this_thread} Document {document_number}'
    print(status_line)

    # Append mode keeps earlier records; the newest one is always last.
    with open(progress_file, 'a') as progress_log:
        progress_log.write(f'{documents_processed_by_this_thread} {document_number}\n')
def logError(logger, e, thread_number):
    """Report an error through the logger and on standard output.

    The message is a timestamp, the thread number and the string form of
    ``e``; it is logged at ``logging.ERROR`` level and then printed.

    Args:
        logger: Object whose ``log(level, message)`` method receives the
            message.
        e: The error (or any value) to report; formatted with ``str``.
        thread_number: Number of the thread reporting the error.
    """
    formatted = f'{getTimestamp()} [Thread-{thread_number}] {e}'
    # Same text goes to both sinks: the log first, then the console.
    logger.log(logging.ERROR, formatted)
    print(formatted)