helper.py
import os
import tabula
import PyPDF2
import datetime
from time import mktime, strptime
from requests import post
import requests
# ----------------------------- GENERAL HELPER FUNCTIONS --------------------------------------------------------------
# Create a json string for given path recursively
# Source: https://stackoverflow.com/questions/25226208/represent-directory-tree-as-json
def path_dict(path):
    p = path
    name = os.path.basename(p)
    # SMALL CONSTRUCT TO SHORTEN HIERARCHY
    # Check if there is only one entry inside the directory
    while os.path.isdir(p) and len(os.listdir(p)) == 1:
        # If that's the case, append it to the name and go one level deeper
        sole_dir = os.listdir(p)[0]
        name = name + "/" + sole_dir
        p = os.path.join(p, sole_dir)
    # ORIGINAL CONSTRUCT
    d = {'name': name}
    if os.path.isdir(p):
        d['type'] = "directory"
        d['children'] = [path_dict(os.path.join(p, x)) for x in os.listdir(p)]
        # Really bad, but quick way out; high complexity for a large hierarchy
        d['npdf'] = path_number_of_files(p)
    else:
        d['type'] = "file"
        if ".pdf" in p:
            d['npdf'] = 1
            # Strip the first 5 characters of the path to build the URL
            d['url'] = p[5:]
        else:
            d['npdf'] = 0
    return d
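# Illustrative output shape only (the directory layout and the "data/" prefix are hypothetical,
# not taken from this project):
#   path_dict("data/reports") could return
#   {'name': 'reports', 'type': 'directory',
#    'children': [{'name': 'a.pdf', 'type': 'file', 'npdf': 1, 'url': 'reports/a.pdf'},
#                 {'name': 'notes.txt', 'type': 'file', 'npdf': 0}],
#    'npdf': 1}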
# TODO rewrite in a bottom-up / dynamic-programming way to make it faster
# Counts the number of PDF files under the given path
def path_number_of_files(path):
    n_files = sum([len(list(filter(lambda f: ".pdf" in f, files))) for r, d, files in os.walk(path)])
    return n_files
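# One possible way to address the TODO above (a sketch only, not the approach used in this
# project; the function name is hypothetical): walk the tree once with topdown=False so each
# directory's PDF count can reuse the already-computed totals of its children, instead of
# re-walking every subtree from path_dict.
def path_pdf_counts(root):
    counts = {}
    for dirpath, dirnames, filenames in os.walk(root, topdown=False):
        # PDFs directly in this directory
        total = len([f for f in filenames if ".pdf" in f])
        # Add the totals of subdirectories, which were visited first thanks to topdown=False
        total += sum(counts.get(os.path.join(dirpath, d), 0) for d in dirnames)
        counts[dirpath] = total
    return counts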
# Returns the size of the directory in bytes
def dir_size(path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            try:
                total_size += os.path.getsize(fp)
            except OSError:
                # A temporary file was probably removed while crawling is in progress; ignore its size
                pass
    return total_size
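# Example usage (the path is hypothetical): dir_size("data") returns the total size in bytes,
# so dir_size("data") / (1024 ** 2) gives the size in MiB.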
# PDF creation date converter (from the PDF date format to datetime)
# Sadly doesn't work for all encountered PDFs
# https://stackoverflow.com/questions/16503075/convert-creationtime-of-pdf-to-a-readable-format-in-python
def pdf_date_format_to_datetime(date_str):
    # Strip the leading "D:" prefix and the trailing timezone offset
    datestring = date_str[2:-7]
    try:
        ts = strptime(datestring, "%Y%m%d%H%M%S")
        dt = datetime.datetime.fromtimestamp(mktime(ts))
    except ValueError:
        print("Unable to convert time for string: " + date_str)
        dt = datetime.datetime.strptime("01/01/1970", '%m/%d/%Y')
    return dt
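# Example: PDF creation dates typically look like "D:20190312094500+01'00'"; slicing [2:-7]
# keeps "20190312094500", which parses with "%Y%m%d%H%M%S":
#   pdf_date_format_to_datetime("D:20190312094500+01'00'")  # -> datetime(2019, 3, 12, 9, 45)
# Dates written without the timezone suffix are one reason the conversion can fail and
# fall back to the 1970-01-01 default.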
# Checks if a URL exists and returns its HTTP status code
def url_status(url):
    try:
        r = requests.head(url)
        return r.status_code
    except requests.RequestException:
        # Connection error, invalid URL, timeout, etc.
        return -1
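# Example usage (the URL is hypothetical): url_status("https://example.com/report.pdf")
# returns the status code of a HEAD request (e.g. 200 or 404), or -1 if the request itself failed.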
# ----------------------------- DEPRECATED HELPER FUNCTIONS -----------------------------------------------------------
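# NOTE: SMALL_TABLE_LIMIT and MEDIUM_TABLE_LIMIT are used below but not defined in this
# snippet; they are assumed to be module-level row-count thresholds. The values here are
# placeholders only, chosen to make the deprecated function runnable as shown.
SMALL_TABLE_LIMIT = 10
MEDIUM_TABLE_LIMIT = 50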
# Uses Tabula to detect and extract tables from the PDFs
# INPUT: path containing PDFs, the maximum number of PDFs to analyse, and the URL to post progress to
# This function is now split into two Celery background tasks
def pdf_stats(path, n_pdf, post_url):
    stats = {}
    # Keep track of successful and unsuccessful files
    n_success = 0
    n_error = 0
    for dir_, _, files in os.walk(path):
        for fileName in files:
            if ".pdf" in fileName:
                # Check if enough PDFs have already been processed
                if n_success + n_error >= n_pdf:
                    return stats, n_error, n_success
                print("Number errors: %d" % (n_error,))
                print("Number successes: %d" % (n_success,))
                print(stats)
                try:
                    # Get file location
                    rel_file = os.path.join(dir_, fileName)
                    # STEP 0: set all counters to 0
                    n_table_pages = 0
                    n_table_rows = 0
                    table_sizes = {'small': 0, 'medium': 0, 'large': 0}
                    # STEP 1: count total number of pages
                    pdf_file = PyPDF2.PdfFileReader(open(rel_file, mode='rb'))
                    n_pages = pdf_file.getNumPages()
                    # STEP 2: run Tabula to extract all tables, one dataframe per table
                    df_array = tabula.read_pdf(rel_file, pages="all", multiple_tables=True)
                    # STEP 3: count the number of rows in each dataframe
                    for df in df_array:
                        rows = df.shape[0]
                        n_table_rows += rows
                        n_table_pages += 1
                        # Add table size stats
                        if rows <= SMALL_TABLE_LIMIT:
                            table_sizes['small'] += 1
                        elif rows <= MEDIUM_TABLE_LIMIT:
                            table_sizes['medium'] += 1
                        else:
                            table_sizes['large'] += 1
                    # STEP 4: save stats
                    creation_date = pdf_file.getDocumentInfo()['/CreationDate']
                    stats[fileName] = {'n_pages': n_pages, 'n_tables_pages': n_table_pages,
                                       'n_table_rows': n_table_rows, 'creation_date': creation_date,
                                       'table_sizes': table_sizes, 'url': rel_file}
                    print("Tabula Conversion done for %s" % (fileName,))
                    n_success += 1
                    # STEP 5: send a progress message to the notification endpoint
                    post(post_url, json={'event': 'my_response',
                                         'data': {'data': 'I successfully performed table detection',
                                                  'success': n_success, 'count': 1}})
                except Exception:
                    print("ERROR: Tabula Conversion failed for %s" % (fileName,))
                    n_error += 1
    return stats, n_error, n_success
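# Example call (arguments are hypothetical; as noted above, this function has been superseded
# by two Celery background tasks):
#   stats, n_error, n_success = pdf_stats("data", n_pdf=5, post_url="http://localhost:5000/event")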