#!/usr/bin/env python
# coding: utf-8
# Part of the JSON-based-chanarchiver by Lawrence Wu, 2012/04/04
# Originally from https://github.com/socketubs/4chandownloader
# Rewritten to save images in a separate folder, download plain HTML, and add modularization, comments, and code cleanup
#
# Initial release Nov. 5, 2009
# v6 release Jan. 20, 2009
# http://cal.freeshell.org
#
# Refactored, updated, and packaged for Python
# by Socketubs (http://socketubs.net/)
# 09-08-12
#
import os
import time
import json
import re
import errno
import requests
from docopt import docopt
doc = """4chan-thread-archiver-orig, uses the 4chan API to download thread images
and/or thumbnails, along with thread HTML, JSON, and a list of referenced
external links. Does not use the py4chan library.

Usage:
  4chan-thread-archiver-orig <url> [--path=<string>] [--delay=<int>] [--nothumbs] [--thumbsonly]
  4chan-thread-archiver-orig -h | --help
  4chan-thread-archiver-orig -v | --version

Options:
  --nothumbs     Don't download thumbnails
  --thumbsonly   Download thumbnails, no images
  --delay=<int>  Delay between thread checks [default: 20]
  -h --help      Show help
  -v --version   Show version
"""
# 4chan URL templates (filled in with printf-style string formatting)
FOURCHAN_API_URL = 'https://api.4chan.org/%s/res/%s.json' # board, thread
FOURCHAN_IMAGES_URL = 'https://images.4chan.org/%s/src/%s' # board, image
FOURCHAN_THUMBS_URL = 'https://thumbs.4chan.org/%s/thumb/%s' # board, thumb
FOURCHAN_BOARDS_URL = 'http://boards.4chan.org/%s/res/%s' # board, thread
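# For illustration, filling in one of these templates with % formatting
# (board 'b' and the thread number below are made-up placeholders):
#   FOURCHAN_API_URL % ('b', '123456789')  ->  'https://api.4chan.org/b/res/123456789.json'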
# folder names for images and thumbnails
IMAGE_DIR_NAME = "img"
THUMB_DIR_NAME = "thumb"
_DEFAULT_FOLDER = "4chan-saved-threads"
_DUMP_COMPLETE_STRING = "Dump complete. To resume dumping the same thread,\nrun this script again."
# Recursively create paths if they don't exist
# replace with `os.makedirs(path, exist_ok=True)` in Python 3
def make_sure_path_exists(path):
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
# Download any file using requests
def download_file(fname, dst_folder, file_url):
# Destination of downloaded file
file_dst = os.path.join(dst_folder, fname)
# If the file doesn't exist, download it
if not os.path.exists(file_dst):
print('%s downloading...' % fname)
i = requests.get(file_url)
if i.status_code == 404:
print(' | Failed, try later (%s)' % file_url)
        else:
            # Write in binary mode so image data is not mangled
            with open(file_dst, 'wb') as f:
                f.write(i.content)
else:
print('%s already downloaded' % fname)
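# A note on the 4chan API post fields this script reads (as used in the
# functions below):
#   'tim'         - server-assigned image name, a UNIX timestamp
#   'ext'         - image file extension, including the dot (e.g. '.jpg')
#   'filename'    - original filename; present only when the post has a file
#   'filedeleted' - set when the attached file has been removed
#   'com'         - comment body as HTML; absent when the post has no text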
# Dump all images within a thread from 4chan
def dump_images(post, dst_dir, board):
dst_images_dir = os.path.join(dst_dir, IMAGE_DIR_NAME)
make_sure_path_exists(dst_images_dir)
image_name = '%s%s' % (post['tim'], post['ext'])
image_url = FOURCHAN_IMAGES_URL % (board, image_name)
download_file(image_name, dst_images_dir, image_url)
# Dump all thumbnails within a thread from 4chan, using `post` from the thread JSON
def dump_thumbs(post, dst_dir, board):
dst_thumbs_dir = os.path.join(dst_dir, THUMB_DIR_NAME)
make_sure_path_exists(dst_thumbs_dir)
thumb_name = '%ss.jpg' % post['tim']
thumb_url = FOURCHAN_THUMBS_URL % (board, thumb_name)
download_file(thumb_name, dst_thumbs_dir, thumb_url)
# In-place file regex replacement, originally scripted by steveha on StackOverflow:
# http://stackoverflow.com/questions/1597649/replace-strings-in-files-by-python
# Notice: `\1` notation could be interpreted by python as `\x01`! Escape it with a second backslash: `\\1`
def file_replace(fname, pat, s_after):
# first, see if the pattern is even in the file.
with open(fname) as f:
if not any(re.search(pat, line) for line in f):
return # pattern does not occur in file so we are done.
# pattern is in the file, so perform replace operation.
    with open(fname) as f:
        out_fname = fname + ".tmp"
        with open(out_fname, "w") as out:
            for line in f:
                out.write(re.sub(pat, s_after, line))
    os.rename(out_fname, fname)
# Dumps thread in raw HTML format to `<thread-id>.html`
def dump_html(dst_dir, board, thread):
    fourchan_images_regex = re.compile(r"http://images\.4chan\.org/\w+/src/")
    fourchan_thumbs_regex = re.compile(r"http://\d+\.thumbs\.4chan\.org/\w+/thumb/")
html_filename = "%s.html" % thread
html_url = FOURCHAN_BOARDS_URL % (board, thread)
download_file(html_filename, dst_dir, html_url)
# Convert all links in HTML dump to use locally downloaded files
html_path = os.path.join(dst_dir, html_filename)
file_replace(html_path, '"//', '"http://')
file_replace(html_path, fourchan_images_regex, IMAGE_DIR_NAME + "/")
file_replace(html_path, fourchan_thumbs_regex, THUMB_DIR_NAME + "/")
    # (future) Download a local copy of all CSS files, and convert HTML links to use them (we'd need BeautifulSoup to get the links)
    #file_replace(html_path, r"http://static.4chan.org/css/(\w+\.\d+)\.css", "css/\\1.css")
# Grab thread JSON from 4chan API and dump it, pretty-printed, to `<thread-id>.json`
def dump_json(dst_dir, board, thread):
    json_filename = "%s.json" % thread
    json_path = os.path.join(dst_dir, json_filename)
    json_thread = requests.get(FOURCHAN_API_URL % (board, thread))
    # requests >= 1.0 exposes Response.json() as a method; the original code
    # used the older property form (json_thread.json)
    with open(json_path, 'w') as f:
        json.dump(json_thread.json(), f, sort_keys=True, indent=2, separators=(',', ': '))
# Get all external links quoted in comments
def list_external_links(json_thread, dst_dir):
# The Ultimate URL Regex
# http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    linkregex = re.compile(r"""((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""", re.DOTALL)
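    # For illustration, this pattern matches forms like "http://example.com/page",
    # "www.example.com", and "example.com/path" (example.com is a placeholder)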
# File to store list of all external links quoted in comments (overwrite upon each loop iteration)
linklist_dst = os.path.join(dst_dir, "external_links.txt")
    with open(linklist_dst, "w") as linklist_file:
        for post in json_thread.json()['posts']:
            # Skip posts without a comment; they have no 'com' key at all
            com = post.get('com', '')
            if not linkregex.search(com):
                continue
            # We need to get rid of all <wbr> tags before parsing
            cleaned_com = re.sub(r'\<wbr\>', '', com)
            linklist = re.findall(linkregex, cleaned_com)
            for item in linklist:
                print("Found link to external site, saving in %s:\n%s\n" % (linklist_dst, item[0]))
                linklist_file.write(item[0])  # re.findall returns tuples
                linklist_file.write('\n')     # subdivide with newlines
# Download images, thumbs, and gather links to external urls
def get_files(dst_dir, board, thread, nothumbs, thumbsonly):
# Grab thread JSON from 4chan API
json_thread = requests.get(FOURCHAN_API_URL % (board, thread))
    for post in json_thread.json()['posts']:
if post.get('filename', False):
# If file is deleted, move on
if post.get('filedeleted', False):
continue
# Download images, if not only downloading thumbnails
if not thumbsonly:
dump_images(post, dst_dir, board)
# Download thumbnails by default, but not if user doesn't want it
if thumbsonly or (not nothumbs):
dump_thumbs(post, dst_dir, board)
# Get all external links quoted in comments
list_external_links(json_thread, dst_dir)
def main(args):
    # Copy data from arguments; expects a thread URL of the form
    # http://boards.4chan.org/<board>/res/<thread>
    thread = args.get('<url>').split('/')[5]
    board = args.get('<url>').split('/')[3]
path = args.get('--path')
nothumbs = args.get('--nothumbs', False)
thumbsonly = args.get('--thumbsonly', False)
delay = args.get('--delay')
    # Set a default path if none is given
    if path is None:
        path = os.path.join(os.getcwd(), _DEFAULT_FOLDER)
# try/except loop to handle Ctrl-C
try:
# Begin file dump loop
        while True:
print(' :: Board: %s' % board)
print(' :: Thread: %s' % thread)
# Create paths if they don't exist
dst_dir = os.path.join(path, board, thread)
make_sure_path_exists(dst_dir)
# Download images, thumbs, and gather links to external urls
get_files(dst_dir, board, thread, nothumbs, thumbsonly)
# Dumps thread in raw HTML format to `<thread-id>.html`
dump_html(dst_dir, board, thread)
# Dumps thread in JSON format to `<thread-id>.json` file, pretty printed
dump_json(dst_dir, board, thread)
# Wait to execute code again
print("Waiting %s seconds before retrying (Type Ctrl-C to stop)\n" % delay)
time.sleep(int(delay))
except KeyboardInterrupt:
print("\n")
print(_DUMP_COMPLETE_STRING)
raise SystemExit(0)
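# A sample invocation from the shell (the thread URL is a made-up placeholder):
#   4chan-thread-archiver-orig http://boards.4chan.org/b/res/123456789 --delay=30 --thumbsonly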
if __name__ == '__main__':
    args = docopt(doc, version='0.3')
main(args)