#!/usr/bin/env python
# coding: utf-8
# Part of the JSON-based-chanarchiver by Lawrence Wu, 2012/04/04
# Originally from https://github.com/socketubs/4chandownloader
# Rewritten to save images in a separate folder, download plain HTML, and add modularization, comments, and code cleanup
#
# Initial release Nov. 5, 2009
# v6 release Jan. 20, 2009
# http://cal.freeshell.org
#
# Refactored, updated, and packaged for Python
# by Socketubs (http://socketubs.net/)
# 09-08-12
#
import os
import time
import json
import re
import errno
import requests
from docopt import docopt
doc = """4chan-thread-archiver-orig, uses the 4chan API to download thread images
and/or thumbnails, along with thread HTML, JSON, and a list of referenced
external links. Does not use the py4chan library.

Usage:
  4chan-thread-archiver-orig <url> [--path=<string>] [--delay=<int>] [--nothumbs] [--thumbsonly]
  4chan-thread-archiver-orig -h | --help
  4chan-thread-archiver-orig -v | --version

Options:
  --nothumbs     Don't download thumbnails
  --thumbsonly   Download thumbnails, no images
  --delay=<int>  Delay between thread checks [default: 20]
  -h --help      Show help
  -v --version   Show version
"""
# 4chan URL templates (filled in with printf-style string formatting)
FOURCHAN_API_URL = 'https://api.4chan.org/%s/res/%s.json' # board, thread
FOURCHAN_IMAGES_URL = 'https://images.4chan.org/%s/src/%s' # board, image
FOURCHAN_THUMBS_URL = 'https://thumbs.4chan.org/%s/thumb/%s' # board, thumb
FOURCHAN_BOARDS_URL = 'http://boards.4chan.org/%s/res/%s' # board, thread
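# For illustration, filling in one of these templates with % formatting
# (board 'b' and the thread number below are made-up placeholders):
#   FOURCHAN_API_URL % ('b', '123456789')  ->  'https://api.4chan.org/b/res/123456789.json'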
# folder names for images and thumbnails
IMAGE_DIR_NAME = "img"
THUMB_DIR_NAME = "thumb"
_DEFAULT_FOLDER = "4chan-saved-threads"
_DUMP_COMPLETE_STRING = "Dump complete. To resume dumping the same thread,\nrun this script again."
# Recursively create paths if they don't exist
# replace with `os.makedirs(path, exist_ok=True)` in Python 3
def make_sure_path_exists(path):
try:
os.makedirs(path)
except OSError as exception:
if exception.errno != errno.EEXIST:
raise
# Download any file using requests
def download_file(fname, dst_folder, file_url):
# Destination of downloaded file
file_dst = os.path.join(dst_folder, fname)
# If the file doesn't exist, download it
if not os.path.exists(file_dst):
print('%s downloading...' % fname)
i = requests.get(file_url)
if i.status_code == 404:
print(' | Failed, try later (%s)' % file_url)
        else:
            # Write in binary mode so image data is not mangled
            with open(file_dst, 'wb') as f:
                f.write(i.content)
else:
print('%s already downloaded' % fname)
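# A note on the 4chan API post fields this script reads (as used in the
# functions below):
#   'tim'         - server-assigned image name, a UNIX timestamp
#   'ext'         - image file extension, including the dot (e.g. '.jpg')
#   'filename'    - original filename; present only when the post has a file
#   'filedeleted' - set when the attached file has been removed
#   'com'         - comment body as HTML; absent when the post has no text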
# Dump all images within a thread from 4chan
def dump_images(post, dst_dir, board):
dst_images_dir = os.path.join(dst_dir, IMAGE_DIR_NAME)
make_sure_path_exists(dst_images_dir)
image_name = '%s%s' % (post['tim'], post['ext'])
image_url = FOURCHAN_IMAGES_URL % (board, image_name)
download_file(image_name, dst_images_dir, image_url)
# Dump all thumbnails within a thread from 4chan, using `post` from the thread JSON
def dump_thumbs(post, dst_dir, board):
dst_thumbs_dir = os.path.join(dst_dir, THUMB_DIR_NAME)
make_sure_path_exists(dst_thumbs_dir)
thumb_name = '%ss.jpg' % post['tim']
thumb_url = FOURCHAN_THUMBS_URL % (board, thumb_name)
download_file(thumb_name, dst_thumbs_dir, thumb_url)
# In-place file regex replacement, originally scripted by steveha on StackOverflow:
# http://stackoverflow.com/questions/1597649/replace-strings-in-files-by-python
# Notice: `\1` notation could be interpreted by python as `\x01`! Escape it with a second backslash: `\\1`
def file_replace(fname, pat, s_after):
# first, see if the pattern is even in the file.
with open(fname) as f:
if not any(re.search(pat, line) for line in f):
return # pattern does not occur in file so we are done.
# pattern is in the file, so perform replace operation.
    with open(fname) as f:
        out_fname = fname + ".tmp"
        with open(out_fname, "w") as out:
            for line in f:
                out.write(re.sub(pat, s_after, line))
    os.rename(out_fname, fname)
# Dumps thread in raw HTML format to `<thread-id>.html`
def dump_html(dst_dir, board, thread):
    fourchan_images_regex = re.compile(r"http://images\.4chan\.org/\w+/src/")
    fourchan_thumbs_regex = re.compile(r"http://\d+\.thumbs\.4chan\.org/\w+/thumb/")
html_filename = "%s.html" % thread
html_url = FOURCHAN_BOARDS_URL % (board, thread)
download_file(html_filename, dst_dir, html_url)
# Convert all links in HTML dump to use locally downloaded files
html_path = os.path.join(dst_dir, html_filename)
file_replace(html_path, '"//', '"http://')
file_replace(html_path, fourchan_images_regex, IMAGE_DIR_NAME + "/")
file_replace(html_path, fourchan_thumbs_regex, THUMB_DIR_NAME + "/")
    # (future) Download a local copy of all CSS files, and convert HTML links to use them (we'd need BeautifulSoup to get the links)
    #file_replace(html_path, r"http://static.4chan.org/css/(\w+\.\d+)\.css", "css/\\1.css")
# Grab thread JSON from 4chan API and dump it, pretty-printed, to `<thread-id>.json`
def dump_json(dst_dir, board, thread):
    json_filename = "%s.json" % thread
    json_path = os.path.join(dst_dir, json_filename)
    json_thread = requests.get(FOURCHAN_API_URL % (board, thread))
    # requests >= 1.0 exposes Response.json() as a method; the original code
    # used the older property form (json_thread.json)
    with open(json_path, 'w') as f:
        json.dump(json_thread.json(), f, sort_keys=True, indent=2, separators=(',', ': '))
# Get all external links quoted in comments
def list_external_links(json_thread, dst_dir):
# The Ultimate URL Regex
# http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    linkregex = re.compile(r"""((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""", re.DOTALL)
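    # For illustration, this pattern matches forms like "http://example.com/page",
    # "www.example.com", and "example.com/path" (example.com is a placeholder)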
# File to store list of all external links quoted in comments (overwrite upon each loop iteration)
linklist_dst = os.path.join(dst_dir, "external_links.txt")
    with open(linklist_dst, "w") as linklist_file:
        for post in json_thread.json()['posts']:
            # Skip posts without a comment; they have no 'com' key at all
            com = post.get('com', '')
            if not linkregex.search(com):
                continue
            # We need to get rid of all <wbr> tags before parsing
            cleaned_com = re.sub(r'\<wbr\>', '', com)
            linklist = re.findall(linkregex, cleaned_com)
            for item in linklist:
                print("Found link to external site, saving in %s:\n%s\n" % (linklist_dst, item[0]))
                linklist_file.write(item[0])  # re.findall returns tuples
                linklist_file.write('\n')     # subdivide with newlines
# Download images, thumbs, and gather links to external urls
def get_files(dst_dir, board, thread, nothumbs, thumbsonly):
# Grab thread JSON from 4chan API
json_thread = requests.get(FOURCHAN_API_URL % (board, thread))
    for post in json_thread.json()['posts']:
if post.get('filename', False):
# If file is deleted, move on
if post.get('filedeleted', False):
continue
# Download images, if not only downloading thumbnails
if not thumbsonly:
dump_images(post, dst_dir, board)
# Download thumbnails by default, but not if user doesn't want it
if thumbsonly or (not nothumbs):
dump_thumbs(post, dst_dir, board)
# Get all external links quoted in comments
list_external_links(json_thread, dst_dir)
def main(args):
    # Copy data from arguments; expects a thread URL of the form
    # http://boards.4chan.org/<board>/res/<thread>
    thread = args.get('<url>').split('/')[5]
    board = args.get('<url>').split('/')[3]
path = args.get('--path')
nothumbs = args.get('--nothumbs', False)
thumbsonly = args.get('--thumbsonly', False)
delay = args.get('--delay')
    # Set a default path if none is given
    if path is None:
        path = os.path.join(os.getcwd(), _DEFAULT_FOLDER)
# try/except loop to handle Ctrl-C
try:
# Begin file dump loop
        while True:
print(' :: Board: %s' % board)
print(' :: Thread: %s' % thread)
# Create paths if they don't exist
dst_dir = os.path.join(path, board, thread)
make_sure_path_exists(dst_dir)
# Download images, thumbs, and gather links to external urls
get_files(dst_dir, board, thread, nothumbs, thumbsonly)
# Dumps thread in raw HTML format to `<thread-id>.html`
dump_html(dst_dir, board, thread)
# Dumps thread in JSON format to `<thread-id>.json` file, pretty printed
dump_json(dst_dir, board, thread)
# Wait to execute code again
print("Waiting %s seconds before retrying (Type Ctrl-C to stop)\n" % delay)
time.sleep(int(delay))
except KeyboardInterrupt:
print("\n")
print(_DUMP_COMPLETE_STRING)
raise SystemExit(0)
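# A sample invocation from the shell (the thread URL is a made-up placeholder):
#   4chan-thread-archiver-orig http://boards.4chan.org/b/res/123456789 --delay=30 --thumbsonly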
if __name__ == '__main__':
    args = docopt(doc, version='0.3')
main(args)