Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dependency updates & cookie handling #20

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 66 additions & 50 deletions gbd.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import urllib
import traceback
import regex as re
import requests
from time import sleep
import tempfile
from seleniumwire import webdriver
from progressbar import progressbar as bar
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

print("""
Google Books Downloader by @aprikyan, 2020.
Expand All @@ -24,13 +26,13 @@ def get_book_url():
(e.g. https://books.google.com/books?id=buc0AAAAMAAJ&printsec=frontcover&sa=X&ved=2ahUKEwj-y8T4r5vrAhWKLewKHaIQBnYQ6AEwAXoECAQQAg#v=onepage&f=false)

Your input: """)

if re.findall(r"id=[A-Za-z0-9]+", url):
id_part = re.findall(r"id=[A-Za-z0-9]+", url)[-1]
else:
print("Invalid input. Please try again.")
get_book_url()

return (f"https://books.google.com/books?{id_part}&pg=1&hl=en#v=onepage&q&f=false",
f"https://books.google.com/books?{id_part}&pg=1&hl=en&f=false&output=embed&source=gbs_embed")

Expand All @@ -41,10 +43,10 @@ def get_book_data(url):
"""
driver.get(url)
driver.refresh()
sleep(3)
title = driver.find_element_by_class_name("gb-volume-title").text
author = driver.find_element_by_class_name("addmd").text
sleep(2)
title = driver.find_element(By.CLASS_NAME, "gb-volume-title").text
author = driver.find_element(By.CLASS_NAME, "addmd").text

return f"{title} (b{author[1:]})"

def capture_requests(url):
Expand All @@ -54,15 +56,15 @@ def capture_requests(url):
"""
driver.get(url)
driver.refresh()
sleep(5)
sleep(2)
checkpoint = ""
while checkpoint != driver.find_element_by_class_name("pageImageDisplay"):
checkpoint = driver.find_element_by_class_name("pageImageDisplay")

while checkpoint != driver.find_element(By.CLASS_NAME, "pageImageDisplay"):
checkpoint = driver.find_element(By.CLASS_NAME, "pageImageDisplay")
checkpoint.click()
# scrolling ~25 pages
for i in range(25):
html = driver.find_element_by_tag_name("body")
html = driver.find_element(By.TAG_NAME, "body")
html.click()
html.send_keys(Keys.SPACE)
sleep(2)
Expand All @@ -86,12 +88,12 @@ def save_backup():
save = input("""
Would you like to save a backup file (type Yes or No)?
Your input: """).upper()
if save == "YES":

if save == "YES" or save == "Y":
with open(f"Backup of {book_data}.txt", "w") as f:
f.write(str(all_pages))
print(f"Successfully backed up the book in \"Backup of {book_data}.txt\"!")

elif save != "NO":
print("Invalid input. Please try again.")
save_backup()
Expand All @@ -104,7 +106,7 @@ def select_pages(user_input, all_pages):
"""
ranges = user_input.replace(" ", "").split(",")
page_numbers = []

if "all" in ranges:
return all_pages
while "odd" in ranges:
Expand All @@ -121,7 +123,7 @@ def select_pages(user_input, all_pages):
page_numbers.append((int(segment), all_pages[int(segment)]))

return dict(set(page_numbers))

def get_cookie(url):
"""
Driver needs to behave like a real
Expand All @@ -131,11 +133,13 @@ def get_cookie(url):
cookies = []
driver.get(url)
driver.refresh()

for request in driver.requests:
if request.headers:
if "Cookie" in request.headers.keys():
cookies.append(request.headers["Cookie"])
if len(cookies) == 0:
cookies = driver.get_cookies()

return cookies[0]

Expand All @@ -145,24 +149,27 @@ def download_imgs(pages, cookie, directory):
the cookie to use and the directory
to save to, and then does the magic.
"""
proxy = urllib.request.ProxyHandler({})
opener = urllib.request.build_opener(proxy)
opener.addheaders = [("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30"),
("cookie", cookie)]
urllib.request.install_opener(opener)


headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
"cookie": f"NID={cookie['value']}", }

for number, url in bar(pages.items()):
urllib.request.urlretrieve(url, os.path.join(directory, f"page{number}.png"))
response = requests.get(url, headers=headers, stream=True)
response.raise_for_status() # Check for HTTP request errors

with open(os.path.join(directory, f"page{number}.png"), 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)

def step1():
global book_data, all_pages

from_url = input("""
Would you like to download a book from URL? Type No if you have a backup, otherwise type Yes.

Your input: """).upper()
if from_url == "YES":

if from_url == "YES" or from_url == "Y":
data_url, pages_url = get_book_url()
book_data = get_book_data(data_url)
print(f"\nWe will now process the pages of \"{book_data}\" one by one. Sit back and relax, as this may take some time, depending on the number of its pages.\n")
Expand All @@ -171,85 +178,94 @@ def step1():
print("""Now that most of the job is done (yahoo!), it is highly recommended to backup the current progress we have made, so as not to lose it if an error happens to be thrown afterward.
Also, if you would like to download another segment of this book later, the backup will be used then to save your precious time.""")
save_backup()

elif from_url == "NO":
backup = input("""
Enter the location of the backup file.
(e.g. C:/Users/User/Downloads/Backup_of_booktitle.txt)

Your input: """)
Your input: """)

try:
book_data = os.path.basename(backup)[10:-4]
all_pages = eval(open(backup).read())
except:
print("Invalid input. Please try again.")
step1()

else:
print("Invalid input. Please try again.")
step1()

def step2():
global selected_pages, cookie

selection = input("""
Step 2: Specify the pages to be downloaded. You may use the combinations of:
- **all**: download all pages available
- exact numbers (e.g. 5, 3, 16)
- ranges (e.g. 11-13, 1-100)
- keywords odd and/or even, to download odd or even pages respectively
- keyword all, to download all pages available
- commas to separate the tokens
Your input may look like "1, 10-50, odd, 603".
Note that only pages available for preview will be downloaded.

Your input: """)

try:
selected_pages = select_pages(selection, all_pages)

except:
print("Invalid input. Please try again.")
step2()

# it's a surprise tool that will help us later
cookie = get_cookie(list(all_pages.items())[0][1])

def step3():
def step3():
main_directory = input("""
Step 3 (optional): Specify the location to download the book pages to (a new folder will be created in that directory).
ENTER to save them right here.

Your input: """)


if main_directory == "":
main_directory = tempfile.TemporaryDirectory()

try:
new_directory = os.path.join(main_directory, book_data)
if not os.path.exists(new_directory):
os.mkdir(new_directory)
except:
print("Invalid input. Please try again.")
step3()

try:
new_directory = main_directory
if not os.path.exists(new_directory):
os.mkdir(new_directory)
except:
print(f"Invalid input\"{main_directory}\". Please try again, or leave the input blank to use a folder in temp.")
step3()

print(f"\nWe will now download all {len(selected_pages)} pages you selected. This will take a minute or two.\n")
print(f"\nDownload folder is: {new_directory}\n")
download_imgs(selected_pages, cookie, new_directory)

if __name__ == "__main__":
global driver

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--log-level=-1")
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
chrome_options.add_experimental_option("prefs", {"safebrowsing.enabled": True})
driver = webdriver.Chrome("chromedriver.exe", options=chrome_options)

try:
step1()
step2()
step3()

except Exception as e:
with open("google-books-downloader crash.log", "w") as log:
with open("google-books-downloader_crash.log", "w") as log:
log.write(traceback.format_exc())
print(f"""
Something went wrong :/
Expand All @@ -260,17 +276,17 @@ def step3():
- you entered a valid URL of a Google Books book
- your inputs correspond to the formatting
- you have permission to save/create files in this and the download directories

If it still repeats and you think this is an error, please report it on github.com/aprikyan/google-books-downloader.
When reporting, do not forget to attach the following file to the issue:
{os.path.join(os.getcwd(), "google-books-downloader crash.log")}
{os.path.join(os.getcwd(), "google-books-downloader_crash.log")}
""")

else:
print(f"""
The selected pages were successfully downloaded into the "{book_data}" folder!

Note that for your convenience the pages are saved as images. If you would like to combine them in a PDF (or another format), it might be done using specialized websites and apps.""")

# combining in PDF involves asking about its DPI, size, etc, and
# it would take much time and RAM, so it's better to leave it to user
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
regex
progressbar
selenium==3.141.0
selenium-wire>=2.1.0
progressbar2
selenium==4.0.0
selenium-wire>=5.1.0
requests