Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dependency updates & cookie handling #20

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 66 additions & 50 deletions gbd.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os
import urllib
import traceback
import regex as re
import requests
from time import sleep
import tempfile
from seleniumwire import webdriver
from progressbar import progressbar as bar
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

print("""
Google Books Downloader by @aprikyan, 2020.
Expand All @@ -24,13 +26,13 @@ def get_book_url():
(e.g. https://books.google.com/books?id=buc0AAAAMAAJ&printsec=frontcover&sa=X&ved=2ahUKEwj-y8T4r5vrAhWKLewKHaIQBnYQ6AEwAXoECAQQAg#v=onepage&f=false)

Your input: """)

if re.findall(r"id=[A-Za-z0-9]+", url):
id_part = re.findall(r"id=[A-Za-z0-9]+", url)[-1]
else:
print("Invalid input. Please try again.")
get_book_url()

return (f"https://books.google.com/books?{id_part}&pg=1&hl=en#v=onepage&q&f=false",
f"https://books.google.com/books?{id_part}&pg=1&hl=en&f=false&output=embed&source=gbs_embed")

Expand All @@ -41,10 +43,10 @@ def get_book_data(url):
"""
driver.get(url)
driver.refresh()
sleep(3)
title = driver.find_element_by_class_name("gb-volume-title").text
author = driver.find_element_by_class_name("addmd").text
sleep(2)
title = driver.find_element(By.CLASS_NAME, "gb-volume-title").text
author = driver.find_element(By.CLASS_NAME, "addmd").text

return f"{title} (b{author[1:]})"

def capture_requests(url):
Expand All @@ -54,15 +56,15 @@ def capture_requests(url):
"""
driver.get(url)
driver.refresh()
sleep(5)
sleep(2)
checkpoint = ""
while checkpoint != driver.find_element_by_class_name("pageImageDisplay"):
checkpoint = driver.find_element_by_class_name("pageImageDisplay")

while checkpoint != driver.find_element(By.CLASS_NAME, "pageImageDisplay"):
checkpoint = driver.find_element(By.CLASS_NAME, "pageImageDisplay")
checkpoint.click()
# scrolling ~25 pages
for i in range(25):
html = driver.find_element_by_tag_name("body")
html = driver.find_element(By.TAG_NAME, "body")
html.click()
html.send_keys(Keys.SPACE)
sleep(2)
Expand All @@ -86,12 +88,12 @@ def save_backup():
save = input("""
Would you like to save a backup file (type Yes or No)?
Your input: """).upper()
if save == "YES":

if save == "YES" or save == "Y":
with open(f"Backup of {book_data}.txt", "w") as f:
f.write(str(all_pages))
print(f"Successfully backed up the book in \"Backup of {book_data}.txt\"!")

elif save != "NO":
print("Invalid input. Please try again.")
save_backup()
Expand All @@ -104,7 +106,7 @@ def select_pages(user_input, all_pages):
"""
ranges = user_input.replace(" ", "").split(",")
page_numbers = []

if "all" in ranges:
return all_pages
while "odd" in ranges:
Expand All @@ -121,7 +123,7 @@ def select_pages(user_input, all_pages):
page_numbers.append((int(segment), all_pages[int(segment)]))

return dict(set(page_numbers))

def get_cookie(url):
"""
Driver needs to behave like a real
Expand All @@ -131,11 +133,13 @@ def get_cookie(url):
cookies = []
driver.get(url)
driver.refresh()

for request in driver.requests:
if request.headers:
if "Cookie" in request.headers.keys():
cookies.append(request.headers["Cookie"])
if len(cookies) == 0:
cookies = driver.get_cookies()

return cookies[0]

Expand All @@ -145,24 +149,27 @@ def download_imgs(pages, cookie, directory):
the cookie to use and the directory
to save to, and then does the magic.
"""
proxy = urllib.request.ProxyHandler({})
opener = urllib.request.build_opener(proxy)
opener.addheaders = [("user-agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30"),
("cookie", cookie)]
urllib.request.install_opener(opener)


headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
"cookie": f"NID={cookie['value']}", }

for number, url in bar(pages.items()):
urllib.request.urlretrieve(url, os.path.join(directory, f"page{number}.png"))
response = requests.get(url, headers=headers, stream=True)
response.raise_for_status() # Check for HTTP request errors

with open(os.path.join(directory, f"page{number}.png"), 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)

def step1():
global book_data, all_pages

from_url = input("""
Would you like to download a book from URL? Type No if you have a backup, otherwise type Yes.

Your input: """).upper()
if from_url == "YES":

if from_url == "YES" or from_url == "Y":
data_url, pages_url = get_book_url()
book_data = get_book_data(data_url)
print(f"\nWe will now process the pages of \"{book_data}\" one by one. Sit back and relax, as this may take some time, depending on the number of its pages.\n")
Expand All @@ -171,85 +178,94 @@ def step1():
print("""Now that most of the job is done (yahoo!), it is highly recommended to backup the current progress we have made, so as not to lose it if an error happens to be thrown afterward.
Also, if you would like to download another segment of this book later, the backup will be used then to save your precious time.""")
save_backup()

elif from_url == "NO":
backup = input("""
Enter the location of the backup file.
(e.g. C:/Users/User/Downloads/Backup_of_booktitle.txt)

Your input: """)
Your input: """)

try:
book_data = os.path.basename(backup)[10:-4]
all_pages = eval(open(backup).read())
except:
print("Invalid input. Please try again.")
step1()

else:
print("Invalid input. Please try again.")
step1()

def step2():
global selected_pages, cookie

selection = input("""
Step 2: Specify the pages to be downloaded. You may use the combinations of:
- **all**: download all pages available
- exact numbers (e.g. 5, 3, 16)
- ranges (e.g. 11-13, 1-100)
- keywords odd and/or even, to download odd or even pages respectively
- keyword all, to download all pages available
- commas to separate the tokens
Your input may look like "1, 10-50, odd, 603".
Note that only pages available for preview will be downloaded.

Your input: """)

try:
selected_pages = select_pages(selection, all_pages)

except:
print("Invalid input. Please try again.")
step2()

# it's a surprise tool that will help us later
cookie = get_cookie(list(all_pages.items())[0][1])

def step3():
def step3():
main_directory = input("""
Step 3 (optional): Specify the location to download the book pages to (a new folder will be created in that directory).
ENTER to save them right here.

Your input: """)


if main_directory == "":
main_directory = tempfile.TemporaryDirectory()

try:
new_directory = os.path.join(main_directory, book_data)
if not os.path.exists(new_directory):
os.mkdir(new_directory)
except:
print("Invalid input. Please try again.")
step3()

try:
new_directory = main_directory
if not os.path.exists(new_directory):
os.mkdir(new_directory)
except:
print(f"Invalid input\"{main_directory}\". Please try again, or leave the input blank to use a folder in temp.")
step3()

print(f"\nWe will now download all {len(selected_pages)} pages you selected. This will take a minute or two.\n")
print(f"\nDownload folder is: {new_directory}\n")
download_imgs(selected_pages, cookie, new_directory)

if __name__ == "__main__":
global driver

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--log-level=-1")
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
chrome_options.add_experimental_option("prefs", {"safebrowsing.enabled": True})
driver = webdriver.Chrome("chromedriver.exe", options=chrome_options)

try:
step1()
step2()
step3()

except Exception as e:
with open("google-books-downloader crash.log", "w") as log:
with open("google-books-downloader_crash.log", "w") as log:
log.write(traceback.format_exc())
print(f"""
Something went wrong :/
Expand All @@ -260,17 +276,17 @@ def step3():
- you entered a valid URL of a Google Books book
- your inputs correspond to the formatting
- you have permission to save/create files in this and the download directories

If it still repeats and you think this is an error, please report it on github.com/aprikyan/google-books-downloader.
When reporting, do not forget to attach the following file to the issue:
{os.path.join(os.getcwd(), "google-books-downloader crash.log")}
{os.path.join(os.getcwd(), "google-books-downloader_crash.log")}
""")

else:
print(f"""
The selected pages were successfully downloaded into the "{book_data}" folder!

Note that for your convenience the pages are saved as images. If you would like to combine them in a PDF (or another format), it might be done using specialized websites and apps.""")

# combining in PDF involves asking about its DPI, size, etc, and
# it would take much time and RAM, so it's better to leave it to user
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
regex
progressbar
selenium==3.141.0
selenium-wire>=2.1.0
progressbar2
selenium==4.0.0
selenium-wire>=5.1.0
requests