def clean_bad_pdfs(ps_docs_path):
    """Find and delete corrupted PDF files in the given documents folder.

    Every ``*.pdf`` file (case-insensitive) in ``ps_docs_path`` is probed by
    attempting to open it with PyMuPDF (``fitz``); any file that raises on
    open is treated as corrupted and removed from disk. Progress is reported
    via colorama-colored console output.

    :param ps_docs_path: path to the folder containing downloaded PDF files
    :return: None (side effect: deletes corrupted PDFs, prints a summary)
    """
    pdf_files = [f for f in os.listdir(ps_docs_path) if f.lower().endswith(".pdf")]
    bad_pdfs = []
    for pdf_file in pdf_files:
        full_path = os.path.join(ps_docs_path, pdf_file)
        doc = None
        try:
            doc = fitz.open(filename=full_path)
        except Exception:
            # Unopenable -> corrupted; remember it for deletion below.
            bad_pdfs.append(pdf_file)
        finally:
            # Close the handle of successfully opened documents; leaking it
            # can keep the file locked (notably on Windows) and wastes
            # resources across many PDFs.
            if doc is not None:
                doc.close()
    if bad_pdfs:
        print(Fore.GREEN + f"Found {len(bad_pdfs)} corrupted PDF files. Deleting..." + Style.RESET_ALL)
        for pdf_name in bad_pdfs:
            os.remove(os.path.join(ps_docs_path, pdf_name))
    else:
        print(Fore.GREEN + "Corrupted PDF files were not found" + Style.RESET_ALL)
Reason: {e}") elif keywords_flag == 0: print(Fore.RED + "Keywords gathering won't start because of None user input" + Style.RESET_ALL) - print(Fore.LIGHTGREEN_EX + "-------------------------------------------------") + print(Fore.LIGHTGREEN_EX + "-------------------------------------------------" + Style.RESET_ALL)