Skip to content

Commit

Permalink
Added counter of inspected links, added txt report of gathered data
Browse files (browse the repository at this point in the history)
  • Loading branch information
OSINT-TECHNOLOGIES authored Aug 15, 2024
1 parent f9705a1 commit be48458
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions pagesearch/pagesearch_deepsearch.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,25 +8,26 @@ def sitemap_inspection(report_folder):
if os.path.exists(report_folder + '//03-sitemap_links.txt'):
try:
accessed_links_counter = 0
print(Fore.GREEN + "Trying to access sitemap_links.txt file..." + Style.RESET_ALL)
print(Fore.GREEN + "Trying to access sitemap_links.txt file" + Style.RESET_ALL)
with open(report_folder + '//03-sitemap_links.txt', "r") as file:
links = file.readlines()
print(Fore.GREEN + "Reading file and forming links list..." + Style.RESET_ALL)
print(Fore.GREEN + "Reading file and forming links list" + Style.RESET_ALL)
ps_docs_path = report_folder + '//sitemap_inspection'
if not os.path.exists(ps_docs_path):
os.makedirs(ps_docs_path)
total_emails = []
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
links = [link.strip() for link in links]
total_links_counter = len(links)
print(Fore.GREEN + "Gathering e-mails..." + Style.RESET_ALL)
for url in links:
response = requests.get(url)
if response.status_code == 200:
accessed_links_counter += 1
soup = BeautifulSoup(response.content, 'html.parser')
emails = re.findall(email_pattern, soup.text)
total_emails.append(emails)
print(Fore.GREEN + f" Inspecting links: " + Style.RESET_ALL + Fore.LIGHTCYAN_EX + Style.BRIGHT + f"Link #{accessed_links_counter}" + Style.RESET_ALL, end="\r")

ds_emails_list = [x for x in total_emails if x]
ds_emails_cleaned = [', '.join(sublist) for sublist in ds_emails_list]
ds_emails_return = list(set(ds_emails_cleaned))
Expand All @@ -35,8 +36,14 @@ def sitemap_inspection(report_folder):
print(Fore.GREEN + f"\nDuring PageSearch Sitemap Inspection process:\n[+] Total {total_links_counter} links were checked")
print(Fore.GREEN + f"[+] Among them, {accessed_links_counter} links were accessible")
print(Fore.GREEN + f"[+] In result, {len(ds_emails_return)} unique e-mail addresses were found")
with open(ps_docs_path + "//inspection_logs.txt", "w") as si_logs:
si_logs.write('# THIS IS PAGESEARCH SITEMAP INSPECTION LOGS' + '\n')
si_logs.write('# HERE YOU CAN FIND INFO THAT WAS DISCOVERED DURING SITEMAP INSPECTION PROCESS' + '\n' * 3)
si_logs.write(f'[+] LINKS: Received {total_links_counter} links. Amount of accessible links: {accessed_links_counter}' + '\n')
si_logs.write(f'[+] EMAILS: Returned and stored in PDF/XLSX report. Total {len(ds_emails_return)} unique emails found' + '\n')
return ds_emails_return
except FileNotFoundError:
print(Fore.RED + f"Cannot start PageSearch in Sitemap Inspection mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)
else:
print(Fore.RED + f"Cannot start PageSearch in Sitemap Inspection mode because sitemap_links.txt file doesn't exist" + Style.RESET_ALL)

0 comments on commit be48458

Please sign in to comment.