Skip to content

Commit

Permalink
Crawler to strip null bytes
Browse files: browse the repository at this point in the history
  • Loading branch information
kearfy committed Mar 15, 2024
1 parent d72ab3d commit f9addbd
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions plugins/crawler/index.mjs
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,7 +38,11 @@ export async function onSuccess() {
const urls = sitemap.urlset.url;
console.log(`[CW] The sitemap contains ${urls.length} url(s)`);

const pathnames = urls.map((url) => decodeURI(new URL(url.loc[0]).pathname));
const pathnames = urls.map((url) => {
let pathname = decodeURI(new URL(url.loc[0]).pathname);
if (pathname.endsWith('/')) pathname = pathname.slice(0, -1);
return pathname;
});
const chunkSize = 1;

for (let i = 0; i < pathnames.length; i += chunkSize) {
Expand All @@ -48,7 +52,7 @@ export async function onSuccess() {
console.log(`[CW] Crawling page ${index + 1}/${pathnames.length}: ${pathname}`);

const filePath = `${buildDir}${pathname}/index.html`;
const fileContent = fs.readFileSync(filePath, "utf-8");
const fileContent = fs.readFileSync(filePath, "utf-8").replace(/\0/g, '');
const document = parseHTML(fileContent);

const scrapByQuerySelector = (query) => document.querySelectorAll(query)
Expand Down Expand Up @@ -128,5 +132,6 @@ export async function onSuccess() {
console.log(`[CW] Skipping stale page removal, not on prod`);
}

console.log(`[CW] Closing connection to SurrealDB`);
await db.close();
}

0 comments on commit f9addbd

Please sign in to comment.