From 5ff8cecf64ee898f7c440742d381475997074509 Mon Sep 17 00:00:00 2001 From: Tobias Bocanegra Date: Wed, 28 Feb 2024 09:22:11 +0100 Subject: [PATCH] feat: add limits fixes #420 --- package-lock.json | 8 +-- package.json | 2 +- src/index.js | 109 ++++++++++++++++++++++++------------ src/mdast-process-images.js | 38 +++++++++---- test/fixtures/images.html | 3 + test/fixtures/images.md | 2 + test/index.test.js | 77 ++++++++++++++++++++++++- 7 files changed, 184 insertions(+), 55 deletions(-) diff --git a/package-lock.json b/package-lock.json index f59be2e3..a0ad1419 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,7 @@ "@adobe/fetch": "4.1.1", "@adobe/helix-admin-support": "2.3.22", "@adobe/helix-markdown-support": "7.1.0", - "@adobe/helix-mediahandler": "2.4.11", + "@adobe/helix-mediahandler": "2.4.12", "@adobe/helix-shared-body-data": "2.0.2", "@adobe/helix-shared-process-queue": "3.0.1", "@adobe/helix-shared-utils": "3.0.1", @@ -290,9 +290,9 @@ } }, "node_modules/@adobe/helix-mediahandler": { - "version": "2.4.11", - "resolved": "https://registry.npmjs.org/@adobe/helix-mediahandler/-/helix-mediahandler-2.4.11.tgz", - "integrity": "sha512-dNSDQpTWEPotuT57gRvJ9gZLHmeltZmd7ymhmHAbp6joqhCoGboxm7RHFeBPAHqDnw2n+AJxPVGYTcVE7U8pOw==", + "version": "2.4.12", + "resolved": "https://registry.npmjs.org/@adobe/helix-mediahandler/-/helix-mediahandler-2.4.12.tgz", + "integrity": "sha512-t8wSSm8OdWLTYCA85KQBDckh+v+HWiD07WyKweFM5gLZzbkd2ALVrAYwf4e8FcsTK7tDVwaK/ScZyWlIaKDQNg==", "dependencies": { "@adobe/fetch": "4.1.1", "@aws-sdk/abort-controller": "3.370.0", diff --git a/package.json b/package.json index 36f230a5..89a339d8 100644 --- a/package.json +++ b/package.json @@ -50,7 +50,7 @@ "@adobe/fetch": "4.1.1", "@adobe/helix-admin-support": "2.3.22", "@adobe/helix-markdown-support": "7.1.0", - "@adobe/helix-mediahandler": "2.4.11", + "@adobe/helix-mediahandler": "2.4.12", "@adobe/helix-shared-body-data": "2.0.2", "@adobe/helix-shared-process-queue": "3.0.1", "@adobe/helix-shared-utils": "3.0.1", diff --git a/src/index.js b/src/index.js index 880eacbd..dc951f13 100644 --- a/src/index.js +++ b/src/index.js @@ -12,12 +12,18 @@ import wrap from '@adobe/helix-shared-wrap'; import { helixStatus } from '@adobe/helix-status'; import bodyData from '@adobe/helix-shared-body-data'; -import { Response, h1NoCache } from '@adobe/fetch'; +import { + Response, + h1NoCache, + timeoutSignal, + AbortError, +} from '@adobe/fetch'; import { cleanupHeaderValue } from '@adobe/helix-shared-utils'; import { MediaHandler } from '@adobe/helix-mediahandler'; import { fetchFstab, getContentBusId } from '@adobe/helix-admin-support'; import pkgJson from './package.cjs'; import { html2md } from './html2md.js'; +import { TooManyImagesError } from './mdast-process-images.js'; /* c8 ignore next 7 */ export const { fetch } = h1NoCache(); @@ -122,26 +128,44 @@ async function run(request, ctx) { reqHeaders['x-content-source-location'] = sourceLocation; } - const res = await fetch(url, { - headers: reqHeaders, - }); - if (!res.ok) { - const { status } = res; - if (status >= 400 && status < 500) { - switch (status) { - case 401: - case 403: - case 404: - return error(`resource not found: ${url}`, status); - default: - return error(`error fetching resource at ${url}`, status); + let html; + let res; + // limit response time of content provider to 10s + const signal = timeoutSignal(ctx.env?.HTML_FETCH_TIMEOUT || 10_000); + try { + res = await fetch(url, { + headers: reqHeaders, + signal, + }); + html = await res.text(); + if (!res.ok) { + const { status } = res; + if (status >= 400 && status < 500) { + switch (status) { + case 401: + case 403: + case 404: + return error(`resource not found: ${url}`, status); + default: + return error(`error fetching resource at ${url}`, status); + } + } else { + // propagate other errors as 502 + return error(`error fetching resource at ${url}: ${status}`, 502); } - } else { - // propagate other errors as 502 - return error(`error fetching resource at ${url}: ${status}`, 502); } + // limit response size of content provider to 1mb + if (html.length > 1024 * 1024) { + return error(`error fetching resource at ${url}: html source larger than 1mb`, 409); + } + } catch (e) { + if (e instanceof AbortError) { + return error(`error fetching resource at ${url}: timeout after 10s`, 504); + } + return error(`error fetching resource at ${url}: ${e.message}`, 502); + } finally { + signal.clear(); } - const html = await res.text(); // only use media handler when loaded via fstab. otherwise images are not processed. let mediaHandler; @@ -170,32 +194,43 @@ async function run(request, ctx) { filter: /* c8 ignore next */ (blob) => ((blob.contentType || '').startsWith('image/')), blobAgent: `html2md-${pkgJson.version}`, noCache, + fetchTimeout: 5000, // limit image fetches to 5s forceHttp1: true, }); } - const md = await html2md(html, { - mediaHandler, - log, - url, - }); + try { + const md = await html2md(html, { + mediaHandler, + log, + url, + }); - const headers = { - 'content-type': 'text/markdown; charset=utf-8', - 'content-length': md.length, - 'cache-control': 'no-store, private, must-revalidate', - 'x-source-location': cleanupHeaderValue(url), - }; + const headers = { + 'content-type': 'text/markdown; charset=utf-8', + 'content-length': md.length, + 'cache-control': 'no-store, private, must-revalidate', + 'x-source-location': cleanupHeaderValue(url), + }; - const lastMod = res.headers.get('last-modified'); - if (lastMod) { - headers['last-modified'] = lastMod; - } + const lastMod = res.headers.get('last-modified'); + if (lastMod) { + headers['last-modified'] = lastMod; + } - return new Response(md, { - status: 200, - headers, - }); + return new Response(md, { + status: 200, + headers, + }); + } catch (e) { + if (e instanceof TooManyImagesError) { + return error(`error fetching resource at ${url}: ${e.message}`, 409); + } + /* c8 ignore next 2 */ + return error(`error fetching resource at ${url}: ${e.message}`, 500); + } finally { + await mediaHandler?.fetchContext.reset(); + } } export const main = wrap(run) diff --git a/src/mdast-process-images.js b/src/mdast-process-images.js index 142f77af..486e4b31 100644 --- a/src/mdast-process-images.js +++ b/src/mdast-process-images.js @@ -9,9 +9,12 @@ * OF ANY KIND, either express or implied. See the License for the specific language * governing permissions and limitations under the License. */ -import { visit } from 'unist-util-visit'; +import { visit, CONTINUE } from 'unist-util-visit'; import processQueue from '@adobe/helix-shared-process-queue'; +export class TooManyImagesError extends Error { +} + /** * Process images * @param {Console} log @@ -24,33 +27,48 @@ export async function processImages(log, tree, mediaHandler, baseUrl) { return; } // gather all image nodes - const images = []; + const images = new Map(); + const register = (node) => { + if (images.has(node.url)) { + images.get(node.url).push(node); + } else { + images.set(node.url, [node]); + } + }; + visit(tree, (node) => { if (node.type === 'image') { const { url = '' } = node; if (url.indexOf(':') < 0) { // eslint-disable-next-line no-param-reassign node.url = new URL(url, baseUrl).href; - images.push(node); + register(node); } else if (url.startsWith('https://')) { - images.push(node); + register(node); } } - return visit.CONTINUE; + return CONTINUE; }); + if (images.size > 100) { + throw new TooManyImagesError(`maximum number of images reached: ${images.size} of 100 max.`); + } + // upload images - await processQueue(images, async (node) => { - const { url } = node; + await processQueue(images.entries(), async ([url, nodes]) => { try { - const blob = await mediaHandler.getBlob(node.url, baseUrl); + const blob = await mediaHandler.getBlob(url, baseUrl); // eslint-disable-next-line no-param-reassign - node.url = blob?.uri || 'about:error'; + url = blob?.uri || 'about:error'; + /* c8 ignore next 6 */ } catch (e) { // in case of invalid urls, or other errors log.warn(`Failed to fetch image for url '${url}': ${e.message}`); // eslint-disable-next-line no-param-reassign - node.url = 'about:error'; + url = 'about:error'; + } + for (const node of nodes) { + node.url = url; } }, 8); } diff --git a/test/fixtures/images.html b/test/fixtures/images.html index c0d87f91..798c937e 100644 --- a/test/fixtures/images.html +++ b/test/fixtures/images.html @@ -14,6 +14,9 @@

Hello, World.

+ + + diff --git a/test/fixtures/images.md b/test/fixtures/images.md index 16981585..b166d751 100644 --- a/test/fixtures/images.md +++ b/test/fixtures/images.md @@ -6,6 +6,8 @@ ![][image0] +![][image0] + ![][image1] ![][image2] diff --git a/test/index.test.js b/test/index.test.js index 34f944bb..e4d8a484 100644 --- a/test/index.test.js +++ b/test/index.test.js @@ -126,7 +126,7 @@ describe('Index Tests', () => { assert.strictEqual((await result.text()).trim(), expected.trim()); assert.deepStrictEqual(result.headers.plain(), { 'cache-control': 'no-store, private, must-revalidate', - 'content-length': '811', + 'content-length': '824', 'content-type': 'text/markdown; charset=utf-8', 'last-modified': 'Sat, 22 Feb 2031 15:28:00 GMT', 'x-source-location': 'https://www.example.com/blog/article', @@ -188,7 +188,7 @@ describe('Index Tests', () => { assert.strictEqual((await result.text()).trim(), expected.trim()); assert.deepStrictEqual(result.headers.plain(), { 'cache-control': 'no-store, private, must-revalidate', - 'content-length': '811', + 'content-length': '824', 'content-type': 'text/markdown; charset=utf-8', 'last-modified': 'Sat, 22 Feb 2031 15:28:00 GMT', 'x-source-location': 'https://www.example.com/blog/article', @@ -409,7 +409,45 @@ mountpoints: }); }); - it('returns 502 for am error response', async () => { + it('returns 409 for too many different images', async () => { + let html = '
'; + for (let i = 0; i < 101; i += 1) { + html += ``; + } + html += '
'; + + nock.fstab(); + nock('https://www.example.com') + .get('/') + .reply(200, html); + + const result = await main(reqUrl('/'), { log: console, env: {} }); + assert.strictEqual(result.status, 409); + assert.strictEqual(await result.text(), ''); + assert.deepStrictEqual(result.headers.plain(), { + 'cache-control': 'no-store, private, must-revalidate', + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'error fetching resource at https://www.example.com/: maximum number of images reached: 101 of 100 max.', + }); + }); + + it('returns 409 for a large html', async () => { + nock.fstab(); + nock('https://www.example.com') + .get('/') + .reply(200, 'x'.repeat(1024 ** 2 + 1)); + + const result = await main(reqUrl('/'), { log: console }); + assert.strictEqual(result.status, 409); + assert.strictEqual(await result.text(), ''); + assert.deepStrictEqual(result.headers.plain(), { + 'cache-control': 'no-store, private, must-revalidate', + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'error fetching resource at https://www.example.com/: html source larger than 1mb', + }); + }); + + it('returns 502 for a error response', async () => { nock.fstab(); nock('https://www.example.com') .get('/') @@ -424,4 +462,37 @@ mountpoints: 'x-error': 'error fetching resource at https://www.example.com/: 500', }); }); + + it('returns 502 for a fetch error', async () => { + nock.fstab(); + nock('https://www.example.com') + .get('/') + .replyWithError(new Error('boom!')); + + const result = await main(reqUrl('/'), { log: console }); + assert.strictEqual(result.status, 502); + assert.strictEqual(await result.text(), ''); + assert.deepStrictEqual(result.headers.plain(), { + 'cache-control': 'no-store, private, must-revalidate', + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'error fetching resource at https://www.example.com/: boom!', + }); + }); + + it('returns 504 when html fetch times out', async () => { + nock.fstab(); + nock('https://www.example.com') + .get('/') + .delay(100) + .reply(404); + + const result = await main(reqUrl('/'), { log: console, env: { HTML_FETCH_TIMEOUT: 10 } }); + assert.strictEqual(result.status, 504); + assert.strictEqual(await result.text(), ''); + assert.deepStrictEqual(result.headers.plain(), { + 'cache-control': 'no-store, private, must-revalidate', + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'error fetching resource at https://www.example.com/: timeout after 10s', + }); + }); });