From 251cfcdde7ac54525eef49341a9ede52d368cf71 Mon Sep 17 00:00:00 2001 From: Alexandre Capt Date: Mon, 13 Nov 2023 14:10:30 +0100 Subject: [PATCH] feat: improve default import (#261) --- package-lock.json | 4 +- src/importer/HTML2x.js | 23 +--- src/importer/defaults/generateDocumentPath.js | 24 +++++ .../defaults/rules/adjustImageUrls.js | 28 +++++ src/importer/defaults/rules/convertIcons.js | 25 +++++ src/importer/defaults/rules/createMetadata.js | 102 ++++++++++++++++++ .../rules/transformBackgroundImages.js | 21 ++++ src/importer/defaults/transformDOM.js | 42 ++++++++ src/index.js | 13 +++ src/utils/DOMUtils.js | 6 +- test/TestUtils.js | 21 ++++ test/browser/DOMUtils.test.js | 67 ++++++++++++ test/importers/HTML2x.spec.js | 41 ++----- test/importers/PageImporter.spec.js | 4 + .../fixtures/adjust-image-urls.expected.html | 7 ++ .../fixtures/adjust-image-urls.input.html | 10 ++ .../fixtures/background-image.expected.html | 13 +++ .../fixtures/background-image.input.html | 10 ++ .../defaults/fixtures/cleanup.expected.html | 5 + .../defaults/fixtures/cleanup.input.html | 11 ++ .../defaults/fixtures/default.expected.html | 4 + .../defaults/fixtures/default.input.html | 6 ++ .../defaults/fixtures/icons.expected.html | 4 + .../defaults/fixtures/icons.input.html | 6 ++ .../fixtures/metadata.all.diff.expected.html | 40 +++++++ .../fixtures/metadata.all.diff.input.html | 17 +++ .../fixtures/metadata.all.same.expected.html | 20 ++++ .../fixtures/metadata.all.same.input.html | 17 +++ .../fixtures/metadata.basic.expected.html | 16 +++ .../fixtures/metadata.basic.input.html | 9 ++ .../fixtures/metadata.image.expected.html | 12 +++ .../fixtures/metadata.image.input.html | 9 ++ .../fixtures/metadata.og.expected.html | 16 +++ .../defaults/fixtures/metadata.og.input.html | 9 ++ .../fixtures/metadata.twitter.expected.html | 16 +++ .../fixtures/metadata.twitter.input.html | 9 ++ .../defaults/generateDocumentPath.spec.js | 32 ++++++ test/importers/defaults/transformDOM.spec.js | 94 ++++++++++++++++ test/importers/fixtures/video.spec.html | 11 ++ test/importers/fixtures/video.spec.md | 7 ++ test/utils/DOMUtils.spec.js | 7 +- 41 files changed, 774 insertions(+), 64 deletions(-) create mode 100644 src/importer/defaults/generateDocumentPath.js create mode 100644 src/importer/defaults/rules/adjustImageUrls.js create mode 100644 src/importer/defaults/rules/convertIcons.js create mode 100644 src/importer/defaults/rules/createMetadata.js create mode 100644 src/importer/defaults/rules/transformBackgroundImages.js create mode 100644 src/importer/defaults/transformDOM.js create mode 100644 test/TestUtils.js create mode 100644 test/browser/DOMUtils.test.js create mode 100644 test/importers/defaults/fixtures/adjust-image-urls.expected.html create mode 100644 test/importers/defaults/fixtures/adjust-image-urls.input.html create mode 100644 test/importers/defaults/fixtures/background-image.expected.html create mode 100644 test/importers/defaults/fixtures/background-image.input.html create mode 100644 test/importers/defaults/fixtures/cleanup.expected.html create mode 100644 test/importers/defaults/fixtures/cleanup.input.html create mode 100644 test/importers/defaults/fixtures/default.expected.html create mode 100644 test/importers/defaults/fixtures/default.input.html create mode 100644 test/importers/defaults/fixtures/icons.expected.html create mode 100644 test/importers/defaults/fixtures/icons.input.html create mode 100644 test/importers/defaults/fixtures/metadata.all.diff.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.all.diff.input.html create mode 100644 test/importers/defaults/fixtures/metadata.all.same.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.all.same.input.html create mode 100644 test/importers/defaults/fixtures/metadata.basic.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.basic.input.html create mode 100644 test/importers/defaults/fixtures/metadata.image.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.image.input.html create mode 100644 test/importers/defaults/fixtures/metadata.og.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.og.input.html create mode 100644 test/importers/defaults/fixtures/metadata.twitter.expected.html create mode 100644 test/importers/defaults/fixtures/metadata.twitter.input.html create mode 100644 test/importers/defaults/generateDocumentPath.spec.js create mode 100644 test/importers/defaults/transformDOM.spec.js create mode 100644 test/importers/fixtures/video.spec.html create mode 100644 test/importers/fixtures/video.spec.md diff --git a/package-lock.json b/package-lock.json index 53f0424..8738ff6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@adobe/helix-importer", - "version": "2.9.41", + "version": "3.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@adobe/helix-importer", - "version": "2.9.41", + "version": "3.0.0", "license": "Apache-2.0", "dependencies": { "@adobe/helix-markdown-support": "7.1.0", diff --git a/src/importer/HTML2x.js b/src/importer/HTML2x.js index ad9fa41..2f6682c 100644 --- a/src/importer/HTML2x.js +++ b/src/importer/HTML2x.js @@ -17,6 +17,8 @@ import PageImporterResource from './PageImporterResource.js'; import MemoryHandler from '../storage/MemoryHandler.js'; import Utils from '../utils/Utils.js'; import BrowserUtils from '../utils/BrowserUtils.js'; +import defaultTransformDOM from './defaults/transformDOM.js'; +import defaultGenerateDocumentPath from './defaults/generateDocumentPath.js'; // import docxStylesXML from '../resources/styles.xml'; @@ -35,27 +37,6 @@ function setBackgroundImagesFromCSS(document) { } } -async function defaultTransformDOM({ - // eslint-disable-next-line no-unused-vars - url, document, html, params, -}) { - return document.body; -} - -async function defaultGenerateDocumentPath({ - // eslint-disable-next-line no-unused-vars - url, document, html, params, -}) { - let p = new URL(url).pathname; - if (p.endsWith('/')) { - p = `${p}index`; - } - return decodeURIComponent(p) - .toLowerCase() - .replace(/\.html$/, '') - .replace(/[^a-z0-9/]/gm, '-'); -} - async function html2x( url, doc, diff --git a/src/importer/defaults/generateDocumentPath.js b/src/importer/defaults/generateDocumentPath.js new file mode 100644 index 0000000..9912ead --- /dev/null +++ b/src/importer/defaults/generateDocumentPath.js @@ -0,0 +1,24 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +export default async function generateDocumentPath({ + // eslint-disable-next-line no-unused-vars + url, document, html, params, +}) { + let p = new URL(url).pathname; + if (p.endsWith('/')) { + p = `${p}index`; + } + return decodeURIComponent(p) + .toLowerCase() + .replace(/\.html$/, '') + .replace(/[^a-z0-9/]/gm, '-'); +} diff --git a/src/importer/defaults/rules/adjustImageUrls.js b/src/importer/defaults/rules/adjustImageUrls.js new file mode 100644 index 0000000..8a10c3f --- /dev/null +++ b/src/importer/defaults/rules/adjustImageUrls.js @@ -0,0 +1,28 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +export default function adjustImageUrls(main, url) { + [...main.querySelectorAll('img')].forEach((img) => { + const src = img.getAttribute('src'); + if (src && (src.startsWith('./') || src.startsWith('/') || src.startsWith('../'))) { + try { + const u = new URL(src, url); + // eslint-disable-next-line no-param-reassign + img.src = u.toString(); + } catch (e) { + // eslint-disable-next-line no-console + console.log(`Unable to adjust image URL ${img.src} - removing image`); + img.remove(); + } + } + }); +} diff --git a/src/importer/defaults/rules/convertIcons.js b/src/importer/defaults/rules/convertIcons.js new file mode 100644 index 0000000..b2692cb --- /dev/null +++ b/src/importer/defaults/rules/convertIcons.js @@ -0,0 +1,25 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +export default function convertIcons(main, document) { + [...main.querySelectorAll('img')].forEach((img) => { + const src = img.getAttribute('src'); + if (src && src.endsWith('.svg')) { + const span = document.createElement('span'); + const name = src.split('/').pop().split('.')[0].toLowerCase().trim().replace(/[^a-z0-9]/g, '-'); + if (name) { + span.innerHTML = `:${name}:`; + img.replaceWith(span); + } + } + }); +} diff --git a/src/importer/defaults/rules/createMetadata.js b/src/importer/defaults/rules/createMetadata.js new file mode 100644 index 0000000..4dd5ddf --- /dev/null +++ b/src/importer/defaults/rules/createMetadata.js @@ -0,0 +1,102 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import Blocks from '../../../utils/Blocks.js'; + +function getMetadata(name, document) { + const attr = name && name.includes(':') ? 'property' : 'name'; + const meta = [...document.head.querySelectorAll(`meta[${attr}="${name}"]`)] + .map((m) => m.content) + .join(', '); + return meta || ''; +} + +export default function createMetadata(main, document) { + const meta = {}; + + const title = document.querySelector('title'); + if (title) { + meta.Title = title.textContent.replace(/[\n\t]/gm, ''); + } + + const desc = getMetadata('description', document); + if (desc) { + meta.Description = desc; + } + + const img = getMetadata('og:image', document); + if (img) { + const el = document.createElement('img'); + el.src = img; + meta.Image = el; + + const imgAlt = getMetadata('og:image:alt', document); + if (imgAlt) { + el.alt = imgAlt; + } + } + + const ogtitle = getMetadata('og:title', document); + if (ogtitle && ogtitle !== meta.Title) { + if (meta.Title) { + meta['og:title'] = ogtitle; + } else { + meta.Title = ogtitle; + } + } + + const ogdesc = getMetadata('og:description', document); + if (ogdesc && ogdesc !== meta.Description) { + if (meta.Description) { + meta['og:description'] = ogdesc; + } else { + meta.Description = ogdesc; + } + } + + const ttitle = getMetadata('twitter:title', document); + if (ttitle && ttitle !== meta.Title) { + if (meta.Title) { + meta['twitter:title'] = ttitle; + } else { + meta.Title = ttitle; + } + } + + const tdesc = getMetadata('twitter:description', document); + if (tdesc && tdesc !== meta.Description) { + if (meta.Description) { + meta['twitter:description'] = tdesc; + } else { + meta.Description = tdesc; + } + } + + const timg = getMetadata('twitter:image', document); + if (timg && timg !== img) { + const el = document.createElement('img'); + el.src = timg; + meta['twitter:image'] = el; + + const imgAlt = getMetadata('twitter:image:alt', document); + if (imgAlt) { + el.alt = imgAlt; + } + } + + if (Object.keys(meta).length > 0) { + const block = Blocks.getMetadataBlock(document, meta); + main.append(block); + } + + return meta; +} diff --git a/src/importer/defaults/rules/transformBackgroundImages.js b/src/importer/defaults/rules/transformBackgroundImages.js new file mode 100644 index 0000000..65041cb --- /dev/null +++ b/src/importer/defaults/rules/transformBackgroundImages.js @@ -0,0 +1,21 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import DOMUtils from '../../../utils/DOMUtils.js'; + +export default function transformBackgroundImages(main, document) { + [...main.querySelectorAll('[style*="background-image: url"]')].forEach((element) => { + const img = DOMUtils.getImgFromBackground(element, document); + element.prepend(img); + element.style.removeProperty('background-image'); + }); +} diff --git a/src/importer/defaults/transformDOM.js b/src/importer/defaults/transformDOM.js new file mode 100644 index 0000000..4e14ea6 --- /dev/null +++ b/src/importer/defaults/transformDOM.js @@ -0,0 +1,42 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +import DOMUtils from '../../utils/DOMUtils.js'; +import createMetadata from './rules/createMetadata.js'; +import adjustImageUrls from './rules/adjustImageUrls.js'; +import convertIcons from './rules/convertIcons.js'; +import transformBackgroundImages from './rules/transformBackgroundImages.js'; + +export default async function transformDOM({ + // eslint-disable-next-line no-unused-vars + url, document, html, params, +}) { + const main = document.body; + + // attempt to remove non-content elements + DOMUtils.remove(main, [ + 'header', + '.header', + 'nav', + '.nav', + 'footer', + '.footer', + 'iframe', + 'noscript', + ]); + + createMetadata(main, document); + transformBackgroundImages(main, document); + adjustImageUrls(main, url); + convertIcons(main, document); + + return main; +} diff --git a/src/index.js b/src/index.js index 5f01278..bce634f 100644 --- a/src/index.js +++ b/src/index.js @@ -27,6 +27,18 @@ import WPUtils from './wp/WPUtils.js'; import { html2md, html2docx } from './importer/HTML2x.js'; +import createMetadata from './importer/defaults/rules/createMetadata.js'; +import adjustImageUrls from './importer/defaults/rules/adjustImageUrls.js'; +import convertIcons from './importer/defaults/rules/convertIcons.js'; +import transformBackgroundImages from './importer/defaults/rules/transformBackgroundImages.js'; + +const rules = { + createMetadata, + adjustImageUrls, + convertIcons, + transformBackgroundImages, +}; + export { PageImporter, PageImporterParams, @@ -42,4 +54,5 @@ export { WPUtils, html2md, html2docx, + rules, }; diff --git a/src/utils/DOMUtils.js b/src/utils/DOMUtils.js index 6db0d4f..934ebc5 100644 --- a/src/utils/DOMUtils.js +++ b/src/utils/DOMUtils.js @@ -258,7 +258,9 @@ export default class DOMUtils { const styleAttr = element?.getAttribute('style')?.split(';'); if (styleAttr) { styleAttr.forEach((style) => { - const [prop, value] = style.split(':'); + const split = style.split(':'); + const prop = split.shift(); + const value = split.join(':').trim(); if (prop === 'background-image') { const trimmedValue = value.replace(/\s/g, ''); const elStyle = element.style; @@ -267,7 +269,7 @@ export default class DOMUtils { }); const url = element.style.backgroundImage; if (url && url.toLowerCase() !== 'none') { - const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/\)/gm, ''); + const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/"/gm, '').replace(/\)/gm, ''); const img = document.createElement('img'); img.src = src; return img; diff --git a/test/TestUtils.js b/test/TestUtils.js new file mode 100644 index 0000000..7bfe3a8 --- /dev/null +++ b/test/TestUtils.js @@ -0,0 +1,21 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import { JSDOM } from 'jsdom'; + +export default class TestUtils { + // test environment createDocumentFromString version using JSDOM + static createDocumentFromString(html) { + const { document } = new JSDOM(html, { runScripts: undefined }).window; + return document; + } +} diff --git a/test/browser/DOMUtils.test.js b/test/browser/DOMUtils.test.js new file mode 100644 index 0000000..ec8f866 --- /dev/null +++ b/test/browser/DOMUtils.test.js @@ -0,0 +1,67 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +/* eslint-env mocha */ +/* eslint-disable no-unused-expressions */ + +import { expect } from '@esm-bundle/chai'; +import BrowserUtils from '../../src/utils/BrowserUtils.js'; +import DOMUtils from '../../src/utils/DOMUtils.js'; + +const createElement = (document, tag, attrs, styles, innerHTML) => { + const element = document.createElement(tag); + // eslint-disable-next-line no-restricted-syntax, guard-for-in + for (const a in attrs) { + element.setAttribute(a, attrs[a]); + } + // eslint-disable-next-line no-restricted-syntax, guard-for-in + for (const p in styles) { + element.style[p] = styles[p]; + } + element.innerHTML = innerHTML; + return element; +}; + +describe('DOMUtils#element', () => { + const test = (tag, attrs, styles, innerHTML, expected) => { + const document = BrowserUtils.createDocumentFromString(''); + const element = createElement(document, tag, attrs, styles, innerHTML); + const ret = DOMUtils.getImgFromBackground(element, document); + if (expected) { + expect(ret).to.not.be.null; + expect(ret.outerHTML).to.equal(expected); + } else { + expect(ret).to.be.null; + } + }; + + it('no background-image style', () => { + test('p', {}, {}, 'Some content', null); + test('img', { src: 'https://www.server.com/image.jpg', title: 'Some title' }, {}, '', null); + test('p', {}, { 'background-image': 'none' }, 'Some content', null); + }); + + it('with background-image style', () => { + test('p', {}, { 'background-image': 'url(https://www.server.com/image.jpg)' }, 'Some content', ''); + test('p', {}, { 'background-image': 'url("https://www.server.com/image.jpg")' }, 'Some content', ''); + test('p', {}, { 'background-image': 'url(\'https://www.server.com/image.jpg\')' }, 'Some content', ''); + test('p', {}, { 'background-image': 'url(http://localhost:3001/image.jpg)' }, 'Some content', ''); + }); + + // `createElement` uses JSDOM to create the test-DOM + // the workaround in DOMUtils#getImgFromBackground exists _precisely_ + // because of a potential bug in JSDOM due to which it doesn't + // parse `url()` with whitespaces correctly + // browser specific version of the test + it('with background-image style containing whitespace in url()', () => { + test('p', {}, { 'background-image': 'url( /image.jpg )' }, 'Some content', ''); + }); +}); diff --git a/test/importers/HTML2x.spec.js b/test/importers/HTML2x.spec.js index cb09338..4f5b7d5 100644 --- a/test/importers/HTML2x.spec.js +++ b/test/importers/HTML2x.spec.js @@ -14,7 +14,6 @@ import { deepStrictEqual, ok, strictEqual, fail, } from 'assert'; import { describe, it } from 'mocha'; -import { JSDOM } from 'jsdom'; import { docx2md } from '@adobe/helix-docx2md'; import MockMediaHandler from '../mocks/MockMediaHandler.js'; @@ -22,39 +21,11 @@ import DOMUtils from '../../src/utils/DOMUtils.js'; import { html2md, html2docx, - defaultGenerateDocumentPath, - defaultTransformDOM, } from '../../src/importer/HTML2x.js'; -// test environment createDocumentFromString version using JSDOM -const createDocumentFromString = (html) => { - const { document } = new JSDOM(html, { runScripts: undefined }).window; - return document; -}; - -describe('defaultTransformDOM tests', () => { - it('default transformation', async () => { - const document = createDocumentFromString('

Hello World

'); - const out = await defaultTransformDOM({ document }); - strictEqual(out.outerHTML, '

Hello World

'); - }); -}); +import TestUtils from '../TestUtils.js'; -describe('defaultGenerateDocumentPath tests', () => { - it('default paths', async () => { - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com' }), '/index'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/' }), '/index'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index.html' }), '/index'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index' }), '/index'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page' }), '/page'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page.html' }), '/page'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page' }), '/folder/page'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page.html' }), '/folder/page'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page/' }), '/folder/page/index'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page with spaces.html' }), '/folder/page-with-spaces'); - strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/PagE_with_3xtr4_charactére.html' }), '/folder/page-with-3xtr4-charact-re'); - }); -}); +const { createDocumentFromString } = TestUtils; describe('html2x parameters', () => { const URL = 'https://www.sample.com/page.html'; @@ -408,8 +379,8 @@ describe('html2md tests', () => { const out = await html2md('https://www.sample.com/page.html', '', null, { createDocumentFromString, }); - strictEqual(out.html.trim(), ''); - strictEqual(out.md.trim(), '![][image0]\n\n[image0]: ./image.jpg'); + strictEqual(out.html.trim(), ''); + strictEqual(out.md.trim(), '![][image0]\n\n[image0]: https://www.sample.com/image.jpg'); }); it('html2md allows to preprocess the document', async () => { @@ -422,8 +393,8 @@ describe('html2md tests', () => { }, { createDocumentFromString, }); - strictEqual(out.html.trim(), ''); - strictEqual(out.md.trim(), '![][image0]\n\n[image0]: ./image.jpg'); + strictEqual(out.html.trim(), ''); + strictEqual(out.md.trim(), '![][image0]\n\n[image0]: https://www.sample.com/image.jpg'); }); it('html2md removes original hrs but keeps md section breaks', async () => { diff --git a/test/importers/PageImporter.spec.js b/test/importers/PageImporter.spec.js index fd35140..a984373 100644 --- a/test/importers/PageImporter.spec.js +++ b/test/importers/PageImporter.spec.js @@ -237,4 +237,8 @@ describe('PageImporter tests - fixtures', () => { it('import - sub and sup', async () => { await featureTest('subsup'); }); + + it('import - video', async () => { + await featureTest('video'); + }); }); diff --git a/test/importers/defaults/fixtures/adjust-image-urls.expected.html b/test/importers/defaults/fixtures/adjust-image-urls.expected.html new file mode 100644 index 0000000..da597a7 --- /dev/null +++ b/test/importers/defaults/fixtures/adjust-image-urls.expected.html @@ -0,0 +1,7 @@ + +

Hello World

+ + + + + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/adjust-image-urls.input.html b/test/importers/defaults/fixtures/adjust-image-urls.input.html new file mode 100644 index 0000000..6133d9c --- /dev/null +++ b/test/importers/defaults/fixtures/adjust-image-urls.input.html @@ -0,0 +1,10 @@ + + +

Hello World

+ + + + + + + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/background-image.expected.html b/test/importers/defaults/fixtures/background-image.expected.html new file mode 100644 index 0000000..532a5ae --- /dev/null +++ b/test/importers/defaults/fixtures/background-image.expected.html @@ -0,0 +1,13 @@ + +

Hello World

+
+ + some content here +
+
+ +
+
+ +
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/background-image.input.html b/test/importers/defaults/fixtures/background-image.input.html new file mode 100644 index 0000000..fbe28dd --- /dev/null +++ b/test/importers/defaults/fixtures/background-image.input.html @@ -0,0 +1,10 @@ + + +

Hello World

+
+ some content here +
+
+
+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/cleanup.expected.html b/test/importers/defaults/fixtures/cleanup.expected.html new file mode 100644 index 0000000..e87208f --- /dev/null +++ b/test/importers/defaults/fixtures/cleanup.expected.html @@ -0,0 +1,5 @@ + +
+

Hello World

+
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/cleanup.input.html b/test/importers/defaults/fixtures/cleanup.input.html new file mode 100644 index 0000000..d9beebe --- /dev/null +++ b/test/importers/defaults/fixtures/cleanup.input.html @@ -0,0 +1,11 @@ + + +
Top header
+ +
+ +

Hello World

+
+ + + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/default.expected.html b/test/importers/defaults/fixtures/default.expected.html new file mode 100644 index 0000000..51440d2 --- /dev/null +++ b/test/importers/defaults/fixtures/default.expected.html @@ -0,0 +1,4 @@ + +

Hello World

+

Some text with a span, a link anotherpage and a sub.

+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/default.input.html b/test/importers/defaults/fixtures/default.input.html new file mode 100644 index 0000000..36b7056 --- /dev/null +++ b/test/importers/defaults/fixtures/default.input.html @@ -0,0 +1,6 @@ + + +

Hello World

+

Some text with a span, a link anotherpage and a sub.

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/icons.expected.html b/test/importers/defaults/fixtures/icons.expected.html new file mode 100644 index 0000000..90984b6 --- /dev/null +++ b/test/importers/defaults/fixtures/icons.expected.html @@ -0,0 +1,4 @@ + +

Hello World

:icon1: +

This is text with an icon :icon2:

+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/icons.input.html b/test/importers/defaults/fixtures/icons.input.html new file mode 100644 index 0000000..f911f9c --- /dev/null +++ b/test/importers/defaults/fixtures/icons.input.html @@ -0,0 +1,6 @@ + + +

Hello World

+

This is text with an icon

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.all.diff.expected.html b/test/importers/defaults/fixtures/metadata.all.diff.expected.html new file mode 100644 index 0000000..2e25af5 --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.all.diff.expected.html @@ -0,0 +1,40 @@ + +

Hello World

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Metadata
TitlePage title - tite element
DescriptionPage description - description meta
ImageThis is the image alt text - og:image:alt meta
og:titlePage title - og:title meta
og:descriptionPage description - og:description meta
twitter:titlePage title - twitter:title meta
twitter:descriptionPage description - twitter:description meta
twitter:imageThis is the image alt text - twitter:image:alt meta
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.all.diff.input.html b/test/importers/defaults/fixtures/metadata.all.diff.input.html new file mode 100644 index 0000000..4fb8f88 --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.all.diff.input.html @@ -0,0 +1,17 @@ + + + Page title - tite element + + + + + + + + + + + +

Hello World

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.all.same.expected.html b/test/importers/defaults/fixtures/metadata.all.same.expected.html new file mode 100644 index 0000000..e3451a8 --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.all.same.expected.html @@ -0,0 +1,20 @@ + +

Hello World

+ + + + + + + + + + + + + + + + +
Metadata
TitlePage title
DescriptionPage description
ImageThis is the image alt text
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.all.same.input.html b/test/importers/defaults/fixtures/metadata.all.same.input.html new file mode 100644 index 0000000..8821abc --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.all.same.input.html @@ -0,0 +1,17 @@ + + + Page title + + + + + + + + + + + +

Hello World

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.basic.expected.html b/test/importers/defaults/fixtures/metadata.basic.expected.html new file mode 100644 index 0000000..2ef0c76 --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.basic.expected.html @@ -0,0 +1,16 @@ + +

Hello World

+ + + + + + + + + + + + +
Metadata
TitlePage title
DescriptionPage description
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.basic.input.html b/test/importers/defaults/fixtures/metadata.basic.input.html new file mode 100644 index 0000000..dd0179a --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.basic.input.html @@ -0,0 +1,9 @@ + + + Page title + + + +

Hello World

+ + +

Hello World

+ + + + + + + + +
Metadata
ImageThis is the image alt text
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.image.input.html b/test/importers/defaults/fixtures/metadata.image.input.html new file mode 100644 index 0000000..f618bed --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.image.input.html @@ -0,0 +1,9 @@ + + + + + + +

Hello World

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.og.expected.html b/test/importers/defaults/fixtures/metadata.og.expected.html new file mode 100644 index 0000000..5d82b8c --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.og.expected.html @@ -0,0 +1,16 @@ + +

Hello World

+ + + + + + + + + + + + +
Metadata
TitlePage title - og:title meta
DescriptionPage description - og:description meta
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.og.input.html b/test/importers/defaults/fixtures/metadata.og.input.html new file mode 100644 index 0000000..91fb6ea --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.og.input.html @@ -0,0 +1,9 @@ + + + + + + +

Hello World

+ + \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.twitter.expected.html b/test/importers/defaults/fixtures/metadata.twitter.expected.html new file mode 100644 index 0000000..42de23a --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.twitter.expected.html @@ -0,0 +1,16 @@ + +

Hello World

+ + + + + + + + + + + + +
Metadata
TitlePage title - twitter:title meta
DescriptionPage description - twitter:description meta
+ \ No newline at end of file diff --git a/test/importers/defaults/fixtures/metadata.twitter.input.html b/test/importers/defaults/fixtures/metadata.twitter.input.html new file mode 100644 index 0000000..b06c12a --- /dev/null +++ b/test/importers/defaults/fixtures/metadata.twitter.input.html @@ -0,0 +1,9 @@ + + + + + + +

Hello World

+ + \ No newline at end of file diff --git a/test/importers/defaults/generateDocumentPath.spec.js b/test/importers/defaults/generateDocumentPath.spec.js new file mode 100644 index 0000000..02b4625 --- /dev/null +++ b/test/importers/defaults/generateDocumentPath.spec.js @@ -0,0 +1,32 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import { strictEqual } from 'assert'; +import { describe, it } from 'mocha'; + +import defaultGenerateDocumentPath from '../../../src/importer/defaults/generateDocumentPath.js'; + +describe('defaultGenerateDocumentPath tests', () => { + it('default paths', async () => { + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com' }), '/index'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/' }), '/index'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index.html' }), '/index'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index' }), '/index'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page' }), '/page'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page.html' }), '/page'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page' }), '/folder/page'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page.html' }), '/folder/page'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page/' }), '/folder/page/index'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page with spaces.html' }), '/folder/page-with-spaces'); + strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/PagE_with_3xtr4_charactére.html' }), '/folder/page-with-3xtr4-charact-re'); + }); +}); diff --git a/test/importers/defaults/transformDOM.spec.js b/test/importers/defaults/transformDOM.spec.js new file mode 100644 index 0000000..5f77819 --- /dev/null +++ b/test/importers/defaults/transformDOM.spec.js @@ -0,0 +1,94 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +import path from 'path'; +import fs from 'fs-extra'; +import { dirname } from 'dirname-filename-esm'; +import { strictEqual } from 'assert'; +import { describe, it } from 'mocha'; + +import defaultTransformDOM from '../../../src/importer/defaults/transformDOM.js'; +import TestUtils from '../../TestUtils.js'; + +// eslint-disable-next-line no-underscore-dangle +const __dirname = dirname(import.meta); + +const { createDocumentFromString } = TestUtils; + +describe('defaultTransformDOM tests', () => { + const runTest = async (feature, config) => { + const spec = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.input.html`), 'utf-8'); + const document = createDocumentFromString(spec); + const out = await defaultTransformDOM({ document, ...config }); + const expected = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.expected.html`), 'utf-8'); + strictEqual(out.outerHTML.replace(/(?:\r\n|\r|\n|\s\s)/g, ''), expected.replace(/(?:\r\n|\r|\n|\s\s)/g, '')); + }; + + it('default transformation', async () => { + await runTest('default'); + }); + + it('default transformation handles basic metadata', async () => { + await runTest('metadata.basic'); + }); + + it('default transformation handles img and alt metadata', async () => { + await runTest('metadata.image', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation handles identical metadata', async () => { + await runTest('metadata.all.same', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation handles different metadata', async () => { + await runTest('metadata.all.diff', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation handles falls back to og metadata', async () => { + await runTest('metadata.og', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation handles falls back to twitter metadata', async () => { + await runTest('metadata.twitter', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation removes non content elements', async () => { + await runTest('cleanup'); + }); + + it('default transformation adjusts image urls', async () => { + await runTest('adjust-image-urls', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation converts icons', async () => { + await runTest('icons', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); + + it('default transformation converts background-image styles into image element', async () => { + await runTest('background-image', { + url: 'https://wwww.sample.com/path/page.html', + }); + }); +}); diff --git a/test/importers/fixtures/video.spec.html b/test/importers/fixtures/video.spec.html new file mode 100644 index 0000000..2725c99 --- /dev/null +++ b/test/importers/fixtures/video.spec.html @@ -0,0 +1,11 @@ + + +

videos

+

video link

+ + + \ No newline at end of file diff --git a/test/importers/fixtures/video.spec.md b/test/importers/fixtures/video.spec.md new file mode 100644 index 0000000..bc34419 --- /dev/null +++ b/test/importers/fixtures/video.spec.md @@ -0,0 +1,7 @@ +# videos + +[video link](https://www.sample.com/video.mp4) + +[![][image0]](https://www.sample.com/video.mp4) + +[image0]: ./poster.png \ No newline at end of file diff --git a/test/utils/DOMUtils.spec.js b/test/utils/DOMUtils.spec.js index 6ebb38a..b7d1965 100644 --- a/test/utils/DOMUtils.spec.js +++ b/test/utils/DOMUtils.spec.js @@ -408,15 +408,15 @@ describe('DOMUtils#getImgFromBackground', () => { it('no background-image style', () => { test(createElement('p', {}, {}, 'Some content'), null); - test(createElement('img', { src: 'https://www.server.com/image.jpg', title: 'Some title' }, {}, ''), null); - test(createElement('p', {}, { 'background-image': 'none' }, 'Some content'), null); }); it('with background-image style', () => { test(createElement('p', {}, { 'background-image': 'url(https://www.server.com/image.jpg)' }, 'Some content'), ''); - test(createElement('div', { class: 'someclass' }, { 'background-image': 'url("https://www.server.com/image.jpg")', background: 'rgb(0, 0, 0) none repeat scroll 0% 0% / auto padding-box border-box' }, '
Some divs
More divs
'), ''); + test(createElement('p', {}, { 'background-image': 'url("https://www.server.com/image.jpg")' }, 'Some content'), ''); + test(createElement('p', {}, { 'background-image': 'url(\'https://www.server.com/image.jpg\')' }, 'Some content'), ''); + test(createElement('p', {}, { 'background-image': 'url(http://localhost:3001/image.jpg)' }, 'Some content'), ''); }); // `createElement` uses JSDOM to create the test-DOM @@ -426,6 +426,5 @@ describe('DOMUtils#getImgFromBackground', () => { // disabling the test, keeping it as a reference xit('with background-image style containing whitespace in url()', () => { test(createElement('p', {}, { 'background-image': 'url( /image.jpg )' }, 'Some content'), ''); - test(createElement('div', { class: 'someclass' }, { 'background-image': 'url( https://www.server.com/image.jpg )', background: 'rgb(0, 0, 0) none repeat scroll 0% 0% / auto padding-box border-box' }, '
Some divs
More divs
'), ''); }); });