Skip to content

Commit

Permalink
feat: improve default import (#261)
Browse files Browse the repository at this point in the history
  • Loading branch information
kptdobe authored Nov 13, 2023
1 parent fdd42b8 commit 251cfcd
Show file tree
Hide file tree
Showing 41 changed files with 774 additions and 64 deletions.
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 2 additions & 21 deletions src/importer/HTML2x.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import PageImporterResource from './PageImporterResource.js';
import MemoryHandler from '../storage/MemoryHandler.js';
import Utils from '../utils/Utils.js';
import BrowserUtils from '../utils/BrowserUtils.js';
import defaultTransformDOM from './defaults/transformDOM.js';
import defaultGenerateDocumentPath from './defaults/generateDocumentPath.js';

// import docxStylesXML from '../resources/styles.xml';

Expand All @@ -35,27 +37,6 @@ function setBackgroundImagesFromCSS(document) {
}
}

/**
 * Fallback DOM transformation: performs no clean-up and simply hands back
 * the body of the document it was given.
 *
 * @param {object} context import context; only `document` is read
 * @param {Document} context.document document being imported
 * @returns {Promise<Element>} resolves to `document.body`
 */
async function defaultTransformDOM({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  const { body } = document;
  return body;
}

/**
 * Derives the default document path from the URL of the imported page.
 *
 * The URL pathname is taken, `index` is appended when it ends with a slash,
 * percent-escapes are decoded, the result is lower-cased, a trailing `.html`
 * is dropped and every character outside `a-z`, `0-9` and `/` becomes `-`.
 *
 * @param {object} context import context; only `url` is read
 * @param {string} context.url absolute URL of the page being imported
 * @returns {Promise<string>} the sanitized document path
 * @throws {TypeError} if `url` is not a valid absolute URL
 */
async function defaultGenerateDocumentPath({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  let p = new URL(url).pathname;
  if (p.endsWith('/')) {
    p = `${p}index`;
  }

  let decoded;
  try {
    decoded = decodeURIComponent(p);
  } catch (e) {
    if (!(e instanceof URIError)) {
      throw e;
    }
    // Malformed or non-UTF-8 escapes (e.g. "%E9", "%zz") cannot be decoded.
    // Keep the raw pathname instead of failing; the stray "%" characters are
    // turned into "-" by the sanitizing step below.
    decoded = p;
  }

  return decoded
    .toLowerCase()
    .replace(/\.html$/, '')
    .replace(/[^a-z0-9/]/g, '-');
}

async function html2x(
url,
doc,
Expand Down
24 changes: 24 additions & 0 deletions src/importer/defaults/generateDocumentPath.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
/**
 * Default rule computing the path under which an imported page is stored.
 *
 * Steps: read the pathname of `url`; append `index` if it ends with `/`;
 * decode percent-escapes; lower-case; strip a trailing `.html`; replace any
 * character that is not `a-z`, `0-9` or `/` with `-`.
 *
 * @param {object} context import context; only `url` is read
 * @param {string} context.url absolute URL of the page being imported
 * @returns {Promise<string>} the sanitized document path
 * @throws {TypeError} if `url` is not a valid absolute URL
 */
export default async function generateDocumentPath({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  let p = new URL(url).pathname;
  if (p.endsWith('/')) {
    p = `${p}index`;
  }

  let decoded = p;
  try {
    decoded = decodeURIComponent(p);
  } catch (e) {
    // decodeURIComponent rejects malformed or non-UTF-8 escapes such as
    // "%E9" or "%zz" with a URIError. In that case the raw pathname is kept
    // (its "%" characters become "-" below) so one odd URL does not abort
    // the import. Anything else is unexpected and is rethrown.
    if (!(e instanceof URIError)) {
      throw e;
    }
  }

  return decoded
    .toLowerCase()
    .replace(/\.html$/, '')
    .replace(/[^a-z0-9/]/g, '-');
}
28 changes: 28 additions & 0 deletions src/importer/defaults/rules/adjustImageUrls.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

/**
 * Turns relative image sources into absolute URLs.
 *
 * Only `src` attributes starting with `./`, `/` or `../` are touched; they
 * are resolved against `url`. If resolution fails, a message is logged and
 * the image is removed from the DOM.
 *
 * @param {Element} main root element whose `img` descendants are processed
 * @param {string} url base URL used to resolve relative sources
 */
export default function adjustImageUrls(main, url) {
  const relativePrefixes = ['./', '/', '../'];
  const isRelative = (src) => relativePrefixes.some((prefix) => src.startsWith(prefix));

  const makeAbsolute = (img) => {
    const src = img.getAttribute('src');
    if (!src || !isRelative(src)) {
      return;
    }
    try {
      // eslint-disable-next-line no-param-reassign
      img.src = new URL(src, url).toString();
    } catch (e) {
      // eslint-disable-next-line no-console
      console.log(`Unable to adjust image URL ${img.src} - removing image`);
      img.remove();
    }
  };

  // iterate over a snapshot of the matches, since images may be removed
  [...main.querySelectorAll('img')].forEach((img) => makeAbsolute(img));
}
25 changes: 25 additions & 0 deletions src/importer/defaults/rules/convertIcons.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

/**
 * Replaces SVG images with `:name:` icon placeholders.
 *
 * For every `img` under `main` whose `src` ends with `.svg`, the icon name is
 * the part of the file name before its first dot, lower-cased, trimmed, with
 * every character outside `a-z0-9` replaced by `-`. Images that yield an
 * empty name are left in place.
 *
 * @param {Element} main root element whose `img` descendants are processed
 * @param {Document} document document used to create the replacement spans
 */
export default function convertIcons(main, document) {
  const toIconName = (src) => {
    const fileName = src.split('/').pop();
    const [baseName] = fileName.split('.');
    return baseName.toLowerCase().trim().replace(/[^a-z0-9]/g, '-');
  };

  [...main.querySelectorAll('img')].forEach((img) => {
    const src = img.getAttribute('src');
    if (!src || !src.endsWith('.svg')) {
      return;
    }
    const span = document.createElement('span');
    const name = toIconName(src);
    if (name) {
      span.innerHTML = `:${name}:`;
      img.replaceWith(span);
    }
  });
}
102 changes: 102 additions & 0 deletions src/importer/defaults/rules/createMetadata.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import Blocks from '../../../utils/Blocks.js';

/**
 * Reads the content of the `<meta>` tags in the document head matching `name`.
 *
 * Names without a colon are looked up through the `name` attribute. Names
 * containing a colon (e.g. `og:title`, `twitter:title`) are looked up through
 * the `property` attribute first; if that yields nothing, the `name`
 * attribute is tried as well, because such tags are commonly authored as
 * `<meta name="twitter:title" ...>`.
 *
 * @param {string} name metadata name to look up
 * @param {Document} document document whose head is searched
 * @returns {string} matching contents joined with `', '`, or `''` if none
 */
function getMetadata(name, document) {
  const readAll = (attr) => [...document.head.querySelectorAll(`meta[${attr}="${name}"]`)]
    .map((m) => m.content)
    .join(', ');

  if (!name || !name.includes(':')) {
    return readAll('name');
  }
  // the name-based lookup is only a fallback so that pages which already
  // resolved through `property` keep producing exactly the same value
  return readAll('property') || readAll('name');
}

/**
 * Collects page metadata and appends it to `main` as a metadata block.
 *
 * Collected entries, in this order when present:
 * - `Title` from the `<title>` element (newlines and tabs removed)
 * - `Description` from the `description` meta tag
 * - `Image` from `og:image` (with `og:image:alt` as alt text)
 * - `og:title`, `og:description`, `twitter:title`, `twitter:description`:
 *   each fills `Title` / `Description` if that is still empty, otherwise it
 *   is kept under its own key, but only when it differs from the main value
 * - `twitter:image` (with `twitter:image:alt`) when it differs from `og:image`
 *
 * The block is only created and appended when at least one entry was found.
 *
 * @param {Element} main element the metadata block is appended to
 * @param {Document} document document the metadata is read from
 * @returns {object} the collected metadata
 */
export default function createMetadata(main, document) {
  const meta = {};

  // Builds an <img> for `src`, with alt text taken from the `altName` meta tag if set.
  const buildImage = (src, altName) => {
    const el = document.createElement('img');
    el.src = src;
    const alt = getMetadata(altName, document);
    if (alt) {
      el.alt = alt;
    }
    return el;
  };

  // Uses the value of meta tag `name` as `primaryKey` when that is still empty;
  // otherwise records it under `name` itself, unless it merely repeats the primary value.
  const mergeAlternate = (primaryKey, name) => {
    const value = getMetadata(name, document);
    if (!value || value === meta[primaryKey]) {
      return;
    }
    if (meta[primaryKey]) {
      meta[name] = value;
    } else {
      meta[primaryKey] = value;
    }
  };

  const title = document.querySelector('title');
  if (title) {
    meta.Title = title.textContent.replace(/[\n\t]/gm, '');
  }

  const desc = getMetadata('description', document);
  if (desc) {
    meta.Description = desc;
  }

  const img = getMetadata('og:image', document);
  if (img) {
    meta.Image = buildImage(img, 'og:image:alt');
  }

  // order matters: it determines the order of the keys in `meta`
  [
    ['Title', 'og:title'],
    ['Description', 'og:description'],
    ['Title', 'twitter:title'],
    ['Description', 'twitter:description'],
  ].forEach(([primaryKey, name]) => mergeAlternate(primaryKey, name));

  const timg = getMetadata('twitter:image', document);
  if (timg && timg !== img) {
    meta['twitter:image'] = buildImage(timg, 'twitter:image:alt');
  }

  if (Object.keys(meta).length > 0) {
    main.append(Blocks.getMetadataBlock(document, meta));
  }

  return meta;
}
21 changes: 21 additions & 0 deletions src/importer/defaults/rules/transformBackgroundImages.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import DOMUtils from '../../../utils/DOMUtils.js';

/**
 * Converts inline CSS background images into real `<img>` elements.
 *
 * Every element under `main` whose `style` attribute contains
 * `background-image: url` gets the image built by
 * `DOMUtils.getImgFromBackground` prepended as its first child, and the
 * `background-image` property is then removed from its inline style.
 *
 * @param {Element} main root element whose descendants are processed
 * @param {Document} document document passed on to create the `<img>` elements
 */
export default function transformBackgroundImages(main, document) {
  [...main.querySelectorAll('[style*="background-image: url"]')].forEach((element) => {
    const img = DOMUtils.getImgFromBackground(element, document);
    // NOTE(review): getImgFromBackground presumably yields no image when it cannot
    // extract a usable URL - confirm. prepend() stringifies non-Node arguments, so an
    // unguarded call would insert the literal text "null"/"undefined" into the content.
    if (img) {
      element.prepend(img);
    }
    element.style.removeProperty('background-image');
  });
}
42 changes: 42 additions & 0 deletions src/importer/defaults/transformDOM.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import DOMUtils from '../../utils/DOMUtils.js';
import createMetadata from './rules/createMetadata.js';
import adjustImageUrls from './rules/adjustImageUrls.js';
import convertIcons from './rules/convertIcons.js';
import transformBackgroundImages from './rules/transformBackgroundImages.js';

/**
 * Default DOM transformation applied to an imported page.
 *
 * Works directly on `document.body`: removes elements that are usually not
 * content, then runs the default rules in a fixed order (metadata block,
 * background images, image URLs, icons) and returns the body.
 *
 * @param {object} context import context; `url` and `document` are read
 * @param {string} context.url URL of the page, used to resolve image sources
 * @param {Document} context.document document being imported (modified in place)
 * @returns {Promise<Element>} resolves to the transformed `document.body`
 */
export default async function transformDOM({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  const { body } = document;

  // attempt to remove non-content elements
  const nonContentSelectors = [
    'header', '.header',
    'nav', '.nav',
    'footer', '.footer',
    'iframe',
    'noscript',
  ];
  DOMUtils.remove(body, nonContentSelectors);

  createMetadata(body, document);
  transformBackgroundImages(body, document);
  adjustImageUrls(body, url);
  convertIcons(body, document);

  return body;
}
13 changes: 13 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ import WPUtils from './wp/WPUtils.js';

import { html2md, html2docx } from './importer/HTML2x.js';

import createMetadata from './importer/defaults/rules/createMetadata.js';
import adjustImageUrls from './importer/defaults/rules/adjustImageUrls.js';
import convertIcons from './importer/defaults/rules/convertIcons.js';
import transformBackgroundImages from './importer/defaults/rules/transformBackgroundImages.js';

// Default import rules imported above, grouped into a single object that is
// exported below as `rules`.
const rules = {
createMetadata,
adjustImageUrls,
convertIcons,
transformBackgroundImages,
};

export {
PageImporter,
PageImporterParams,
Expand All @@ -42,4 +54,5 @@ export {
WPUtils,
html2md,
html2docx,
rules,
};
6 changes: 4 additions & 2 deletions src/utils/DOMUtils.js
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,9 @@ export default class DOMUtils {
const styleAttr = element?.getAttribute('style')?.split(';');
if (styleAttr) {
styleAttr.forEach((style) => {
const [prop, value] = style.split(':');
const split = style.split(':');
const prop = split.shift();
const value = split.join(':').trim();
if (prop === 'background-image') {
const trimmedValue = value.replace(/\s/g, '');
const elStyle = element.style;
Expand All @@ -267,7 +269,7 @@ export default class DOMUtils {
});
const url = element.style.backgroundImage;
if (url && url.toLowerCase() !== 'none') {
const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/\)/gm, '');
const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/"/gm, '').replace(/\)/gm, '');
const img = document.createElement('img');
img.src = src;
return img;
Expand Down
21 changes: 21 additions & 0 deletions test/TestUtils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { JSDOM } from 'jsdom';

/**
 * Helpers shared by the tests.
 */
export default class TestUtils {
  /**
   * Test-environment version of createDocumentFromString, backed by JSDOM.
   *
   * @param {string} html markup to parse
   * @returns {Document} the document of the JSDOM window created for `html`
   */
  static createDocumentFromString(html) {
    const dom = new JSDOM(html, { runScripts: undefined });
    return dom.window.document;
  }
}
Loading

0 comments on commit 251cfcd

Please sign in to comment.