Skip to content

Commit

Permalink
feat: move string utility methods from onedrive-support (#1033)
Browse files Browse the repository at this point in the history
  • Loading branch information
dominique-pfister authored Dec 7, 2024
1 parent 1c979b8 commit 334fdaf
Show file tree
Hide file tree
Showing 2 changed files with 237 additions and 1 deletion.
98 changes: 98 additions & 0 deletions packages/helix-shared-string/src/string.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,101 @@ export function multiline(str) {
.map((l) => l.slice(prefixLen)) // discard prefixes
.join('\n');
}

/**
 * Separates a file name into its base name and its extension, using the last '.'
 * in the name as separator.
 *
 * The extension is returned in lower case and without the dot. The name is returned
 * unchanged, paired with an empty extension, when it contains no dot, when its only
 * dot is the first character (e.g. '.foo') or when it ends with a dot (e.g. 'foo.').
 *
 * @param {string} name Filename
 * @returns {string[]} A two element array: `[baseName, extension]`.
 */
export function splitByExtension(name) {
  const dotPos = name.lastIndexOf('.');
  // a missing dot, a dot at the very start or a dot at the very end yields no extension
  if (dotPos <= 0 || dotPos >= name.length - 1) {
    return [name, ''];
  }
  return [name.slice(0, dotPos), name.slice(dotPos + 1).toLowerCase()];
}

/**
 * Turns an arbitrary string into a lower case name that consists only of the
 * characters `a-z`, `0-9` and single dashes. The steps are:
 * - convert to lower case
 * - decompose unicode characters (NFD) and drop the combining diacritical marks
 * - replace every run of non-alphanumeric characters with one dash
 * - strip the dash at the start and at the end, if present
 *
 * @param {string} name
 * @returns {string} sanitized name
 */
export function sanitizeName(name) {
  // NFD splits e.g. 'ö' into 'o' + U+0308, so that the mark can be removed on its own
  const decomposed = name.toLowerCase().normalize('NFD');
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '');
  // runs are collapsed here, hence at most one dash can remain at either end
  const dashed = withoutMarks.replace(/[^a-z0-9]+/g, '-');
  return dashed.replace(/^-|-$/g, '');
}

/**
 * Sanitizes the last segment of a file path with {@link sanitizeName}, i.e.:
 * - convert to lower case
 * - normalize all unicode characters
 * - replace all non-alphanumeric characters with a dash
 * - remove all consecutive dashes
 * - remove all leading and trailing dashes
 *
 * Only the base name of the last segment is rewritten. Everything up to and including
 * the last '/' is kept verbatim, and so is the extension (the text starting at the last
 * '.' of the last segment, in its original case). Note that a last segment whose only
 * dot is its first character is treated as extension in its entirety.
 *
 * @param {string} filepath the file path
 * @param {object} opts Options
 * @param {boolean} [opts.ignoreExtension] if `true`, no extension is split off and the
 *        complete last segment is sanitized
 * @returns {string} sanitized file path
 */
export function sanitizePath(filepath, opts = {}) {
  const nameStart = filepath.lastIndexOf('/') + 1;
  const dotPos = opts.ignoreExtension ? -1 : filepath.lastIndexOf('.');
  const parent = filepath.substring(0, nameStart);

  // no dot at all, or the last dot belongs to an ancestor segment: there is no extension
  if (dotPos < nameStart) {
    return `${parent}${sanitizeName(filepath.substring(nameStart))}`;
  }
  const stem = filepath.substring(nameStart, dotPos);
  const extension = filepath.substring(dotPos);
  return `${parent}${sanitizeName(stem)}${extension}`;
}

/**
 * Computes the edit (Levenshtein) distance between two strings, i.e. the minimal number
 * of single character insertions, removals and replacements needed to turn one string
 * into the other.
 *
 * The distance is computed iteratively, row by row, keeping only the previous and the
 * current row of the distance matrix. Time is proportional to `s0.length * s1.length`,
 * memory to the length of the shorter string. Characters are compared as UTF-16 code
 * units (`String.prototype.charAt`).
 *
 * @param {string} s0 Input string
 * @param {string} s1 Input string
 * @returns {number} the edit distance between `s0` and `s1`
 */
export function editDistance(s0, s1) {
  // the distance is symmetric: walk the longer string row by row, so that the rows
  // kept in memory are sized by the shorter one
  const [longer, shorter] = s0.length < s1.length ? [s1, s0] : [s0, s1];
  const rows = longer.length;
  const cols = shorter.length;

  // first row: turning the empty prefix of `longer` into a prefix of `shorter`
  let prev = Array.from({ length: cols + 1 }, (_, c) => c);

  for (let i = 1; i <= rows; i += 1) {
    // first column: turning a prefix of `longer` into the empty string
    const curr = [i];
    const ch = longer.charAt(i - 1);
    for (let j = 1; j <= cols; j += 1) {
      const replaceCost = ch === shorter.charAt(j - 1) ? 0 : 1;
      curr[j] = Math.min(
        prev[j] + 1, // insert
        curr[j - 1] + 1, // remove
        prev[j - 1] + replaceCost,
      );
    }
    prev = curr;
  }
  return prev[cols];
}
140 changes: 139 additions & 1 deletion packages/helix-shared-string/test/string.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
/* eslint-env mocha */

import assert from 'assert';
import { multiline } from '../src/string.js';
import {
multiline, editDistance, sanitizeName, sanitizePath, splitByExtension,
} from '../src/string.js';

describe('String tests', () => {
it('multiline()', () => {
Expand All @@ -39,3 +41,139 @@ describe('String tests', () => {
`);
});
});

// Verifies how splitByExtension() separates base name and extension.
// Assertions pass the computed value first and the expected value second, matching
// the (actual, expected) signature of `assert`, so that failure reports are labeled
// correctly.
describe('splitByExtension Tests', () => {
  it('extension split works for empty string', () => {
    assert.deepStrictEqual(splitByExtension(''), ['', '']);
  });

  it('extension split works for string w/o extension', () => {
    assert.deepStrictEqual(splitByExtension('foo'), ['foo', '']);
  });

  it('extension split works for string with extension', () => {
    assert.deepStrictEqual(splitByExtension('foo.txt'), ['foo', 'txt']);
  });

  it('extension split works for string with dots and extension', () => {
    assert.deepStrictEqual(splitByExtension('foo.bar.txt'), ['foo.bar', 'txt']);
  });

  it('extension split works for string ending with a dot', () => {
    // a trailing dot does not start an extension
    assert.deepStrictEqual(splitByExtension('foo.'), ['foo.', '']);
  });

  it('extension split works for string starting with a dot', () => {
    // a leading dot does not start an extension either
    assert.deepStrictEqual(splitByExtension('.foo'), ['.foo', '']);
  });
});

// Verifies sanitizeName(): one generated test per table row.
describe('sanitize Tests', () => {
  // [test title, input, expected sanitized name]
  const cases = [
    ['sanitize works for empty string', '', ''],
    ['sanitize transform string to lower case', 'MyDocument', 'mydocument'],
    ['sanitize transforms non-alpha to dashes', 'My 2. Document', 'my-2-document'],
    ['sanitize removes leading dashes', '.My 2. Document', 'my-2-document'],
    ['sanitize removes trailing dashes', '.My 2. Document-', 'my-2-document'],
    ['sanitize normalizes unicode', 'Föhren Smürd', 'fohren-smurd'],
  ];

  cases.forEach(([title, input, expected]) => {
    it(title, () => {
      assert.strictEqual(sanitizeName(input), expected);
    });
  });
});

// Verifies editDistance() on short names and bounds its run time on long ones.
// Assertions pass the computed value first and the expected value second, matching
// the (actual, expected) signature of `assert`, so that failure reports are labeled
// correctly.
describe('editDistance Tests', () => {
  it('editDistances works for empty strings', () => {
    assert.strictEqual(editDistance('', ''), 0);
  });

  it('editDistances works for equal strings', () => {
    assert.strictEqual(editDistance('foo', 'foo'), 0);
  });

  it('editDistances works for appended characters', () => {
    assert.strictEqual(editDistance('foo', 'foo123'), 3);
  });

  it('editDistances works for removed characters from the end', () => {
    assert.strictEqual(editDistance('foo123', 'foo'), 3);
  });

  it('editDistances works for replaced characters', () => {
    assert.strictEqual(editDistance('My Document', 'my-document'), 3);
  });

  it('editDistances works for more complicate replacements', () => {
    assert.strictEqual(editDistance('My 1. Document', 'my-1-document'), 5);
  });

  it('editDistances works for more complicate replacements (2)', () => {
    assert.strictEqual(editDistance('my-1-document', 'My 1. Document.docx'), 10);
  });

  it('editDistances is reasonably fast for long names)', () => {
    // wall clock guard: the computation for two long names has to stay below 100ms
    const t0 = Date.now();
    assert.strictEqual(editDistance(
      'my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document ',
      'My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document .docx',
    ), 66);
    const t1 = Date.now();
    assert.ok(t1 - t0 < 100);
  });
});

// Verifies sanitizePath(): one generated test per table row.
describe('sanitizePath Tests', () => {
  // [test title, arguments passed to sanitizePath(), expected sanitized path]
  const cases = [
    ['sanitizePath works for empty string', [''], ''],
    ['sanitizePath transform string to lower case', ['MyDocument'], 'mydocument'],
    ['sanitizePath can ignore extension', ['.MyDocument', { ignoreExtension: true }], 'mydocument'],
    ['sanitizePath works with dots in path and no extension', ['/foo.bar/My Document'], '/foo.bar/my-document'],
    ['sanitizePath only transforms last path segment', ['/Untitled Folder/MyDocument'], '/Untitled Folder/mydocument'],
    ['sanitizePath only transforms root segment', ['/MyDocument'], '/mydocument'],
    ['sanitizePath transforms non-alpha to dashes', ['My 2. Document.docx'], 'my-2-document.docx'],
    ['sanitizePath removes leading dashes', ['.My 2. Document.docx'], 'my-2-document.docx'],
    ['sanitizePath removes trailing dashes', ['.My 2. Document!.docx'], 'my-2-document.docx'],
    ['sanitizePath normalizes unicode', ['Föhren Smürd'], 'fohren-smurd'],
  ];

  cases.forEach(([title, args, expected]) => {
    it(title, () => {
      assert.strictEqual(sanitizePath(...args), expected);
    });
  });
});

0 comments on commit 334fdaf

Please sign in to comment.