Skip to content

Commit

Permalink
i18n: upgrade to latest icu formatter (#13834)
Browse files Browse the repository at this point in the history
  • Loading branch information
connorjclark authored Oct 5, 2023
1 parent 7782592 commit 3c57e70
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 74 deletions.
50 changes: 25 additions & 25 deletions core/scripts/i18n/collect-strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import {pathToFileURL} from 'url';
import glob from 'glob';
import {expect} from 'expect';
import tsc from 'typescript';
import MessageParser from 'intl-messageformat-parser';
import MessageParser from '@formatjs/icu-messageformat-parser';
import esMain from 'es-main';
import isDeepEqual from 'lodash/isEqual.js';

Expand All @@ -24,6 +24,7 @@ import {pruneObsoleteLhlMessages} from './prune-obsolete-lhl-messages.js';
import {countTranslatedMessages} from './count-translated.js';
import {LH_ROOT} from '../../../shared/root.js';
import {resolveModulePath} from '../../../shared/esm-utils.js';
import {escapeIcuMessage} from '../../../shared/localization/format.js';

// Match declarations of UIStrings, terminating in either a `};\n` (very likely to always be right)
// or `}\n\n` (allowing semicolon to be optional, but insisting on a double newline so that an
Expand Down Expand Up @@ -189,36 +190,35 @@ function convertMessageToCtc(lhlMessage, examples = {}) {
* @param {string} lhlMessage
*/
function _lhlValidityChecks(lhlMessage) {
let parsedMessage;
let parsedMessageElements;
try {
parsedMessage = MessageParser.parse(lhlMessage);
parsedMessageElements = MessageParser.parse(escapeIcuMessage(lhlMessage), {ignoreTag: true});
} catch (err) {
if (err.name !== 'SyntaxError') throw err;
// Improve the intl-messageformat-parser syntax error output.
/** @type {Array<{text: string}>} */
const expected = err.expected;
const expectedStr = expected.map(exp => `'${exp.text}'`).join(', ');
throw new Error(`Did not find the expected syntax (one of ${expectedStr}) in message "${lhlMessage}"`);
throw new Error(`[${err.message}] Did not find the expected syntax in message: ${err.originalMessage}`);
}

for (const element of parsedMessage.elements) {
if (element.type !== 'argumentElement' || !element.format) continue;

if (element.format.type === 'pluralFormat' || element.format.type === 'selectFormat') {
// `plural`/`select` arguments can't have content before or after them.
// See http://userguide.icu-project.org/formatparse/messages#TOC-Complex-Argument-Types
// e.g. https://github.com/GoogleChrome/lighthouse/pull/11068#discussion_r451682796
if (parsedMessage.elements.length > 1) {
throw new Error(`Content cannot appear outside plural or select ICU messages. Instead, repeat that content in each option (message: '${lhlMessage}')`);
}

// Each option value must also be a valid lhlMessage.
for (const option of element.format.options) {
const optionStr = lhlMessage.slice(option.value.location.start.offset, option.value.location.end.offset);
_lhlValidityChecks(optionStr);
/**
* @param {MessageParser.MessageFormatElement[]} elements
*/
function validate(elements) {
for (const element of elements) {
if (element.type === MessageParser.TYPE.plural || element.type === MessageParser.TYPE.select) {
// `plural`/`select` arguments can't have content before or after them.
// See http://userguide.icu-project.org/formatparse/messages#TOC-Complex-Argument-Types
// e.g. https://github.com/GoogleChrome/lighthouse/pull/11068#discussion_r451682796
if (elements.length > 1) {
throw new Error(`Content cannot appear outside plural or select ICU messages. Instead, repeat that content in each option (message: '${lhlMessage}')`);
}

for (const option of Object.values(element.options)) {
validate(option.value);
}
}
}
}

validate(parsedMessageElements);
}

/**
Expand Down Expand Up @@ -389,7 +389,7 @@ function _processPlaceholderDirectIcu(icu, examples) {
for (const [key, value] of Object.entries(examples)) {
// Make sure all examples have ICU vars
if (!icu.message.includes(`{${key}}`)) {
throw Error(`Example '${key}' provided, but has not corresponding ICU replacement in message "${icu.message}"`);
throw Error(`Example '${key}' provided, but has no corresponding ICU replacement in message "${icu.message}"`);
}
const eName = `ICU_${idx++}`;
tempMessage = tempMessage.replace(`{${key}}`, `$${eName}$`);
Expand Down Expand Up @@ -517,7 +517,7 @@ function parseUIStrings(sourceStr, liveUIStrings) {
const key = getIdentifier(property);

// Use live message to avoid having to e.g. concat strings broken into parts.
const message = liveUIStrings[key];
const message = (liveUIStrings[key]);

// @ts-expect-error - Not part of the public tsc interface yet.
const jsDocComments = tsc.getJSDocCommentsAndTags(property);
Expand Down
14 changes: 8 additions & 6 deletions core/scripts/i18n/prune-obsolete-lhl-messages.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import fs from 'fs';
import path from 'path';

import glob from 'glob';
import MessageParser from 'intl-messageformat-parser';
import MessageParser from '@formatjs/icu-messageformat-parser';

import {collectAllCustomElementsFromICU} from '../../../shared/localization/format.js';
import {collectAllCustomElementsFromICU, escapeIcuMessage} from '../../../shared/localization/format.js';
import {LH_ROOT} from '../../../shared/root.js';
import {readJson} from '../../test/test-utils.js';

Expand All @@ -24,8 +24,10 @@ import {readJson} from '../../test/test-utils.js';
* @return {boolean}
*/
function equalArguments(goldenArgumentIds, lhlMessage) {
const parsedMessage = MessageParser.parse(lhlMessage);
const lhlArgumentElements = collectAllCustomElementsFromICU(parsedMessage.elements);
const parsedMessageElements = MessageParser.parse(escapeIcuMessage(lhlMessage), {
ignoreTag: true,
});
const lhlArgumentElements = collectAllCustomElementsFromICU(parsedMessageElements);
const lhlArgumentIds = [...lhlArgumentElements.keys()];

if (goldenArgumentIds.length !== lhlArgumentIds.length) return false;
Expand Down Expand Up @@ -96,8 +98,8 @@ function getGoldenLocaleArgumentIds(goldenLhl) {
const goldenLocaleArgumentIds = {};

for (const [messageId, {message}] of Object.entries(goldenLhl)) {
const parsedMessage = MessageParser.parse(message);
const goldenArgumentElements = collectAllCustomElementsFromICU(parsedMessage.elements);
const parsedMessageElements = MessageParser.parse(escapeIcuMessage(message), {ignoreTag: true});
const goldenArgumentElements = collectAllCustomElementsFromICU(parsedMessageElements);
const goldenArgumentIds = [...goldenArgumentElements.keys()].sort();

goldenLocaleArgumentIds[messageId] = goldenArgumentIds;
Expand Down
8 changes: 4 additions & 4 deletions core/test/scripts/i18n/collect-strings-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ describe('#_lhlValidityChecks', () => {
it('errors when using non-supported custom-formatted ICU format', () => {
const message = 'Hello World took {var, badFormat, milliseconds}.';
expect(() => collect.convertMessageToCtc(message)).toThrow(
/Did not find the expected syntax \(one of 'number', 'date', 'time', 'plural', 'selectordinal', 'select'\) in message "Hello World took {var, badFormat, milliseconds}."$/);
/\[INVALID_ARGUMENT_TYPE\] Did not find the expected syntax in message: Hello World took {var, badFormat, milliseconds}.$/);
});

it('errors when there is content outside of a plural argument', () => {
Expand Down Expand Up @@ -370,14 +370,14 @@ describe('#_lhlValidityChecks', () => {
/Content cannot appear outside plural or select ICU messages.*=1 {1 request} other {# requests}}'\)$/);
});

it('errors when there is content outside of nested plural aguments', () => {
it('errors when there is content outside of nested plural arguments', () => {
const message = `{user_gender, select,
female {Ms. {name} received {count, plural, =1 {one award.} other {# awards.}}}
male {Mr. {name} received {count, plural, =1 {one award.} other {# awards.}}}
other {{name} received {count, plural, =1 {one award.} other {# awards.}}}
}`;
expect(() => collect.convertMessageToCtc(message, {name: 'Elbert'})).toThrow(
/Content cannot appear outside plural or select ICU messages.*\(message: 'Ms. {name} received {count, plural, =1 {one award.} other {# awards.}}'\)$/);
/Content cannot appear outside plural or select ICU messages.*\(message: '{user_gender, select/);
});
/* eslint-enable max-len */
});
Expand Down Expand Up @@ -562,7 +562,7 @@ describe('Convert Message to Placeholder', () => {
const message = 'Hello name.';
expect(() => collect.convertMessageToCtc(message, {name: 'Mary'}))
// eslint-disable-next-line max-len
.toThrow(/Example 'name' provided, but has not corresponding ICU replacement in message "Hello name."/);
.toThrow(/Example 'name' provided, but has no corresponding ICU replacement in message "Hello name."/);
});

it('errors when direct ICU has no examples', () => {
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
},
"devDependencies": {
"@build-tracker/cli": "^1.0.0-beta.15",
"@formatjs/icu-messageformat-parser": "^2.6.2",
"@esbuild-kit/esm-loader": "^2.1.1",
"@esbuild-plugins/node-modules-polyfill": "^0.1.4",
"@jest/fake-timers": "^28.1.0",
Expand Down Expand Up @@ -152,7 +153,6 @@
"gh-pages": "^2.0.1",
"glob": "^7.1.3",
"idb-keyval": "2.2.0",
"intl-messageformat-parser": "^1.8.1",
"jest-mock": "^27.3.0",
"jest-snapshot": "^28.1.0",
"jsdom": "^12.2.0",
Expand Down Expand Up @@ -188,7 +188,7 @@
"devtools-protocol": "0.0.1200039",
"enquirer": "^2.3.6",
"http-link-header": "^1.1.1",
"intl-messageformat": "^4.4.0",
"intl-messageformat": "^10.5.3",
"jpeg-js": "^0.4.4",
"js-library-detector": "^6.7.0",
"lighthouse-logger": "^2.0.1",
Expand Down
109 changes: 73 additions & 36 deletions shared/localization/format.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ import {getModuleDirectory} from '../esm-utils.js';
import {isObjectOfUnknownValues, isObjectOrArrayOfUnknownValues} from '../type-verifiers.js';
import {locales} from './locales.js';

// From @formatjs/icu-messageformat-parser - copy here so we don't need to bundle all that.
const TYPE = /** @type {const} */ ({
literal: 0,
argument: 1,
number: 2,
date: 3,
time: 4,
select: 5,
plural: 6,
pound: 7,
tag: 8,
});

const moduleDir = getModuleDirectory(import.meta);

/** Contains available locales with messages. May be an empty object if bundled. */
Expand All @@ -30,11 +43,11 @@ const CANONICAL_LOCALES = fs.readdirSync(moduleDir + '/locales/')
.map(locale => locale.replace('.json', ''))
.sort();

/** @typedef {import('intl-messageformat-parser').Element} MessageElement */
/** @typedef {import('intl-messageformat-parser').ArgumentElement} ArgumentElement */
/** @typedef {import('@formatjs/icu-messageformat-parser').MessageFormatElement} MessageFormatElement */

const MESSAGE_I18N_ID_REGEX = / | [^\s]+$/;

/** @type {Partial<import('intl-messageformat').Formats>} */
const formats = {
number: {
bytes: {
Expand All @@ -57,40 +70,39 @@ const formats = {
};

/**
* Function to retrieve all 'argumentElement's from an ICU message. An argumentElement
* is an ICU element with an argument in it, like '{varName}' or '{varName, number, bytes}'. This
* differs from 'messageElement's which are just arbitrary text in a message.
* Function to retrieve all elements from an ICU message AST that are associated
* with a named input, like '{varName}' or '{varName, number, bytes}'. This
* differs from literal message types which are just arbitrary text.
*
* Notes:
* This function will recursively inspect plural elements for nested argumentElements.
* This function recursively inspects plural elements for nested elements,
* but since the output is a Map they are deduplicated.
* e.g. "=1{hello {icu}} =other{hello {icu}}" will produce one element in the output,
* with "icu" as its key.
*
* We need to find all the elements from the plural format sections, but
* they need to be deduplicated. I.e. "=1{hello {icu}} =other{hello {icu}}"
* the variable "icu" would appear twice if it wasn't de duplicated. And they cannot
* be stored in a set because they are not equal since their locations are different,
* thus they are stored via a Map keyed on the "id" which is the ICU varName.
* TODO: don't do that deduplication because messages within a plural message could be number
* messages with different styles.
*
* @param {Array<MessageElement>} icuElements
* @param {Map<string, ArgumentElement>} [seenElementsById]
* @return {Map<string, ArgumentElement>}
* @param {Array<MessageFormatElement>} icuElements
* @param {Map<string, MessageFormatElement>} [customElements]
* @return {Map<string, MessageFormatElement>}
*/
function collectAllCustomElementsFromICU(icuElements, seenElementsById = new Map()) {
function collectAllCustomElementsFromICU(icuElements, customElements = new Map()) {
for (const el of icuElements) {
// We are only interested in elements that need ICU formatting (argumentElements)
if (el.type !== 'argumentElement') continue;
if (el.type === TYPE.literal || el.type === TYPE.pound) continue;

seenElementsById.set(el.id, el);
customElements.set(el.value, el);

// Plurals need to be inspected recursively
if (!el.format || el.format.type !== 'pluralFormat') continue;
// Look at all options of the plural (=1{} =other{}...)
for (const option of el.format.options) {
// Run collections on each option's elements
collectAllCustomElementsFromICU(option.value.elements, seenElementsById);
if (el.type === TYPE.plural) {
// Look at all options of the plural (=1{} =other{}...)
for (const option of Object.values(el.options)) {
// Run collections on each option's elements
collectAllCustomElementsFromICU(option.value, customElements);
}
}
}

return seenElementsById;
return customElements;
}

/**
Expand All @@ -103,23 +115,22 @@ function collectAllCustomElementsFromICU(icuElements, seenElementsById = new Map
* @return {Record<string, string | number>}
*/
function _preformatValues(messageFormatter, values = {}, lhlMessage) {
const elementMap = collectAllCustomElementsFromICU(messageFormatter.getAst().elements);
const argumentElements = [...elementMap.values()];
const customElements = collectAllCustomElementsFromICU(messageFormatter.getAst());

/** @type {Record<string, string | number>} */
const formattedValues = {};

for (const {id, format} of argumentElements) {
for (const [id, element] of customElements) {
// Throw an error if a message's value isn't provided
if (id && (id in values) === false) {
if (!(id in values)) {
throw new Error(`ICU Message "${lhlMessage}" contains a value reference ("${id}") ` +
`that wasn't provided`);
}

const value = values[id];

// Direct `{id}` replacement and non-numeric values need no formatting.
if (!format || format.type !== 'numberFormat') {
if (element.type !== TYPE.number) {
formattedValues[id] = value;
continue;
}
Expand All @@ -130,13 +141,13 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
}

// Format values for known styles.
if (format.style === 'milliseconds') {
if (element.style === 'milliseconds') {
// Round all milliseconds to the nearest 10.
formattedValues[id] = Math.round(value / 10) * 10;
} else if (format.style === 'seconds' && id === 'timeInMs') {
} else if (element.style === 'seconds' && id === 'timeInMs') {
// Convert all seconds to the correct unit (currently only for `timeInMs`).
formattedValues[id] = Math.round(value / 100) / 10;
} else if (format.style === 'bytes') {
} else if (element.style === 'bytes') {
// Replace all the bytes with KB.
formattedValues[id] = value / 1024;
} else {
Expand All @@ -162,6 +173,19 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
return formattedValues;
}

/**
* Our strings were made when \ was the escape character, but now it is '. To avoid churn,
* let's convert to the new style in memory.
* @param {string} message
* @return {string}
*/
function escapeIcuMessage(message) {
return message
.replace(/'/g, `''`)
.replace(/\\{/g, `'{`)
.replace(/\\}/g, `'}`);
}

/**
* Format string `message` by localizing `values` and inserting them. `message`
* is assumed to already be in the given locale.
Expand All @@ -172,19 +196,31 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
* @return {string}
*/
function formatMessage(message, values, locale) {
message = escapeIcuMessage(message);

// Parsing and formatting can be slow. Don't attempt if the string can't
// contain ICU placeholders, in which case formatting is already complete.
if (!message.includes('{') && values === undefined) return message;

// When using accented english, force the use of a different locale for number formatting.
const localeForMessageFormat = (locale === 'en-XA' || locale === 'en-XL') ? 'de-DE' : locale;

const formatter = new IntlMessageFormat(message, localeForMessageFormat, formats);
// This package is not correctly bundled.
/** @type {typeof IntlMessageFormat} */
// @ts-expect-error bundler woes
const IntlMessageFormatCtor = IntlMessageFormat.IntlMessageFormat || IntlMessageFormat;
const formatter = new IntlMessageFormatCtor(message, localeForMessageFormat, formats, {
ignoreTag: true,
});

// Preformat values for the message format like KB and milliseconds.
const valuesForMessageFormat = _preformatValues(formatter, values, message);

return formatter.format(valuesForMessageFormat);
const formattedResult = formatter.format(valuesForMessageFormat);
// We only format to strings.
if (typeof formattedResult !== 'string') {
throw new Error('unexpected formatted result');
}
return formattedResult;
}

/**
Expand Down Expand Up @@ -459,4 +495,5 @@ export {
getIcuMessageIdParts,
getAvailableLocales,
getCanonicalLocales,
escapeIcuMessage,
};
Loading

0 comments on commit 3c57e70

Please sign in to comment.