Skip to content

Commit

Permalink
Unicode 15.1 support
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Oct 18, 2023
1 parent 1cb28a6 commit 8c7149b
Show file tree
Hide file tree
Showing 7 changed files with 10,995 additions and 10,891 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 15.0.0)
set(UNICODE_VERSION 15.1.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 15.0.0.
The Unicode version supported is 15.1.0.

For Unicode normalizations, the following options are used:

Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ CharWidths.txt: charwidths.jl EastAsianWidth.txt
$(JULIA) charwidths.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.0.0
UNICODE_VERSION=15.1.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
33 changes: 30 additions & 3 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,32 @@
end
end

# Indic_Conjunct_Break (InCB) value for every code point, as the name of the
# matching UTF8PROC_INDIC_CONJUNCT_BREAK_* constant.  Code points that are not
# listed in DerivedCoreProperties.txt fall back to ..._NONE.
#
# DerivedCoreProperties.txt is read once and each
# "# Indic_Conjunct_Break=<Value>" section is cut out of it; a section runs
# from that header comment up to the next "# Total code points:" trailer.

# Returns the Indic_Conjunct_Break=<value> section of +text+.
# Raises when the section is absent, so that a changed or truncated data file
# is reported here instead of surfacing as a NoMethodError on nil later on.
def icb_section(text, value)
  section = text[/# Indic_Conjunct_Break=#{value}.*?# Total code points:/m]
  raise "DerivedCoreProperties.txt: no Indic_Conjunct_Break=#{value} section found" if section.nil?
  section
end

# Records +constant+ in $icb for every code point listed in +section+.
# A data line starts either with an inclusive range ("XXXX..YYYY") or with a
# single code point ("XXXX"), both in upper-case hex; all other lines
# (comments, blanks) are ignored.
def icb_record(section, constant)
  section.each_line do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      $1.hex.upto($2.hex) { |code| $icb[code] = constant }
    elsif entry =~ /^[0-9A-F]+/
      $icb[$&.hex] = constant
    end
  end
end

derived_core_properties = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
$icb_linker_list    = icb_section(derived_core_properties, "Linker")
$icb_consonant_list = icb_section(derived_core_properties, "Consonant")
$icb_extend_list    = icb_section(derived_core_properties, "Extend")
# Keep the original processing order (Linker, Consonant, Extend): if a code
# point were ever listed in more than one section, the later section wins.
icb_record($icb_linker_list,    "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER")
icb_record($icb_consonant_list, "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT")
icb_record($icb_extend_list,    "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND")

$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
Expand Down Expand Up @@ -174,7 +200,7 @@ def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
if lencode >= 3 #we have only 2 bits for the length
if lencode >= 3 #we have only 2 bits for the length
array = [lencode] + array
lencode = 3
end
Expand Down Expand Up @@ -249,7 +275,8 @@ def c_entry(comb_indicies)
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$charwidth[code]}, 0, " <<
"#{$grapheme_boundclass[code]}},\n"
"#{$grapheme_boundclass[code]}, " <<
"#{$icb[code]}},\n"
end
end

Expand Down Expand Up @@ -415,7 +442,7 @@ def c_entry(comb_indicies)
$stdout << "};\n\n"

$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
properties.each { |line|
$stdout << line
}
Expand Down
62 changes: 46 additions & 16 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}

UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "15.0.0";
return "15.1.0";
}

UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
Expand Down Expand Up @@ -288,31 +288,52 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
true; // GB999
}

static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
if (state) {
int lbc_override;
if (*state == UTF8PROC_BOUNDCLASS_START)
*state = lbc_override = lbc;
int state_bc = *state & 0xff; // 1st byte of state is bound class
int state_ibc = *state >> 8; // 2nd byte of state is indic conjunct break
if (state_bc == UTF8PROC_BOUNDCLASS_START)
state_bc = lbc;
else
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
lbc = state_bc;
utf8proc_bool break_permitted = grapheme_break_simple(lbc, tbc);

// Special support for GB9c:
if (licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND_LINKER1)
state_ibc = licb;
else if (state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
state_ibc = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER ?
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND_LINKER1 : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
else if (state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
state_ibc = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND2 : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
else if (state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND2)
state_ibc = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER ?
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER: UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
if (state_ibc == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT)
break_permitted = false;

// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
state_bc = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB11 (emoji extend* zwj / emoji)
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
else
*state = tbc;
state_bc = tbc;

*state = state_bc + (state_ibc << 8);
}
else
*state = tbc;
Expand All @@ -326,8 +347,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {

return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
const utf8proc_property_t *p1 = utf8proc_get_property(c1);
const utf8proc_property_t *p2 = utf8proc_get_property(c2);
return grapheme_break_extended(p1->boundclass,
p2->boundclass,
p1->indic_conjunct_break,
p2->indic_conjunct_break,
state);
}

Expand Down Expand Up @@ -499,7 +524,12 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
/* for now, punt on indic conjunct breaks for UTF8PROC_CHARBOUND; you
should use utf8proc_grapheme_break_stateful for full grapheme-break
detection these days */
boundary = grapheme_break_extended(*last_boundclass, tbc,
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE,
last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
Expand Down
14 changes: 13 additions & 1 deletion utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:8;
unsigned boundclass:6;
unsigned indic_conjunct_break:2;
} utf8proc_property_t;

/** Unicode categories. */
Expand Down Expand Up @@ -388,6 +389,17 @@ typedef enum {
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
} utf8proc_boundclass_t;

/**
 * Indic_Conjunct_Break property. (TR44)
 *
 * Values 0-3 are the per-codepoint property values; they are the only ones
 * that fit in the 2-bit `indic_conjunct_break` field of
 * @ref utf8proc_property_t.  Values 4 and 5 are not property values: they
 * only appear in the `state` word maintained by the grapheme-break code
 * (GB9c handling) and must not be stored in the property table.
 */
typedef enum {
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0, /* default: code point has no listed InCB value */
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1, /* Indic_Conjunct_Break=Linker */
UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2, /* Indic_Conjunct_Break=Consonant */
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3, /* Indic_Conjunct_Break=Extend */

/* The following two are internal state values for the grapheme-break code. */
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND_LINKER1 = 4, /* state only: entered from the EXTEND state on a LINKER */
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND2 = 5, /* state only: entered from the LINKER state on an EXTEND */
} utf8proc_indic_conjunct_break_t;

/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
Expand Down
Loading

0 comments on commit 8c7149b

Please sign in to comment.