Skip to content

Commit

Permalink
Merge pull request #219 from irisTa56/fix_zero_width_joiner_between_surrogate_pairs
Browse files Browse the repository at this point in the history
  • Loading branch information
devinus authored Jun 1, 2024
2 parents 6fc4b7a + 8ce1309 commit ef8b22d
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 17 deletions.
42 changes: 25 additions & 17 deletions lib/poison/parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,8 @@ defmodule Poison.Parser do
# http://www.ietf.org/rfc/rfc2781.txt
# http://perldoc.perl.org/Encode/Unicode.html#Surrogate-Pairs
# http://mathiasbynens.be/notes/javascript-encoding#surrogate-pairs
# True for any UTF-16 surrogate code unit, high or low (U+D800..U+DFFF).
defguardp is_surrogate(cp) when cp in 0xD800..0xDFFF
# True when `hi` is in the high-surrogate range and `lo` is in the low-surrogate range.
defguardp is_surrogate_pair(hi, lo) when hi in 0xD800..0xDBFF and lo in 0xDC00..0xDFFF
# True for the high (leading) half of a surrogate pair: U+D800..U+DBFF.
defguardp is_hi_surrogate(cp) when cp in 0xD800..0xDBFF
# True for the low (trailing) half of a surrogate pair: U+DC00..U+DFFF.
defguardp is_lo_surrogate(cp) when cp in 0xDC00..0xDFFF

defmacrop get_codepoint(seq, skip) do
quote bind_quoted: [seq: seq, skip: skip] do
Expand All @@ -552,28 +552,36 @@ defmodule Poison.Parser do

@compile {:inline, string_escape_unicode: 5}

defp string_escape_unicode(<<"\\u", seq2::binary-size(4), rest::bits>>, data, skip, acc, seq1) do
hi = get_codepoint(seq1, skip)
lo = get_codepoint(seq2, skip + 6)
defp string_escape_unicode(rest, data, skip, acc, seq1) do
cp1 = get_codepoint(seq1, skip)

cond do
is_surrogate_pair(hi, lo) ->
codepoint = 0x10000 + ((hi &&& 0x03FF) <<< 10) + (lo &&& 0x03FF)
string_continue(rest, data, skip + 11, true, 0, [acc, codepoint])

is_surrogate(hi) ->
raise ParseError, skip: skip, value: "\\u#{seq1}\\u#{seq2}"
is_hi_surrogate(cp1) -> string_escape_surrogate_pair(rest, data, skip, acc, seq1, cp1)
is_lo_surrogate(cp1) -> raise ParseError, skip: skip, value: "\\u#{seq1}"
true -> string_continue(rest, data, skip + 5, true, 0, [acc, cp1])
end
end

is_surrogate(lo) ->
raise ParseError, skip: skip + 6, value: "\\u#{seq2}"
@compile {:inline, string_escape_surrogate_pair: 6}

true ->
string_continue(rest, data, skip + 11, true, 0, [acc, hi, lo])
# Decodes the second half of a UTF-16 surrogate pair. `hi` is the already-decoded
# high surrogate taken from `seq1`. This clause matches only when the remaining
# input starts with another `\u` escape, whose four bytes are bound to `seq2`.
#
# When `seq2` decodes to a low surrogate, both halves are combined into a single
# code point, appended to `acc`, and parsing continues with `skip + 11`.
# Any other decoded value raises `ParseError` at `skip`, reporting both escapes.
defp string_escape_surrogate_pair(
       <<"\\u", seq2::binary-size(4), rest::bits>>,
       data,
       skip,
       acc,
       seq1,
       hi
     ) do
  # One fallible step with a catch-all failure branch reads more directly as
  # `case` than as a single-clause `with ... else`.
  case get_codepoint(seq2, skip + 6) do
    lo when is_lo_surrogate(lo) ->
      # Each half contributes its low 10 bits; the result is offset from 0x10000.
      codepoint = 0x10000 + ((hi &&& 0x03FF) <<< 10) + (lo &&& 0x03FF)
      string_continue(rest, data, skip + 11, true, 0, [acc, codepoint])

    _ ->
      raise ParseError, skip: skip, value: "\\u#{seq1}\\u#{seq2}"
  end
end

defp string_escape_unicode(rest, data, skip, acc, seq1) do
string_continue(rest, data, skip + 5, true, 0, [acc, get_codepoint(seq1, skip)])
# Fallback clause: reached when the remaining input does not begin with a
# second `\u` escape followed by four bytes (the clause above did not match).
# The escape in `seq1` is then reported on its own via `ParseError` at `skip`.
defp string_escape_surrogate_pair(_rest, _data, skip, _acc, seq1, _hi) do
raise ParseError, skip: skip, value: "\\u#{seq1}"
end

## Whitespace
Expand Down
19 changes: 19 additions & 0 deletions test/poison/parser_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,32 @@ defmodule Poison.ParserTest do
parse!(~s("\\uxxxx"))
end

assert_raise ParseError,
~s(cannot parse value at position 2: "\\\\uD800\\\\uDBFF"),
fn ->
parse!(~s("\\uD800\\uDBFF"))
end

assert_raise ParseError,
~s(cannot parse value at position 2: "\\\\uD800"),
fn ->
parse!(~s("\\uD800"))
end

assert_raise ParseError,
~s(cannot parse value at position 2: "\\\\uDC00"),
fn ->
parse!(~s("\\uDC00"))
end

assert parse!(~s("\\"\\\\\\/\\b\\f\\n\\r\\t")) == ~s("\\/\b\f\n\r\t)
assert parse!(~s("\\u2603")) == "☃"
assert parse!(~s("\\u2028\\u2029")) == "\u2028\u2029"
assert parse!(~s("\\uD834\\uDD1E")) == "𝄞"
assert parse!(~s("\\uD834\\uDD1E")) == "𝄞"
assert parse!(~s("\\uD799\\uD799")) == "힙힙"
assert parse!(~s("✔︎")) == "✔︎"
assert parse!(~s("\\uD83D\\uDC68\\u200D\\uD83D\\uDC76")) == "👨‍👶"
end

property "strings" do
Expand Down

0 comments on commit ef8b22d

Please sign in to comment.