From 3f59b0e0ee835549d068ad4ad85936ddf0ed04cb Mon Sep 17 00:00:00 2001 From: Scott Huberty <52462026+scott-huberty@users.noreply.github.com> Date: Wed, 1 Jan 2025 08:05:54 -0800 Subject: [PATCH] FIX: Support UTF-8 encoding for JSON files (#1357) * WIP: add ensure_ascii flage to _write_json * Revert "WIP: add ensure_ascii flage to _write_json" This reverts commit 4c4767913a1523efa1095a20110982e092c99b37. * Dont Force ASCII encoding in _write_json * TST: Add a test TIL: That json.loads will always convert unicode. So to test that unicode was properly encoded while writing to disk, I had to just read the text on disk without the json module * DOC: update changelog * Commit Dan's suggestion Instead of closing and re-opening the file, rewind the "playhead" to the start of the open file, then use fid.read() as usual Co-authored-by: Daniel McCloy --------- Co-authored-by: Daniel McCloy Co-authored-by: Stefan Appelhoff --- doc/whats_new.rst | 2 ++ mne_bids/tests/test_write.py | 11 +++++++++-- mne_bids/utils.py | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index b685cd3cd..456af0299 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -23,6 +23,7 @@ The following authors had contributed before. Thank you for sticking around! 
* `Stefan Appelhoff`_ * `Daniel McCloy`_ +* `Scott Huberty`_ Detailed list of changes ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -47,6 +48,7 @@ Detailed list of changes ^^^^^^^^^^^^ - :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`) +- :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`) ⚕️ Code health ^^^^^^^^^^^^^^ diff --git a/mne_bids/tests/test_write.py b/mne_bids/tests/test_write.py index a22252dd7..623c869d3 100644 --- a/mne_bids/tests/test_write.py +++ b/mne_bids/tests/test_write.py @@ -376,7 +376,7 @@ def test_make_dataset_description(tmp_path, monkeypatch): make_dataset_description( path=tmp_path, name="tst2", - authors="MNE B., MNE P.", + authors="MNE B., MNE P., MNE Ł.", funding="GSOC2019, GSOC2021", references_and_links="https://doi.org/10.21105/joss.01896", dataset_type="derivative", @@ -386,7 +386,14 @@ def test_make_dataset_description(tmp_path, monkeypatch): with open(op.join(tmp_path, "dataset_description.json"), encoding="utf-8") as fid: dataset_description_json = json.load(fid) - assert dataset_description_json["Authors"] == ["MNE B.", "MNE P."] + assert dataset_description_json["Authors"] == ["MNE B.", "MNE P.", "MNE Ł."] + # If the text on disk is unicode, json.load will convert it. So let's test that + # the text was encoded correctly on disk. + fid.seek(0) + # don't use json.load here, as it will convert unicode to str + dataset_description_string = fid.read() + # Check that U+0141 was correctly encoded as Ł on disk + assert "MNE Ł." 
in dataset_description_string # Check we raise warnings and errors where appropriate with pytest.raises( diff --git a/mne_bids/utils.py b/mne_bids/utils.py index 49abf6efd..da0f7066e 100644 --- a/mne_bids/utils.py +++ b/mne_bids/utils.py @@ -233,7 +233,7 @@ def _write_json(fname, dictionary, overwrite=False): f'"{fname}" already exists. Please set overwrite to True.' ) - json_output = json.dumps(dictionary, indent=4) + json_output = json.dumps(dictionary, indent=4, ensure_ascii=False) with open(fname, "w", encoding="utf-8") as fid: fid.write(json_output) fid.write("\n")