From d85ccc61e10084f943deaa9f5717158b9b12a7e3 Mon Sep 17 00:00:00 2001 From: supercoderhawk Date: Tue, 4 Aug 2020 19:57:52 +0800 Subject: [PATCH] implement get_jsonline_chunk_lazy and get_jsonline_chunk --- pysenal/io/file.py | 28 ++++++++++++++++++++++++++++ tests/io/test_file.py | 8 ++++++++ 2 files changed, 36 insertions(+) diff --git a/pysenal/io/file.py b/pysenal/io/file.py index ee3d54d..81b9186 100644 --- a/pysenal/io/file.py +++ b/pysenal/io/file.py @@ -7,6 +7,7 @@ from collections import Iterable import configparser from ..utils.logger import get_logger +from ..utils.utils import get_chunk _ENCODING_UTF8 = 'utf-8' @@ -198,6 +199,33 @@ def read_jsonline_lazy(filename, encoding=_ENCODING_UTF8, default=None): file.close() +def get_jsonline_chunk_lazy(filename, chunk_size, encoding=_ENCODING_UTF8, default=None): + """ + use generator to read jsonline items chunk by chunk + :param filename: source jsonline file + :param chunk_size: chunk size + :param encoding: file encoding + :param default: default value to return when the file does not exist + :return: generator yielding chunks of items + """ + file_generator = read_jsonline_lazy(filename, encoding, default) + for chunk in get_chunk(file_generator, chunk_size): + yield chunk + + +def get_jsonline_chunk(filename, chunk_size, encoding=_ENCODING_UTF8, default=None): + """ + read jsonline items chunk by chunk + :param filename: source jsonline file + :param chunk_size: chunk size + :param encoding: file encoding + :param default: default value to return when the file does not exist + :return: list of all chunks of items + """ + chunk_generator = get_chunk(read_jsonline_lazy(filename, encoding, default), chunk_size) + return list(chunk_generator) + + def write_jsonline(filename, items, encoding=_ENCODING_UTF8, serialize_method=None): """ write items to file with json line format diff --git a/tests/io/test_file.py b/tests/io/test_file.py index 5f1edad..901b6a6 100644 --- a/tests/io/test_file.py +++ b/tests/io/test_file.py @@ -1,6 +1,7 @@ # -*- 
coding: UTF-8 -*- import tempfile import pytest +import types from decimal import Decimal from pysenal.io.file import * from pysenal.utils import json_serialize @@ -71,6 +72,13 @@ def test_read_jsonline(example_json, fake_filename): assert read_jsonline(TEST_DATA_DIR + 'a.jsonl') == example_json +def test_read_jsonline_chunk(example_json): + assert get_jsonline_chunk(TEST_DATA_DIR + 'a.jsonl', 2) == [example_json] + generator = get_jsonline_chunk_lazy(TEST_DATA_DIR + 'a.jsonl', 2) + assert isinstance(generator, types.GeneratorType) + assert list(generator) == [example_json] + + def test_write_lines(example_lines): dirname = tempfile.gettempdir() + '/' filename = dirname + 'a.txt'