From 8aa9724f279a3768f72a883584ad24f9a4b738aa Mon Sep 17 00:00:00 2001 From: Jacob Khaliqi Date: Thu, 7 Nov 2024 12:03:32 -0800 Subject: [PATCH] refactor arrow reader to parquet reader in MetadataTest.cpp --- velox/dwio/parquet/reader/Metadata.cpp | 5 + velox/dwio/parquet/reader/Metadata.h | 3 + velox/dwio/parquet/reader/ParquetReader.cpp | 42 ++ velox/dwio/parquet/reader/ParquetReader.h | 7 + .../dwio/parquet/tests/writer/CMakeLists.txt | 22 + .../tests/writer/FileSerializeTest.cpp | 619 ++++++++++++++++++ .../tests => tests/writer}/MetadataTest.cpp | 79 ++- .../tests => tests/writer}/StatisticsTest.cpp | 101 ++- .../parquet/writer/arrow/tests/CMakeLists.txt | 10 +- 9 files changed, 847 insertions(+), 41 deletions(-) create mode 100644 velox/dwio/parquet/tests/writer/FileSerializeTest.cpp rename velox/dwio/parquet/{writer/arrow/tests => tests/writer}/MetadataTest.cpp (90%) rename velox/dwio/parquet/{writer/arrow/tests => tests/writer}/StatisticsTest.cpp (92%) diff --git a/velox/dwio/parquet/reader/Metadata.cpp b/velox/dwio/parquet/reader/Metadata.cpp index 771e68e8a5959..6234e11c47b74 100644 --- a/velox/dwio/parquet/reader/Metadata.cpp +++ b/velox/dwio/parquet/reader/Metadata.cpp @@ -219,6 +219,11 @@ ColumnChunkMetaDataPtr::getColumnStatistics( thriftColumnChunkPtr(ptr_)->meta_data.statistics, *type, numRows); }; +thrift::ColumnMetaData ColumnChunkMetaDataPtr::getColumnMetadata() { + VELOX_CHECK(hasStatistics()); + return thriftColumnChunkPtr(ptr_)->meta_data; +} + int64_t ColumnChunkMetaDataPtr::dataPageOffset() const { return thriftColumnChunkPtr(ptr_)->meta_data.data_page_offset; } diff --git a/velox/dwio/parquet/reader/Metadata.h b/velox/dwio/parquet/reader/Metadata.h index f99d46656d8ca..ee72d63897201 100644 --- a/velox/dwio/parquet/reader/Metadata.h +++ b/velox/dwio/parquet/reader/Metadata.h @@ -18,6 +18,7 @@ #include "velox/dwio/common/Statistics.h" #include "velox/dwio/common/compression/Compression.h" +#include "velox/dwio/parquet/thrift/ParquetThriftTypes.h" namespace facebook::velox::parquet { @@ -42,6 +43,8 @@ class ColumnChunkMetaDataPtr { const TypePtr type, int64_t numRows); + thrift::ColumnMetaData getColumnMetadata(); + /// Number of values. int64_t numValues() const; diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 4e83898b69ff2..10fd8209572b8 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -18,6 +18,7 @@ #include //@manual +#include #include "velox/dwio/parquet/reader/ParquetColumnReader.h" #include "velox/dwio/parquet/reader/StructColumnReader.h" #include "velox/dwio/parquet/thrift/ThriftTransport.h" @@ -1166,4 +1167,45 @@ FileMetaDataPtr ParquetReader::fileMetaData() const { return readerBase_->fileMetaData(); } +const thrift::FileMetaData& ParquetReader::thriftFileMetaData() const { + return readerBase_->thriftFileMetaData(); +} + +void WriteBufferToFile( + const std::shared_ptr& buffer, + const std::string& file_path) { + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + ::arrow::Result> + file_out_result = ::arrow::io::FileOutputStream::Open(file_path); + if (!file_out_result.ok()) { + std::cerr << "Error opening file for writing: " + << file_out_result.status().ToString() << std::endl; + return; + } + std::shared_ptr<::arrow::io::FileOutputStream> file_out = + file_out_result.ValueOrDie(); + const int64_t buffer_size = buffer->size(); + int64_t bytes_written = 0; + while (bytes_written < buffer_size) { + const int64_t chunk_size = std::min( + static_cast(1024 * 1024), buffer_size - bytes_written); + std::shared_ptr chunk; + auto read_result = source->Read(chunk_size); + if (!read_result.ok()) { + std::cerr << "Error reading from buffer: " + << read_result.status().ToString() << std::endl; + return; + } + chunk = read_result.ValueOrDie(); + auto status = file_out->Write(chunk->data(), chunk->size()); + if (!status.ok()) { + std::cerr << "Error writing to file: " << status.ToString() << std::endl; + return; + } + bytes_written += chunk->size(); + } + std::cout << "Buffer successfully written to " << file_path + << std::endl; // delete this... +} + } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.h b/velox/dwio/parquet/reader/ParquetReader.h index de6d7a9966dc8..51b73759f6a00 100644 --- a/velox/dwio/parquet/reader/ParquetReader.h +++ b/velox/dwio/parquet/reader/ParquetReader.h @@ -16,6 +16,7 @@ #pragma once +#include #include "velox/dwio/common/Reader.h" #include "velox/dwio/common/ReaderFactory.h" #include "velox/dwio/parquet/reader/Metadata.h" @@ -106,6 +107,8 @@ class ParquetReader : public dwio::common::Reader { FileMetaDataPtr fileMetaData() const; + const thrift::FileMetaData& thriftFileMetaData() const; + private: std::shared_ptr readerBase_; }; @@ -125,4 +128,8 @@ void registerParquetReaderFactory(); void unregisterParquetReaderFactory(); +void WriteBufferToFile( + const std::shared_ptr& buffer, + const std::string& file_path); + } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/tests/writer/CMakeLists.txt b/velox/dwio/parquet/tests/writer/CMakeLists.txt index 01e718dc22c46..a42b824adbdb0 100644 --- a/velox/dwio/parquet/tests/writer/CMakeLists.txt +++ b/velox/dwio/parquet/tests/writer/CMakeLists.txt @@ -48,3 +48,25 @@ target_link_libraries( ${TEST_LINK_LIBS} GTest::gtest fmt::fmt) + +add_executable(velox_dwio_parquet_writer_test + MetadataTest.cpp StatisticsTest.cpp FileSerializeTest.cpp) + +add_test( + NAME velox_dwio_parquet_writer_test + COMMAND velox_dwio_parquet_writer_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries( + velox_dwio_parquet_writer_test + velox_dwio_arrow_parquet_writer_test_lib + GTest::gmock + GTest::gtest + GTest::gtest_main + arrow + arrow_testing + velox_dwio_native_parquet_reader + velox_temp_path) + +# target_include_directories( velox_dwio_parquet_writer_test PRIVATE +# ${CMAKE_SOURCE_DIR}/velox/dwio/parquet/reader) diff --git a/velox/dwio/parquet/tests/writer/FileSerializeTest.cpp b/velox/dwio/parquet/tests/writer/FileSerializeTest.cpp new file mode 100644 index 0000000000000..6458bac196dde --- /dev/null +++ b/velox/dwio/parquet/tests/writer/FileSerializeTest.cpp @@ -0,0 +1,619 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Adapted from Apache Arrow. + +#include +#include + +#include "arrow/testing/gtest_compat.h" + +#include "velox/dwio/parquet/writer/arrow/ColumnWriter.h" +#include "velox/dwio/parquet/writer/arrow/FileWriter.h" +#include "velox/dwio/parquet/writer/arrow/Platform.h" +#include "velox/dwio/parquet/writer/arrow/Types.h" +#include "velox/dwio/parquet/writer/arrow/tests/ColumnReader.h" +#include "velox/dwio/parquet/writer/arrow/tests/FileReader.h" +#include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h" + +#include "velox/dwio/parquet/reader/ParquetReader.h" +#include "velox/exec/tests/utils/TempFilePath.h" + +namespace facebook::velox::parquet::arrow { + +using schema::GroupNode; +using schema::NodePtr; +using schema::PrimitiveNode; +using ::testing::ElementsAre; + +namespace test { + std::shared_ptr makeScanSpec( + const RowTypePtr& rowType) { + auto scanSpec = std::make_shared(""); + scanSpec->addAllChildFields(*rowType); + return scanSpec; + } + +template +class TestSerialize : public PrimitiveTypedTest { + public: + void SetUp() { + num_columns_ = 4; + num_rowgroups_ = 4; + rows_per_rowgroup_ = 50; + rows_per_batch_ = 10; + this->SetUpSchema(Repetition::OPTIONAL, num_columns_); + } + + protected: + int num_columns_; + int num_rowgroups_; + int rows_per_rowgroup_; + int rows_per_batch_; + + void FileSerializeTest(Compression::type codec_type) { + FileSerializeTest(codec_type, codec_type); + } + + void FileSerializeTest( + Compression::type codec_type, + Compression::type expected_codec_type) { + auto sink = CreateOutputStream(); + auto gnode = std::static_pointer_cast(this->node_); + + WriterProperties::Builder prop_builder; + + for (int i = 0; i < num_columns_; ++i) { + prop_builder.compression(this->schema_.Column(i)->name(), codec_type); + } + std::shared_ptr writer_properties = prop_builder.build(); + + auto file_writer = ParquetFileWriter::Open(sink, gnode, writer_properties); + this->GenerateData(rows_per_rowgroup_); + for (int rg = 0; rg < num_rowgroups_ / 2; ++rg) { + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + column_writer->WriteBatch( + rows_per_rowgroup_, + this->def_levels_.data(), + nullptr, + this->values_ptr_); + column_writer->Close(); + // Ensure column() API which is specific to BufferedRowGroup cannot be + // called + ASSERT_THROW(row_group_writer->column(col), ParquetException); + } + EXPECT_EQ(0, row_group_writer->total_compressed_bytes()); + EXPECT_NE(0, row_group_writer->total_bytes_written()); + EXPECT_NE(0, row_group_writer->total_compressed_bytes_written()); + row_group_writer->Close(); + EXPECT_EQ(0, row_group_writer->total_compressed_bytes()); + EXPECT_NE(0, row_group_writer->total_bytes_written()); + EXPECT_NE(0, row_group_writer->total_compressed_bytes_written()); + } + // Write half BufferedRowGroups + for (int rg = 0; rg < num_rowgroups_ / 2; ++rg) { + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendBufferedRowGroup(); + for (int batch = 0; batch < (rows_per_rowgroup_ / rows_per_batch_); + ++batch) { + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->column(col)); + column_writer->WriteBatch( + rows_per_batch_, + this->def_levels_.data() + (batch * rows_per_batch_), + nullptr, + this->values_ptr_ + (batch * rows_per_batch_)); + // Ensure NextColumn() API which is specific to RowGroup cannot be + // called + ASSERT_THROW(row_group_writer->NextColumn(), ParquetException); + } + } + // total_compressed_bytes() may equal to 0 if no dictionary enabled and no + // buffered values. + EXPECT_EQ(0, row_group_writer->total_bytes_written()); + EXPECT_EQ(0, row_group_writer->total_compressed_bytes_written()); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->column(col)); + column_writer->Close(); + } + row_group_writer->Close(); + EXPECT_EQ(0, row_group_writer->total_compressed_bytes()); + EXPECT_NE(0, row_group_writer->total_bytes_written()); + EXPECT_NE(0, row_group_writer->total_compressed_bytes_written()); + } + file_writer->Close(); + + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + + int num_rows_ = num_rowgroups_ * rows_per_rowgroup_; + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(buffer, file_path); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("FileSerializeTest"); + leafPool_ = rootPool_->addLeafChild("FileSerializeTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), + readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + //ASSERT_EQ(num_columns_, reader->thriftFileMetaData().num_columns()); // FIND OUT NUMCOLUMNS... might have to loop through all rowgroups and get the numcolumns of it + ASSERT_EQ(num_rowgroups_, reader->fileMetaData().numRowGroups()); + ASSERT_EQ(num_rows_, reader->fileMetaData().numRows()); + + + auto file_reader = ParquetFileReader::Open(source); + ASSERT_EQ(num_columns_, file_reader->metadata()->num_columns()); + ASSERT_EQ(num_rowgroups_, file_reader->metadata()->num_row_groups()); + ASSERT_EQ(num_rows_, file_reader->metadata()->num_rows()); + + for (int rg = 0; rg < num_rowgroups_; ++rg) { + auto nrg_reader = reader->fileMetaData().rowGroup(rg); + ASSERT_EQ(num_columns_, nrg_reader.numColumns()); + ASSERT_EQ(rows_per_rowgroup_, nrg_reader.numRows()); + // Check that the specified compression was actually used. + ASSERT_EQ( + expected_codec_type, nrg_reader.columnChunk(0).compression()); + + const int64_t total_byte_size = nrg_reader.totalByteSize(); + const int64_t total_compressed_size = + nrg_reader.totalCompressedSize(); + if (expected_codec_type == Compression::UNCOMPRESSED) { + ASSERT_EQ(total_byte_size, total_compressed_size); + } else { + ASSERT_NE(total_byte_size, total_compressed_size); + } + + + + auto rg_reader = file_reader->RowGroup(rg); + auto rg_metadata = rg_reader->metadata(); + // ASSERT_EQ(num_columns_, rg_metadata->num_columns()); + // ASSERT_EQ(rows_per_rowgroup_, rg_metadata->num_rows()); + // // Check that the specified compression was actually used. + // ASSERT_EQ( + // expected_codec_type, rg_metadata->ColumnChunk(0)->compression()); + + // const int64_t total_byte_size = rg_metadata->total_byte_size(); + // const int64_t total_compressed_size = + // rg_metadata->total_compressed_size(); + // if (expected_codec_type == Compression::UNCOMPRESSED) { + // ASSERT_EQ(total_byte_size, total_compressed_size); + // } else { + // ASSERT_NE(total_byte_size, total_compressed_size); + // } + + int64_t total_column_byte_size = 0; + int64_t total_column_compressed_size = 0; + + for (int i = 0; i < num_columns_; ++i) { + int64_t values_read; + ASSERT_FALSE(nrg_reader.columnChunk(i).getColumnMetadata().index_page_offset); + total_column_byte_size += + nrg_reader.columnChunk(i).totalCompressedSize(); + total_column_compressed_size += + nrg_reader.columnChunk(i).totalUncompressedSize(); + + dwio::common::RowReaderOptions rowReaderOpts; + auto rowType = ROW({"column_0", "column_1", "column_2", "column_3"}, {BIGINT(), BIGINT(), BIGINT(), BIGINT()}); + rowReaderOpts.setScanSpec(makeScanSpec(rowType)); + auto rowReader = reader->createRowReader(rowReaderOpts); + // columnReader_ is in the parquetrowreader... + constexpr int kBatchSize = 1000; + auto result = BaseVector::create(rowType, kBatchSize, leafPool_.get()); + auto actualRows = rowReader->next(kBatchSize, result); + auto res = rowReader->next(kBatchSize, result); + // Seperator + + + ASSERT_FALSE(rg_metadata->ColumnChunk(i)->has_index_page()); + // total_column_byte_size += + // rg_metadata->ColumnChunk(i)->total_uncompressed_size(); + // total_column_compressed_size += + // rg_metadata->ColumnChunk(i)->total_compressed_size(); + + std::vector def_levels_out(rows_per_rowgroup_); + std::vector rep_levels_out(rows_per_rowgroup_); + auto col_reader = std::static_pointer_cast>( + rg_reader->Column(i)); + this->SetupValuesOut(rows_per_rowgroup_); + col_reader->ReadBatch( + rows_per_rowgroup_, + def_levels_out.data(), + rep_levels_out.data(), + this->values_out_ptr_, + &values_read); + this->SyncValuesOut(); + ASSERT_EQ(rows_per_rowgroup_, values_read); + ASSERT_EQ(this->values_, this->values_out_); + ASSERT_EQ(this->def_levels_, def_levels_out); + } + + ASSERT_EQ(total_byte_size, total_column_byte_size); + ASSERT_EQ(total_compressed_size, total_column_compressed_size); + } + } + + void UnequalNumRows( + int64_t max_rows, + const std::vector rows_per_column) { + auto sink = CreateOutputStream(); + auto gnode = std::static_pointer_cast(this->node_); + + std::shared_ptr props = + WriterProperties::Builder().build(); + + auto file_writer = ParquetFileWriter::Open(sink, gnode, props); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + this->GenerateData(max_rows); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + column_writer->WriteBatch( + rows_per_column[col], + this->def_levels_.data(), + nullptr, + this->values_ptr_); + column_writer->Close(); + } + row_group_writer->Close(); + file_writer->Close(); + } + + void UnequalNumRowsBuffered( + int64_t max_rows, + const std::vector rows_per_column) { + auto sink = CreateOutputStream(); + auto gnode = std::static_pointer_cast(this->node_); + + std::shared_ptr props = + WriterProperties::Builder().build(); + + auto file_writer = ParquetFileWriter::Open(sink, gnode, props); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendBufferedRowGroup(); + + this->GenerateData(max_rows); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->column(col)); + column_writer->WriteBatch( + rows_per_column[col], + this->def_levels_.data(), + nullptr, + this->values_ptr_); + column_writer->Close(); + } + row_group_writer->Close(); + file_writer->Close(); + } + + void RepeatedUnequalRows() { + // Optional and repeated, so definition and repetition levels + this->SetUpSchema(Repetition::REPEATED); + + const int kNumRows = 100; + this->GenerateData(kNumRows); + + auto sink = CreateOutputStream(); + auto gnode = std::static_pointer_cast(this->node_); + std::shared_ptr props = + WriterProperties::Builder().build(); + auto file_writer = ParquetFileWriter::Open(sink, gnode, props); + + RowGroupWriter* row_group_writer; + row_group_writer = file_writer->AppendRowGroup(); + + this->GenerateData(kNumRows); + + std::vector definition_levels(kNumRows, 1); + std::vector repetition_levels(kNumRows, 0); + + { + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + column_writer->WriteBatch( + kNumRows, + definition_levels.data(), + repetition_levels.data(), + this->values_ptr_); + column_writer->Close(); + } + + definition_levels[1] = 0; + repetition_levels[3] = 1; + + { + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + column_writer->WriteBatch( + kNumRows, + definition_levels.data(), + repetition_levels.data(), + this->values_ptr_); + column_writer->Close(); + } + } + + void ZeroRowsRowGroup() { + auto sink = CreateOutputStream(); + auto gnode = std::static_pointer_cast(this->node_); + + std::shared_ptr props = + WriterProperties::Builder().build(); + + auto file_writer = ParquetFileWriter::Open(sink, gnode, props); + + RowGroupWriter* row_group_writer; + + row_group_writer = file_writer->AppendRowGroup(); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + column_writer->Close(); + } + row_group_writer->Close(); + + row_group_writer = file_writer->AppendBufferedRowGroup(); + for (int col = 0; col < num_columns_; ++col) { + auto column_writer = static_cast*>( + row_group_writer->column(col)); + column_writer->Close(); + } + row_group_writer->Close(); + + file_writer->Close(); + } +}; + +typedef ::testing::Types< + Int32Type, + Int64Type, + Int96Type, + FloatType, + DoubleType, + BooleanType, + ByteArrayType, + FLBAType> + TestTypes; + +TYPED_TEST_SUITE(TestSerialize, TestTypes); + +TYPED_TEST(TestSerialize, SmallFileUncompressed) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::UNCOMPRESSED)); +} + +TYPED_TEST(TestSerialize, TooFewRows) { + std::vector num_rows = {100, 100, 100, 99}; + ASSERT_THROW(this->UnequalNumRows(100, num_rows), ParquetException); + ASSERT_THROW(this->UnequalNumRowsBuffered(100, num_rows), ParquetException); +} + +TYPED_TEST(TestSerialize, TooManyRows) { + std::vector num_rows = {100, 100, 100, 101}; + ASSERT_THROW(this->UnequalNumRows(101, num_rows), ParquetException); + ASSERT_THROW(this->UnequalNumRowsBuffered(101, num_rows), ParquetException); +} + +TYPED_TEST(TestSerialize, ZeroRows) { + ASSERT_NO_THROW(this->ZeroRowsRowGroup()); +} + +TYPED_TEST(TestSerialize, RepeatedTooFewRows) { + ASSERT_THROW(this->RepeatedUnequalRows(), ParquetException); +} + +TYPED_TEST(TestSerialize, SmallFileSnappy) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::SNAPPY)); +} + +#ifdef ARROW_WITH_BROTLI +TYPED_TEST(TestSerialize, SmallFileBrotli) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::BROTLI)); +} +#endif + +TYPED_TEST(TestSerialize, SmallFileGzip) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::GZIP)); +} + +TYPED_TEST(TestSerialize, SmallFileLz4) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4)); +} + +TYPED_TEST(TestSerialize, SmallFileLz4Hadoop) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::LZ4_HADOOP)); +} + +TYPED_TEST(TestSerialize, SmallFileZstd) { + ASSERT_NO_FATAL_FAILURE(this->FileSerializeTest(Compression::ZSTD)); +} + +TEST(TestBufferedRowGroupWriter, DisabledDictionary) { + // PARQUET-1706: + // Wrong dictionary_page_offset when writing only data pages via + // BufferedPageWriter + auto sink = CreateOutputStream(); + auto writer_props = WriterProperties::Builder().disable_dictionary()->build(); + schema::NodeVector fields; + fields.push_back( + PrimitiveNode::Make("col", Repetition::REQUIRED, Type::INT32)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + auto file_writer = ParquetFileWriter::Open(sink, schema, writer_props); + auto rg_writer = file_writer->AppendBufferedRowGroup(); + auto col_writer = static_cast(rg_writer->column(0)); + int value = 0; + col_writer->WriteBatch(1, nullptr, nullptr, &value); + rg_writer->Close(); + file_writer->Close(); + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(buffer, file_path); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("FileSerializeTest"); + leafPool_ = rootPool_->addLeafChild("FileSerializeTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), + readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + ASSERT_EQ(1, reader->thriftFileMetaData().row_groups.size()); + auto row_group = reader->fileMetaData().rowGroup(0); + ASSERT_EQ(1, row_group.numColumns()); + ASSERT_EQ(1, row_group.numRows()); + ASSERT_FALSE(row_group.columnChunk(0).hasDictionaryPageOffset()); +} + +TEST(TestBufferedRowGroupWriter, MultiPageDisabledDictionary) { + constexpr int kValueCount = 10000; + constexpr int kPageSize = 16384; + auto sink = CreateOutputStream(); + auto writer_props = WriterProperties::Builder() + .disable_dictionary() + ->data_pagesize(kPageSize) + ->build(); + schema::NodeVector fields; + fields.push_back( + PrimitiveNode::Make("col", Repetition::REQUIRED, Type::INT32)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + auto file_writer = ParquetFileWriter::Open(sink, schema, writer_props); + auto rg_writer = file_writer->AppendBufferedRowGroup(); + auto col_writer = static_cast(rg_writer->column(0)); + std::vector values_in; + for (int i = 0; i < kValueCount; ++i) { + values_in.push_back((i % 100) + 1); + } + col_writer->WriteBatch(kValueCount, nullptr, nullptr, values_in.data()); + rg_writer->Close(); + file_writer->Close(); + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); + auto file_metadata = file_reader->metadata(); + ASSERT_EQ(1, file_reader->metadata()->num_row_groups()); + std::vector values_out(kValueCount); + for (int r = 0; r < file_metadata->num_row_groups(); ++r) { + auto rg_reader = file_reader->RowGroup(r); + ASSERT_EQ(1, rg_reader->metadata()->num_columns()); + ASSERT_EQ(kValueCount, rg_reader->metadata()->num_rows()); + int64_t total_values_read = 0; + std::shared_ptr col_reader; + ASSERT_NO_THROW(col_reader = rg_reader->Column(0)); + Int32Reader* int32_reader = static_cast(col_reader.get()); + int64_t vn = kValueCount; + int32_t* vx = values_out.data(); + while (int32_reader->HasNext()) { + int64_t values_read; + int32_reader->ReadBatch(vn, nullptr, nullptr, vx, &values_read); + vn -= values_read; + vx += values_read; + total_values_read += values_read; + } + ASSERT_EQ(kValueCount, total_values_read); + ASSERT_EQ(values_in, values_out); + } +} + +TEST(ParquetRoundtrip, AllNulls) { + auto primitive_node = + PrimitiveNode::Make("nulls", Repetition::OPTIONAL, nullptr, Type::INT32); + schema::NodeVector columns({primitive_node}); + + auto root_node = + GroupNode::Make("root", Repetition::REQUIRED, columns, nullptr); + + auto sink = CreateOutputStream(); + + auto file_writer = ParquetFileWriter::Open( + sink, std::static_pointer_cast(root_node)); + auto row_group_writer = file_writer->AppendRowGroup(); + auto column_writer = + static_cast(row_group_writer->NextColumn()); + + int32_t values[3]; + int16_t def_levels[] = {0, 0, 0}; + + column_writer->WriteBatch(3, def_levels, nullptr, values); + + column_writer->Close(); + row_group_writer->Close(); + file_writer->Close(); + + ReaderProperties props = default_reader_properties(); + props.enable_buffered_stream(); + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + // START + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(buffer, file_path); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("FileSerializeTest"); + leafPool_ = rootPool_->addLeafChild("FileSerializeTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), + readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto rg = reader->fileMetaData().rowGroup(0); + // row reader options rowreader + dwio::common::RowReaderOptions rowReaderOpts; + auto rowT = ROW({"nulls"}, {BIGINT()}); + rowReaderOpts.setScanSpec(makeScanSpec(rowT)); + auto rowReader = reader->createRowReader(rowReaderOpts); + // END + + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source, props); + auto row_group_reader = file_reader->RowGroup(0); + auto column_reader = + std::static_pointer_cast(row_group_reader->Column(0)); + + int64_t values_read; + def_levels[0] = -1; + def_levels[1] = -1; + def_levels[2] = -1; + column_reader->ReadBatch(3, def_levels, nullptr, values, &values_read); + EXPECT_THAT(def_levels, ElementsAre(0, 0, 0)); +} + +} // namespace test + +} // namespace facebook::velox::parquet::arrow diff --git a/velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp b/velox/dwio/parquet/tests/writer/MetadataTest.cpp similarity index 90% rename from velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp rename to velox/dwio/parquet/tests/writer/MetadataTest.cpp index f650870884350..8c813d336a85f 100644 --- a/velox/dwio/parquet/writer/arrow/tests/MetadataTest.cpp +++ b/velox/dwio/parquet/tests/writer/MetadataTest.cpp @@ -29,6 +29,11 @@ #include "velox/dwio/parquet/writer/arrow/tests/FileReader.h" #include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h" +#include "velox/dwio/parquet/reader/ParquetReader.h" +#include "velox/exec/tests/utils/TempFilePath.h" + +#include + namespace facebook::velox::parquet::arrow { namespace metadata { @@ -365,6 +370,7 @@ TEST(Metadata, TestKeyValueMetadata) { } TEST(Metadata, TestAddKeyValueMetadata) { + memory::MemoryManager::testingSetInstance({}); schema::NodeVector fields; fields.push_back(schema::Int32("int_col", Repetition::REQUIRED)); auto schema = std::static_pointer_cast( @@ -394,21 +400,41 @@ TEST(Metadata, TestAddKeyValueMetadata) { file_writer->AddKeyValueMetadata(kv_meta_ignored), ParquetException); PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - ASSERT_NE(nullptr, file_reader->metadata()); - ASSERT_NE(nullptr, file_reader->metadata()->key_value_metadata()); - auto read_kv_meta = file_reader->metadata()->key_value_metadata(); + // Write the buffer to a temp file + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(buffer, file_path); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("MetadataTest"); + leafPool_ = rootPool_->addLeafChild("MetadataTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto thriftMeta = reader->thriftFileMetaData(); + ASSERT_NE(0, thriftMeta.key_value_metadata.size()); // Verify keys that were added before file writer was closed are present. for (int i = 1; i <= 3; ++i) { - auto index = std::to_string(i); - PARQUET_ASSIGN_OR_THROW(auto value, read_kv_meta->Get("test_key_" + index)); + auto key = thriftMeta.key_value_metadata[i - 1].key; + auto value = thriftMeta.key_value_metadata[i - 1].value; + + auto index = std::to_string(i + 1); + if (i == 3) { + index = std::to_string(1); + } + + EXPECT_EQ("test_key_" + index, key); EXPECT_EQ("test_value_" + index, value); } // Verify keys that were added after file writer was closed are not present. - EXPECT_FALSE(read_kv_meta->Contains("test_key_4")); + for (const auto& kv : thriftMeta.key_value_metadata) { + EXPECT_NE("test_key_4", kv.key); + EXPECT_NE("test_value_4", kv.value); + } } // TODO: disabled as they require Arrow parquet data dir. @@ -467,6 +493,7 @@ TEST(Metadata, TestReadPageIndex) { */ TEST(Metadata, TestSortingColumns) { + memory::MemoryManager::testingSetInstance({}); schema::NodeVector fields; fields.push_back(schema::Int32("sort_col", Repetition::REQUIRED)); fields.push_back(schema::Int32("int_col", Repetition::REQUIRED)); @@ -498,15 +525,33 @@ TEST(Metadata, TestSortingColumns) { file_writer->Close(); PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); - auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - - ASSERT_NE(nullptr, file_reader->metadata()); - ASSERT_EQ(1, file_reader->metadata()->num_row_groups()); - auto row_group_reader = file_reader->RowGroup(0); - auto* row_group_read_metadata = row_group_reader->metadata(); - ASSERT_NE(nullptr, row_group_read_metadata); - EXPECT_EQ(sorting_columns, row_group_read_metadata->sorting_columns()); + + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(buffer, file_path); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("MetadataTest"); + leafPool_ = rootPool_->addLeafChild("MetadataTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + ASSERT_EQ(1, reader->thriftFileMetaData().row_groups.size()); + auto row_groups = reader->thriftFileMetaData().row_groups; + ASSERT_NE(true, row_groups.empty()); + auto sc = row_groups[0].sorting_columns; + EXPECT_EQ(sorting_columns[0].column_idx, sc[0].column_idx); + EXPECT_EQ(sorting_columns[0].descending, sc[0].descending); + EXPECT_EQ(sorting_columns[0].nulls_first, sc[0].nulls_first); + + // EXPECT_EQ(sorting_columns, row_groups[0].sorting_columns); + // Above will not work until the writer dependecy of sorting_columns changes + // from arrow to velox parquet. Once they are both the same we can use this + // instead since currently sorting_columns is + // facebook::velox::parquet::arrow::SortingColumn and ours is + // facebook::velox::parquet::thrift::SortingColumn } TEST(ApplicationVersion, Basics) { diff --git a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp b/velox/dwio/parquet/tests/writer/StatisticsTest.cpp similarity index 92% rename from velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp rename to velox/dwio/parquet/tests/writer/StatisticsTest.cpp index 3b7029dab0e44..f9848319d3689 100644 --- a/velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp +++ b/velox/dwio/parquet/tests/writer/StatisticsTest.cpp @@ -37,6 +37,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/ubsan.h" +#include "velox/dwio/parquet/reader/ParquetReader.h" #include "velox/dwio/parquet/writer/arrow/ColumnWriter.h" #include "velox/dwio/parquet/writer/arrow/FileWriter.h" #include "velox/dwio/parquet/writer/arrow/Platform.h" @@ -47,6 +48,7 @@ #include "velox/dwio/parquet/writer/arrow/tests/ColumnReader.h" #include "velox/dwio/parquet/writer/arrow/tests/FileReader.h" #include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h" +#include "velox/exec/tests/utils/TempFilePath.h" using arrow::default_memory_pool; using arrow::MemoryPool; @@ -494,18 +496,43 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); auto source = std::make_shared<::arrow::io::BufferReader>(buffer); - auto file_reader = ParquetFileReader::Open(source); - auto rg_reader = file_reader->RowGroup(0); - auto column_chunk = rg_reader->metadata()->ColumnChunk(0); - if (!column_chunk->is_stats_set()) - return; - std::shared_ptr stats = column_chunk->statistics(); - // check values after serialization + deserialization - EXPECT_EQ(null_count, stats->null_count()); - EXPECT_EQ(num_values - null_count, stats->num_values()); + + // START + // Write the buffer to a tmp file + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + // auto lwf = std::make_unique(file_path, false, false); + // auto s = std::make_shared<::arrow::io::BufferReader>(buffer); + // auto b = s->buffer()->ToHexString(); + // lwf->append(b); + // lwf->close(); + WriteBufferToFile(buffer, file_path); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("StatisticsTest"); + leafPool_ = rootPool_->addLeafChild("StatisticsTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); + auto reader = + std::make_unique(std::move(input), readerOptions); + auto row_group = reader->fileMetaData().rowGroup(0); + auto col_chunk = row_group.columnChunk(0); + auto column_metadata = col_chunk.getColumnMetadata(); + auto stats = column_metadata.statistics; + auto type = + column_metadata + .type; // each type needs mapping if needed std::make_shared>() + EXPECT_EQ(null_count, stats.null_count); EXPECT_TRUE(expected_stats->HasMinMax()); - EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin()); - EXPECT_EQ(expected_stats->EncodeMax(), stats->EncodeMax()); + EXPECT_EQ(expected_stats->EncodeMin(), stats.min_value); + EXPECT_EQ(expected_stats->EncodeMax(), stats.max_value); + auto col_stats = + col_chunk.getColumnStatistics(INTEGER(), row_group.numRows()); + EXPECT_EQ(num_values - null_count, col_stats->getNumberOfValues()); + // END } }; @@ -1014,20 +1041,28 @@ class TestStatisticsSortOrder : public ::testing::Test { void VerifyParquetStats() { ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish()); - + auto tempFile = exec::test::TempFilePath::create(); + auto file_path = tempFile->getPath(); + WriteBufferToFile(pbuffer, file_path); + memory::MemoryManager::testingSetInstance({}); + std::shared_ptr rootPool_; + std::shared_ptr leafPool_; + rootPool_ = memory::memoryManager()->addRootPool("StatisticsTest"); + leafPool_ = rootPool_->addLeafChild("StatisticsTest"); + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + auto input = std::make_unique( + std::make_shared(file_path), readerOptions.memoryPool()); // Create a ParquetReader instance - std::unique_ptr parquet_reader = ParquetFileReader::Open( - std::make_shared<::arrow::io::BufferReader>(pbuffer)); - + auto reader = + std::make_unique(std::move(input), readerOptions); // Get the File MetaData - std::shared_ptr file_metadata = parquet_reader->metadata(); - std::shared_ptr rg_metadata = file_metadata->RowGroup(0); + auto row_group = reader->fileMetaData().rowGroup(0); for (int i = 0; i < static_cast(fields_.size()); i++) { ARROW_SCOPED_TRACE("Statistics for field #", i); - std::shared_ptr cc_metadata = - rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + auto col_chunk = row_group.columnChunk(i); + auto cc_metadata = col_chunk.getColumnMetadata(); + EXPECT_EQ(stats_[i].min(), cc_metadata.statistics.min_value); + EXPECT_EQ(stats_[i].max(), cc_metadata.statistics.max_value); } } @@ -1279,6 +1314,30 @@ TEST_F(TestStatisticsSortOrderFLBA, UnknownSortOrder) { ASSERT_OK_AND_ASSIGN(auto pbuffer, parquet_sink_->Finish()); + // START:::: COMMENT OUT SINCE "Unsupported Parquet SchemaElement converted + // type: {}", INTERVAL TYPE + // Write the buffer to a tmp file + // auto tempFile = exec::test::TempFilePath::create(); + // auto file_path = tempFile->getPath(); + // WriteBufferToFile(pbuffer, file_path); + // memory::MemoryManager::testingSetInstance({}); + // std::shared_ptr rootPool_; + // std::shared_ptr leafPool_; + // rootPool_ = memory::memoryManager()->addRootPool("StatisticsTest"); + // leafPool_ = rootPool_->addLeafChild("StatisticsTest"); + // dwio::common::ReaderOptions readerOptions{leafPool_.get()}; + // auto input = std::make_unique( + // std::make_shared(file_path), + // readerOptions.memoryPool()); + // auto reader = + // std::make_unique(std::move(input), readerOptions); + // auto row_group = reader->fileMetaData().rowGroup(0); + // auto col_chunk = row_group.columnChunk(0); + // auto column_metadata = col_chunk.getColumnMetadata(); + // auto stats = column_metadata.statistics; + // ASSERT_FALSE(col_chunk.hasStatistics()); + // END + // Create a ParquetReader instance std::unique_ptr parquet_reader = ParquetFileReader::Open( std::make_shared<::arrow::io::BufferReader>(pbuffer)); diff --git a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt index 0cda25430f2c1..027640d562fe0 100644 --- a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt +++ b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt @@ -21,11 +21,9 @@ add_executable( FileDeserializeTest.cpp FileSerializeTest.cpp LevelConversionTest.cpp - MetadataTest.cpp PageIndexTest.cpp PropertiesTest.cpp SchemaTest.cpp - StatisticsTest.cpp TypesTest.cpp) add_test(velox_dwio_arrow_parquet_writer_test @@ -38,7 +36,13 @@ target_link_libraries( GTest::gtest GTest::gtest_main arrow - arrow_testing) + arrow_testing + velox_dwio_native_parquet_reader + velox_temp_path) + +target_include_directories( + velox_dwio_arrow_parquet_writer_test + PRIVATE ${CMAKE_SOURCE_DIR}/velox/dwio/parquet/reader) add_library( velox_dwio_arrow_parquet_writer_test_lib