IBM · daw3rd · Dec 2, 2024 · Oct 23, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/data-processing-lib/python/requirements.txt b/data-processing-lib/python/requirements.txt
@@ -4,3 +4,4 @@
   argparse
   mmh3
   psutil
+  polars
diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py
@@ -11,6 +11,7 @@
 ################################################################################
 
 import hashlib
+import io
 import os
 import string
 import sys
@@ -144,8 +145,21 @@ def convert_binary_to_arrow(data: bytes, schema: pa.schema = None) -> pa.Table:
             table = pq.read_table(reader, schema=schema)
             return table
         except Exception as e:
-            logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it")
-            return None
+            logger.warning(f"Could not convert bytes to pyarrow: {e}")
+
+        # pyarrow has been seen to raise the exception below on some inputs, whereas polars does not raise it:
+        # "Nested data conversions not implemented for chunked array outputs"
+        # See issue 816: https://github.com/IBM/data-prep-kit/issues/816.
+        logger.info(f"Attempting read of pyarrow Table using polars")
+        try:
+            import polars
+
+            df = polars.read_parquet(io.BytesIO(data))
+            table = df.to_arrow()
+        except Exception as e:
+            logger.warning(f"Could not convert bytes to pyarrow using polars: {e}. Skipping.")
+            table = None
+        return table
 
     @staticmethod
     def convert_arrow_to_binary(table: pa.Table) -> bytes:

diff --git a/transforms/universal/filter/python/src/filter_transform.py b/transforms/universal/filter/python/src/filter_transform.py
@@ -67,6 +67,10 @@ def __init__(self, config: dict):
         self.logical_operator = config.get(filter_logical_operator_key, filter_logical_operator_default)
         self.columns_to_drop = config.get(filter_columns_to_drop_key, filter_columns_to_drop_default)
 
+        # Temporarily here to test whether this allows us to process files that are required to be read by polars for mm.
+        # If this works, we should make it configurable or always enable it (not sure of the downside of enabling this).
+        # duckdb.execute("SET arrow_large_buffer_size = true")
+
     def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict]:
         """
         This implementation filters the input table using a SQL statement and
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ @@
       argparse
       mmh3
       psutil
+      polars