
test: Support for Multi-Level Partition Tables #88 (Closed)
3 changes: 3 additions & 0 deletions Cargo.toml
@@ -61,6 +61,9 @@
 testcontainers = "0.16.7"
 testcontainers-modules = { version = "0.4.2", features = ["localstack"] }
 time = { version = "0.3.34", features = ["serde"] }
 geojson = "0.24.1"
+tracing = "0.1"
+rand = { version = "0.8.5" }
+approx = "0.5.1"

 [[bin]]
 name = "pgrx_embed_pg_analytics"
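For context, the three new dependencies are all test-facing: tracing for instrumentation, rand for generating sample data, and approx for tolerant float comparisons. A minimal sketch of how rand and approx might be combined in a test; the test name and values below are illustrative assumptions, not taken from this PR:

use approx::assert_relative_eq;
use rand::Rng;

#[test]
fn random_fare_roundtrips_within_tolerance() {
    // Generate a random fare, stand in for a Parquet write/read cycle,
    // and compare with a relative tolerance instead of exact equality.
    let fare: f64 = rand::thread_rng().gen_range(1.0..100.0);
    let roundtripped = fare; // placeholder for the actual round trip
    assert_relative_eq!(fare, roundtripped, max_relative = 1e-9);
}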
38 changes: 36 additions & 2 deletions tests/fixtures/mod.rs
@@ -19,16 +19,18 @@ pub mod arrow;
 pub mod db;
 pub mod tables;

-use anyhow::Result;
+use anyhow::{Context, Result};
 use async_std::task::block_on;
 use aws_config::{BehaviorVersion, Region};
 use aws_sdk_s3::primitives::ByteStream;
 use bytes::Bytes;
 use chrono::{DateTime, Duration};
-use datafusion::arrow::array::*;
+use datafusion::arrow::array::{Int32Array, TimestampMillisecondArray};
+use datafusion::arrow::datatypes::TimeUnit::Millisecond;
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::{
     arrow::{datatypes::FieldRef, record_batch::RecordBatch},
+    parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder,
     parquet::arrow::ArrowWriter,
 };
 use futures::future::{BoxFuture, FutureExt};
@@ -141,6 +143,38 @@ impl S3 {
         Ok(())
     }

+    #[allow(unused)]
+    pub async fn get_batch(&self, bucket: &str, key: &str) -> Result<RecordBatch> {
+        // Retrieve the object from S3
+        let get_object_output = self
+            .client
+            .get_object()
+            .bucket(bucket)
+            .key(key)
+            .send()
+            .await
+            .context("Failed to get object from S3")?;
+
+        // Read the body of the object
+        let body = get_object_output.body.collect().await?;
+        let bytes: Bytes = body.into_bytes();
+
+        // Create a Parquet reader
+        let builder = ParquetRecordBatchReaderBuilder::try_new(bytes)
+            .context("Failed to create Parquet reader builder")?;
+
+        // Create the reader
+        let mut reader = builder.build().context("Failed to build Parquet reader")?;
+
+        // Read the first batch
+        let record_batch = reader
+            .next()
+            .context("No batches found in Parquet file")?
+            .context("Failed to read batch")?;
+
+        Ok(record_batch)
+    }
+
     #[allow(unused)]
     pub async fn put_rows<T: Serialize>(&self, bucket: &str, key: &str, rows: &[T]) -> Result<()> {
         let fields = Vec::<FieldRef>::from_type::<NycTripsTable>(TracingOptions::default())?;
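As a usage note: in a multi-level partition test, get_batch can be pointed at a single partition leaf to verify what was actually written. A hedged sketch follows; the bucket name and partition path are assumptions for illustration and do not appear in this diff:

use anyhow::Result;
use crate::fixtures::S3;

async fn check_partition_leaf(s3: &S3) -> Result<()> {
    // Read back the first record batch from one hypothetical partition leaf.
    let batch = s3
        .get_batch("test-bucket", "trips/year=2024/month=01/part-0.parquet")
        .await?;
    // The batch should be non-empty and match the schema's column count.
    assert!(batch.num_rows() > 0);
    assert_eq!(batch.num_columns(), batch.schema().fields().len());
    Ok(())
}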