Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to latest arrow2 + new datatypes #108

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ members = [
"arrow2_convert_derive",
"examples/simple"
]

[patch.crates-io]
arrow2 = { git = "https://github.com/rerun-io/arrow2", branch = "cmc/arc_datatype" }
8 changes: 4 additions & 4 deletions arrow2_convert/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "arrow2_convert"
version = "0.4.2"
version = "0.5.0"
authors = [
"Jorge Leitao <[email protected]>",
"Chandra Penke <[email protected]>",
Expand All @@ -12,13 +12,13 @@ repository = "https://github.com/DataEngineeringLabs/arrow2-convert"
description = "Convert between nested rust types and Arrow with arrow2"

[dependencies]
arrow2 = "0.16"
arrow2_convert_derive = { version = "0.4.2", path = "../arrow2_convert_derive", optional = true }
arrow2 = "0.17"
arrow2_convert_derive = { version = "0.5.0", path = "../arrow2_convert_derive", optional = true }
chrono = { version = "0.4", default_features = false, features = ["std"] }
err-derive = "0.3"

[dev-dependencies]
arrow2_convert_derive = { version = "0.4.2", path = "../arrow2_convert_derive" }
arrow2_convert_derive = { version = "0.5.0", path = "../arrow2_convert_derive" }
criterion = "0.4"
trybuild = "1.0"

Expand Down
3 changes: 2 additions & 1 deletion arrow2_convert/src/deserialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,8 @@ impl<'a> Iterator for BufferBinaryArrayIter<'a> {
}
let (start, end) = self.array.offsets().start_end(self.index);
self.index += 1;
Some(Some(self.array.values().clone().slice(start, end - start)))

Some(Some(self.array.values().clone().sliced(start, end - start)))
}
}
}
Expand Down
10 changes: 6 additions & 4 deletions arrow2_convert/src/field.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Implementation and traits for mapping rust types to Arrow types

use std::sync::Arc;

use arrow2::{
buffer::Buffer,
datatypes::{DataType, Field},
Expand Down Expand Up @@ -225,7 +227,7 @@ where

#[inline]
fn data_type() -> DataType {
DataType::List(Box::new(<T as ArrowField>::field("item")))
DataType::List(Arc::new(<T as ArrowField>::field("item")))
}
}

Expand All @@ -238,7 +240,7 @@ where

#[inline]
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::List(Box::new(<T as ArrowField>::field("item")))
arrow2::datatypes::DataType::List(Arc::new(<T as ArrowField>::field("item")))
}
}

Expand All @@ -255,7 +257,7 @@ where

#[inline]
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::LargeList(Box::new(<T as ArrowField>::field("item")))
arrow2::datatypes::DataType::LargeList(Arc::new(<T as ArrowField>::field("item")))
}
}

Expand All @@ -272,7 +274,7 @@ where

#[inline]
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::FixedSizeList(Box::new(<T as ArrowField>::field("item")), SIZE)
arrow2::datatypes::DataType::FixedSizeList(Arc::new(<T as ArrowField>::field("item")), SIZE)
}
}

Expand Down
3 changes: 2 additions & 1 deletion arrow2_convert/tests/complex_example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use arrow2_convert::serialize::TryIntoArrow;
/// - Custom types
use arrow2_convert::{ArrowDeserialize, ArrowField, ArrowSerialize};
use std::borrow::Borrow;
use std::sync::Arc;

#[derive(Debug, Clone, PartialEq, ArrowField, ArrowSerialize, ArrowDeserialize)]
pub struct Root {
Expand Down Expand Up @@ -85,7 +86,7 @@ impl arrow2_convert::field::ArrowField for CustomType {
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::Extension(
"custom".to_string(),
Box::new(arrow2::datatypes::DataType::UInt64),
Arc::new(arrow2::datatypes::DataType::UInt64),
None,
)
}
Expand Down
2 changes: 1 addition & 1 deletion arrow2_convert/tests/test_enum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ fn test_slice() {
let b: Box<dyn Array> = enums.try_into_arrow().unwrap();

for i in 0..enums.len() {
let arrow_slice = b.slice(i, enums.len() - i);
let arrow_slice = b.sliced(i, enums.len() - i);
let original_slice = &enums[i..enums.len()];
let round_trip: Vec<TestEnum> = arrow_slice.try_into_collection().unwrap();
assert_eq!(round_trip, original_slice);
Expand Down
10 changes: 5 additions & 5 deletions arrow2_convert/tests/test_round_trip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn test_large_string_nested() {
let b: Box<dyn Array> = strs.try_into_arrow_as_type::<Vec<LargeString>>().unwrap();
assert_eq!(
b.data_type(),
&DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, false)))
&DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false)))
);
let round_trip: Vec<Vec<String>> = b.try_into_collection_as_type::<Vec<LargeString>>().unwrap();
assert_eq!(round_trip, strs);
Expand All @@ -81,7 +81,7 @@ fn test_large_binary_nested() {
let b: Box<dyn Array> = strs.try_into_arrow_as_type::<Vec<LargeBinary>>().unwrap();
assert_eq!(
b.data_type(),
&DataType::List(Box::new(Field::new("item", DataType::LargeBinary, false)))
&DataType::List(Arc::new(Field::new("item", DataType::LargeBinary, false)))
);
let round_trip: Vec<Vec<Vec<u8>>> =
b.try_into_collection_as_type::<Vec<LargeBinary>>().unwrap();
Expand All @@ -105,7 +105,7 @@ fn test_large_vec() {
let b: Box<dyn Array> = ints.try_into_arrow_as_type::<LargeVec<i32>>().unwrap();
assert_eq!(
b.data_type(),
&DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false)))
&DataType::LargeList(Arc::new(Field::new("item", DataType::Int32, false)))
);
let round_trip: Vec<Vec<i32>> = b.try_into_collection_as_type::<LargeVec<i32>>().unwrap();
assert_eq!(round_trip, ints);
Expand All @@ -119,7 +119,7 @@ fn test_large_vec_nested() {
.unwrap();
assert_eq!(
b.data_type(),
&DataType::LargeList(Box::new(Field::new("item", DataType::LargeBinary, false)))
&DataType::LargeList(Arc::new(Field::new("item", DataType::LargeBinary, false)))
);
let round_trip: Vec<Vec<Vec<u8>>> = b
.try_into_collection_as_type::<LargeVec<LargeBinary>>()
Expand All @@ -135,7 +135,7 @@ fn test_fixed_size_vec() {
.unwrap();
assert_eq!(
b.data_type(),
&DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, false)), 3)
&DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int32, false)), 3)
);
let round_trip: Vec<Vec<i32>> = b
.try_into_collection_as_type::<FixedSizeVec<i32, 3>>()
Expand Down
44 changes: 23 additions & 21 deletions arrow2_convert/tests/test_schema.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::sync::Arc;

use arrow2::datatypes::*;
use arrow2_convert::ArrowField;

Expand Down Expand Up @@ -84,7 +86,7 @@ fn test_schema_types() {
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::Extension(
"custom".to_string(),
Box::new(arrow2::datatypes::DataType::UInt64),
Arc::new(arrow2::datatypes::DataType::UInt64),
None,
)
}
Expand Down Expand Up @@ -118,7 +120,7 @@ fn test_schema_types() {

assert_eq!(
<Root as arrow2_convert::field::ArrowField>::data_type(),
DataType::Struct(vec![
DataType::Struct(Arc::new(vec![
Field::new("name", DataType::Utf8, true),
Field::new("is_deleted", DataType::Boolean, false),
Field::new("a1", DataType::Float64, true),
Expand All @@ -130,7 +132,7 @@ fn test_schema_types() {
Field::new("a7", DataType::Decimal(32, 32), false),
Field::new(
"date_time_list",
DataType::List(Box::new(Field::new(
DataType::List(Arc::new(Field::new(
"item",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false
Expand All @@ -139,47 +141,47 @@ fn test_schema_types() {
),
Field::new(
"nullable_list",
DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
true
),
Field::new(
"required_list",
DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
false
),
Field::new(
"custom",
DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None),
DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None),
false
),
Field::new(
"nullable_custom",
DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None),
DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None),
true
),
Field::new(
"custom_list",
DataType::List(Box::new(Field::new(
DataType::List(Arc::new(Field::new(
"item",
DataType::Extension("custom".to_string(), Box::new(DataType::UInt64), None),
DataType::Extension("custom".to_string(), Arc::new(DataType::UInt64), None),
false
))),
false
),
Field::new(
"child",
DataType::Struct(vec![
DataType::Struct(Arc::new(vec![
Field::new("a1", DataType::Int64, false),
Field::new("a2", DataType::Utf8, false),
Field::new(
"child_array",
DataType::List(Box::new(Field::new(
DataType::List(Arc::new(Field::new(
"item",
DataType::Struct(vec![
DataType::Struct(Arc::new(vec![
Field::new("a1", DataType::Int32, false),
Field::new(
"bool_array",
DataType::List(Box::new(Field::new(
DataType::List(Arc::new(Field::new(
"item",
DataType::Boolean,
false
Expand All @@ -188,40 +190,40 @@ fn test_schema_types() {
),
Field::new(
"int64_array",
DataType::List(Box::new(Field::new(
DataType::List(Arc::new(Field::new(
"item",
DataType::Int64,
false
))),
false
),
]),
])),
false
))),
false
)
]),
])),
false
),
Field::new(
"int32_array",
DataType::List(Box::new(Field::new("item", DataType::Int32, false))),
DataType::List(Arc::new(Field::new("item", DataType::Int32, false))),
false
),
Field::new("large_binary", DataType::LargeBinary, false),
Field::new("fixed_size_binary", DataType::FixedSizeBinary(3), false),
Field::new("large_string", DataType::LargeUtf8, false),
Field::new(
"large_vec",
DataType::LargeList(Box::new(Field::new("item", DataType::Int64, false))),
DataType::LargeList(Arc::new(Field::new("item", DataType::Int64, false))),
false
),
Field::new(
"fixed_size_vec",
DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int64, false)), 3),
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, false)), 3),
false
),
])
]))
);
}

Expand All @@ -238,6 +240,6 @@ fn test_large_string_schema() {

assert_eq!(
<Vec<LargeString> as arrow2_convert::field::ArrowField>::data_type(),
DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, false)))
DataType::List(Arc::new(Field::new("item", DataType::LargeUtf8, false)))
);
}
6 changes: 3 additions & 3 deletions arrow2_convert/tests/test_serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,14 @@ fn test_array() {
#[test]
fn test_buffer() {
// Buffer<u8> and Vec<u8> should serialize into BinaryArray
let dat: Vec<Buffer<u8>> = vec![(0..10).into_iter().collect()];
let dat: Vec<Buffer<u8>> = vec![(0..10).collect()];
let r: Box<dyn Array> = dat.try_into_arrow().unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.data_type(), &<Buffer<u8> as ArrowField>::data_type());
assert_eq!(r.data_type(), &<Vec<u8> as ArrowField>::data_type());

// Buffer<u16> and Vec<u16> should serialize into ListArray
let dat: Vec<Buffer<u16>> = vec![(0..10).into_iter().collect()];
let dat: Vec<Buffer<u16>> = vec![(0..10).collect()];
let r: Box<dyn Array> = dat.try_into_arrow().unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.data_type(), &<Buffer<u16> as ArrowField>::data_type());
Expand All @@ -98,7 +98,7 @@ fn test_field_serialize_error() {
fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::Extension(
"custom".to_string(),
Box::new(arrow2::datatypes::DataType::UInt64),
Arc::new(arrow2::datatypes::DataType::UInt64),
None,
)
}
Expand Down
4 changes: 2 additions & 2 deletions arrow2_convert/tests/test_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ fn test_slice() {
let b: Box<dyn Array> = original.try_into_arrow().unwrap();

for i in 0..original.len() {
let arrow_slice = b.slice(i, original.len() - i);
let arrow_slice = b.sliced(i, original.len() - i);
let original_slice = &original[i..original.len()];
let round_trip: Vec<T> = arrow_slice.try_into_collection().unwrap();
assert_eq!(round_trip, original_slice);
Expand Down Expand Up @@ -84,7 +84,7 @@ fn test_nested_slice() {
let b: Box<dyn Array> = original.try_into_arrow().unwrap();

for i in 0..original.len() {
let arrow_slice = b.slice(i, original.len() - i);
let arrow_slice = b.sliced(i, original.len() - i);
let original_slice = &original[i..original.len()];
let round_trip: Vec<Top> = arrow_slice.try_into_collection().unwrap();
assert_eq!(round_trip, original_slice);
Expand Down
2 changes: 1 addition & 1 deletion arrow2_convert_derive/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "arrow2_convert_derive"
version = "0.4.2"
version = "0.5.0"
authors = [
"Jorge Leitao <[email protected]>",
"Chandra Penke <[email protected]>"
Expand Down
6 changes: 3 additions & 3 deletions arrow2_convert_derive/src/derive_enum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ pub fn expand_field(input: DeriveEnum) -> TokenStream {

fn data_type() -> arrow2::datatypes::DataType {
arrow2::datatypes::DataType::Union(
vec![
std::sync::Arc::new(vec![
#(
<#variant_types as arrow2_convert::field::ArrowField>::field(#variant_names_str),
)*
],
]),
None,
#union_type,
)
Expand Down Expand Up @@ -502,7 +502,7 @@ pub fn expand_deserialize(input: DeriveEnum) -> TokenStream {
return None;
};
let (type_idx, offset) = self.arr.index(next_index);
let slice = self.arr.fields()[type_idx].slice(offset, 1);
let slice = self.arr.fields()[type_idx].sliced(offset, 1);
match type_idx {
#iter_next_match_block
_ => panic!("Invalid type for {}", #original_name_str)
Expand Down
12 changes: 7 additions & 5 deletions arrow2_convert_derive/src/derive_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,13 @@ pub fn expand_field(input: DeriveStruct) -> TokenStream {
syn::Member::Named(ident) => format_ident!("{}", ident),
syn::Member::Unnamed(index) => format_ident!("field_{}", index),
});
quote!(arrow2::datatypes::DataType::Struct(vec![
#(
<#field_types as arrow2_convert::field::ArrowField>::field(stringify!(#field_names)),
)*
]))
quote!(arrow2::datatypes::DataType::Struct(std::sync::Arc::new(
vec![
#(
<#field_types as arrow2_convert::field::ArrowField>::field(stringify!(#field_names)),
)*
]
)))
}
};

Expand Down
Loading