From 0a9ba26355120af41a2a50dde835ca4898eb8ea8 Mon Sep 17 00:00:00 2001 From: Fabian Murariu <2404621+fabianmurariu@users.noreply.github.com> Date: Thu, 30 Jan 2025 09:53:32 +0000 Subject: [PATCH] Graph to parquet (#1932) * Write all props except Arrays to parquet for edges, nodes and graph, need to add deletions * encode and decode Graph as parquet * fix test after rebase * fix issues with edge layers * fixing issues from review * fix test failures in df_loaders.rs * fix properties in df_loaders.rs * fix compilation issues * fix the columns for loading props to graph too --- Cargo.lock | 2 + Cargo.toml | 3 +- examples/rust/Cargo.toml | 2 +- python/tests/test_graphdb/test_disk_graph.py | 4 +- .../test_graph_nodes_edges_property_filter.py | 4 +- .../test_nodes_property_filter.py | 2 +- python/tests/test_graphql/test_schema.py | 6 +- .../src/core/entities/properties/mod.rs | 2 +- .../src/core/entities/properties/props.rs | 128 ++- raphtory-api/src/core/mod.rs | 218 +++++- raphtory-api/src/core/storage/dict_mapper.rs | 7 +- raphtory-api/src/lib.rs | 6 + raphtory-graphql/src/model/graph/property.rs | 1 - .../src/model/schema/node_schema.rs | 11 +- raphtory-graphql/src/python/client/mod.rs | 4 +- raphtory/Cargo.toml | 10 +- .../core/entities/properties/graph_meta.rs | 29 +- .../src/core/entities/properties/tprop.rs | 28 +- raphtory/src/core/mod.rs | 97 ++- raphtory/src/core/storage/mod.rs | 17 +- raphtory/src/core/utils/errors.rs | 10 +- .../src/db/api/properties/temporal_props.rs | 9 +- raphtory/src/db/api/storage/graph/locked.rs | 9 + .../storage/graph/storage_ops/additions.rs | 2 +- .../storage/graph/storage_ops/materialize.rs | 3 +- .../src/db/api/view/edge_property_filter.rs | 3 +- .../api/view/exploded_edge_property_filter.rs | 3 +- .../src/db/api/view/internal/materialize.rs | 14 +- raphtory/src/db/api/view/time.rs | 3 +- raphtory/src/db/graph/graph.rs | 9 +- raphtory/src/db/graph/views/deletion_graph.rs | 1 + .../disk_graph/graph_impl/prop_conversion.rs | 10 +- raphtory/src/io/arrow/df_loaders.rs | 511 +++++++++--- raphtory/src/io/arrow/mod.rs | 8 +- raphtory/src/io/arrow/prop_handler.rs | 136 +++- raphtory/src/io/parquet_loaders.rs | 76 +- raphtory/src/lib.rs | 392 +++++++++- raphtory/src/python/graph/graph.rs | 83 +- .../src/python/graph/graph_with_deletions.rs | 56 +- .../src/python/graph/io/pandas_loaders.rs | 38 +- raphtory/src/python/types/repr.rs | 2 +- raphtory/src/python/types/wrappers/prop.rs | 11 +- raphtory/src/serialise/mod.rs | 1 + raphtory/src/serialise/parquet/edges.rs | 176 +++++ raphtory/src/serialise/parquet/graph.rs | 125 +++ raphtory/src/serialise/parquet/mod.rs | 739 ++++++++++++++++++ raphtory/src/serialise/parquet/model.rs | 207 +++++ raphtory/src/serialise/parquet/nodes.rs | 116 +++ raphtory/src/serialise/proto_ext.rs | 52 +- raphtory/src/serialise/serialise.rs | 360 +++++---- raphtory/src/vectors/template.rs | 1 - 51 files changed, 3120 insertions(+), 627 deletions(-) create mode 100644 raphtory/src/serialise/parquet/edges.rs create mode 100644 raphtory/src/serialise/parquet/graph.rs create mode 100644 raphtory/src/serialise/parquet/mod.rs create mode 100644 raphtory/src/serialise/parquet/model.rs create mode 100644 raphtory/src/serialise/parquet/nodes.rs diff --git a/Cargo.lock b/Cargo.lock index 26c1ed77b..9a33be20b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4830,6 +4830,7 @@ dependencies = [ "arrow-buffer", "arrow-data", "arrow-ipc", + "arrow-json", "arrow-schema", "async-openai", "async-trait", @@ -4864,6 +4865,7 @@ dependencies = [ "ordered-float 4.5.0", 
"ouroboros", "parking_lot", + "parquet", "polars-arrow", "polars-core", "polars-io", diff --git a/Cargo.toml b/Cargo.toml index 4c34bc14a..5b37ded3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,6 @@ members = [ ] default-members = [ "raphtory", - "raphtory-graphql" ] resolver = "2" @@ -149,6 +148,8 @@ datafusion = { version = "43.0.0" } sqlparser = "0.51.0" futures = "0.3" arrow = { version = "53.2.0" } +parquet = { version = "53.2.0" } +arrow-json = { version = "53.2.0" } arrow-buffer = { version = "53.2.0" } arrow-schema = { version = "53.2.0" } arrow-array = { version = "53.2.0" } diff --git a/examples/rust/Cargo.toml b/examples/rust/Cargo.toml index 9d6506019..01a767eba 100644 --- a/examples/rust/Cargo.toml +++ b/examples/rust/Cargo.toml @@ -7,7 +7,7 @@ keywords = ["graph", "temporal-graph", "temporal", "examples"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -raphtory = { path = "../../raphtory", features = ["io"] } +raphtory = { path = "../../raphtory", features = ["io", "proto"] } chrono = { workspace = true } regex = { workspace = true } serde = { workspace = true } diff --git a/python/tests/test_graphdb/test_disk_graph.py b/python/tests/test_graphdb/test_disk_graph.py index 8f67c16e3..4b2a3ecbe 100644 --- a/python/tests/test_graphdb/test_disk_graph.py +++ b/python/tests/test_graphdb/test_disk_graph.py @@ -1,4 +1,3 @@ -from raphtory import DiskGraphStorage from raphtory import algorithms import pandas as pd import tempfile @@ -40,6 +39,7 @@ def test_counts(): + from raphtory import DiskGraphStorage graph_dir = tempfile.TemporaryDirectory() graph = DiskGraphStorage.load_from_pandas( graph_dir.name, edges, "time", "src", "dst" @@ -50,6 +50,7 @@ def test_counts(): def test_disk_graph(): + from raphtory import DiskGraphStorage curr_dir = os.path.dirname(os.path.abspath(__file__)) rsc_dir = os.path.join( curr_dir, "..", "..", "..", "pometry-storage-private", "resources" @@ -139,6 +140,7 @@ def test_disk_graph(): def test_disk_graph_type_filter(): + from raphtory import DiskGraphStorage curr_dir = os.path.dirname(os.path.abspath(__file__)) rsc_dir = os.path.join( curr_dir, "..", "..", "..", "pometry-storage-private", "resources" diff --git a/python/tests/test_graphql/test_graph_nodes_edges_property_filter.py b/python/tests/test_graphql/test_graph_nodes_edges_property_filter.py index eaf8dc68b..881e3cac2 100644 --- a/python/tests/test_graphql/test_graph_nodes_edges_property_filter.py +++ b/python/tests/test_graphql/test_graph_nodes_edges_property_filter.py @@ -185,7 +185,7 @@ def test_graph_node_property_filter_equal_type_error(graph): } } """ - expected_error_message = "PropertyType Error: Wrong type for property prop5: expected List but actual type is I64" + expected_error_message = "PropertyType Error: Wrong type for property prop5: expected List(I64) but actual type is I64" run_graphql_error_test(query, expected_error_message, graph()) @@ -936,7 +936,7 @@ def test_graph_edge_property_filter_equal_type_error(graph): } } """ - expected_error_message = "PropertyType Error: Wrong type for property eprop5: expected List but actual type is I64" + expected_error_message = "PropertyType Error: Wrong type for property eprop5: expected List(I64) but actual type is I64" run_graphql_error_test(query, expected_error_message, graph()) diff --git a/python/tests/test_graphql/test_nodes_property_filter.py b/python/tests/test_graphql/test_nodes_property_filter.py index 868576cd0..e635868a9 100644 --- 
a/python/tests/test_graphql/test_nodes_property_filter.py +++ b/python/tests/test_graphql/test_nodes_property_filter.py @@ -185,7 +185,7 @@ def test_node_property_filter_equal_type_error(graph): } } """ - expected_error_message = "PropertyType Error: Wrong type for property prop5: expected List but actual type is I64" + expected_error_message = "PropertyType Error: Wrong type for property prop5: expected List(I64) but actual type is I64" run_graphql_error_test(query, expected_error_message, graph()) diff --git a/python/tests/test_graphql/test_schema.py b/python/tests/test_graphql/test_schema.py index d39b8f679..722fe7953 100644 --- a/python/tests/test_graphql/test_schema.py +++ b/python/tests/test_graphql/test_schema.py @@ -156,7 +156,7 @@ def test_node_edge_properties_schema(): }, { "key": "prop6", - "propertyType": "Map", + "propertyType": "Map{ data: Str }", "variants": ['{"data": "map"}'], }, ], @@ -172,7 +172,7 @@ def test_node_edge_properties_schema(): }, { "key": "propArray", - "propertyType": "List", + "propertyType": "List", "variants": ["[1, 2, 3]"], }, { @@ -263,7 +263,7 @@ def test_node_edge_properties_schema(): }, { "key": "list_prop", - "propertyType": "List", + "propertyType": "List", "variants": ["[1.1, 2.2, 3.3]"], }, { diff --git a/raphtory-api/src/core/entities/properties/mod.rs b/raphtory-api/src/core/entities/properties/mod.rs index 5686e4a75..2d0be9568 100644 --- a/raphtory-api/src/core/entities/properties/mod.rs +++ b/raphtory-api/src/core/entities/properties/mod.rs @@ -2,7 +2,7 @@ use crate::core::PropType; pub mod props; -#[derive(thiserror::Error, Debug)] +#[derive(thiserror::Error, Debug, PartialEq)] pub enum PropError { #[error("Wrong type for property {name}: expected {expected:?} but actual type is {actual:?}")] PropertyTypeError { diff --git a/raphtory-api/src/core/entities/properties/props.rs b/raphtory-api/src/core/entities/properties/props.rs index 4048b9bfd..9e91d6974 100644 --- a/raphtory-api/src/core/entities/properties/props.rs +++ b/raphtory-api/src/core/entities/properties/props.rs @@ -9,7 +9,7 @@ use crate::core::{ dict_mapper::{DictMapper, MaybeNew}, locked_vec::ArcReadLockedVec, }, - PropType, + unify_types, PropType, }; use super::PropError; @@ -217,37 +217,33 @@ impl PropMapper { let id = wrapped_id.inner(); let dtype_read = self.dtypes.read_recursive(); if let Some(old_type) = dtype_read.get(id) { - if !matches!(old_type, PropType::Empty) { - return if *old_type == dtype { - Ok(wrapped_id) - } else { - Err(PropError::PropertyTypeError { - name: prop.to_owned(), - expected: old_type.clone(), - actual: dtype, - }) - }; + let mut unified = false; + if let Ok(_) = unify_types(&dtype, old_type, &mut unified) { + if !unified { + // means the types were equal, no change needed + return Ok(wrapped_id); + } + } else { + return Err(PropError::PropertyTypeError { + name: prop.to_owned(), + expected: old_type.clone(), + actual: dtype, + }); } } drop(dtype_read); // drop the read lock and wait for write lock as type did not exist yet let mut dtype_write = self.dtypes.write(); match dtype_write.get(id).cloned() { Some(old_type) => { - if matches!(old_type, PropType::Empty) { - // vector already resized but this id is not filled yet, set the dtype and return id - dtype_write[id] = dtype; + if let Ok(tpe) = unify_types(&dtype, &old_type, &mut false) { + dtype_write[id] = tpe; Ok(wrapped_id) } else { - // already filled because a different thread won the race for this id, check the type matches - if old_type == dtype { - Ok(wrapped_id) - } else { - 
Err(PropError::PropertyTypeError { - name: prop.to_owned(), - expected: old_type, - actual: dtype, - }) - } + Err(PropError::PropertyTypeError { + name: prop.to_owned(), + expected: old_type, + actual: dtype, + }) } } None => { @@ -276,3 +272,87 @@ impl PropMapper { self.dtypes.read_recursive() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::core::PropType; + + #[test] + fn test_get_or_create_and_validate_new_property() { + let prop_mapper = PropMapper::default(); + let result = prop_mapper.get_or_create_and_validate("new_prop", PropType::U8); + assert!(result.is_ok()); + assert_eq!(result.unwrap().inner(), 0); + assert_eq!(prop_mapper.get_dtype(0), Some(PropType::U8)); + } + + #[test] + fn test_get_or_create_and_validate_existing_property_same_type() { + let prop_mapper = PropMapper::default(); + prop_mapper + .get_or_create_and_validate("existing_prop", PropType::U8) + .unwrap(); + let result = prop_mapper.get_or_create_and_validate("existing_prop", PropType::U8); + assert!(result.is_ok()); + assert_eq!(result.unwrap().inner(), 0); + assert_eq!(prop_mapper.get_dtype(0), Some(PropType::U8)); + } + + #[test] + fn test_get_or_create_and_validate_existing_property_different_type() { + let prop_mapper = PropMapper::default(); + prop_mapper + .get_or_create_and_validate("existing_prop", PropType::U8) + .unwrap(); + let result = prop_mapper.get_or_create_and_validate("existing_prop", PropType::U16); + assert!(result.is_err()); + if let Err(PropError::PropertyTypeError { + name, + expected, + actual, + }) = result + { + assert_eq!(name, "existing_prop"); + assert_eq!(expected, PropType::U8); + assert_eq!(actual, PropType::U16); + } else { + panic!("Expected PropertyTypeError"); + } + } + + #[test] + fn test_get_or_create_and_validate_unify_types() { + let prop_mapper = PropMapper::default(); + prop_mapper + .get_or_create_and_validate("prop", PropType::Empty) + .unwrap(); + let result = prop_mapper.get_or_create_and_validate("prop", PropType::U8); + assert!(result.is_ok()); + assert_eq!(result.unwrap().inner(), 0); + assert_eq!(prop_mapper.get_dtype(0), Some(PropType::U8)); + } + + #[test] + fn test_get_or_create_and_validate_resize_vector() { + let prop_mapper = PropMapper::default(); + prop_mapper.set_id_and_dtype("existing_prop", 5, PropType::U8); + let result = prop_mapper.get_or_create_and_validate("new_prop", PropType::U16); + assert!(result.is_ok()); + assert_eq!(result.unwrap().inner(), 6); + assert_eq!(prop_mapper.get_dtype(6), Some(PropType::U16)); + } + + #[test] + fn test_get_or_create_and_validate_two_independent_properties() { + let prop_mapper = PropMapper::default(); + let result1 = prop_mapper.get_or_create_and_validate("prop1", PropType::U8); + let result2 = prop_mapper.get_or_create_and_validate("prop2", PropType::U16); + assert!(result1.is_ok()); + assert!(result2.is_ok()); + assert_eq!(result1.unwrap().inner(), 0); + assert_eq!(result2.unwrap().inner(), 1); + assert_eq!(prop_mapper.get_dtype(0), Some(PropType::U8)); + assert_eq!(prop_mapper.get_dtype(1), Some(PropType::U16)); + } +} diff --git a/raphtory-api/src/core/mod.rs b/raphtory-api/src/core/mod.rs index ba2a53410..7ec079af5 100644 --- a/raphtory-api/src/core/mod.rs +++ b/raphtory-api/src/core/mod.rs @@ -1,4 +1,7 @@ -use std::fmt::{self, Display, Formatter}; +use std::{ + collections::HashMap, + fmt::{self, Display, Formatter}, +}; use serde::{Deserialize, Serialize}; @@ -30,10 +33,9 @@ pub enum PropType { F32, F64, Bool, - List, - Map, + List(Box), + Map(HashMap), NDTime, - Document, DTime, Array(Box), } 
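For illustration (not part of the patch): a minimal sketch of how the newly parameterised List/Map variants behave, assuming the `PropType::map` helper and the `Display` impl from the following hunks. The expected strings match the updated test expectations elsewhere in this patch ("List<I64>", "Map{ data: Str }"):

    use raphtory_api::core::PropType;

    // List and Map now carry their element types instead of being opaque.
    let list = PropType::List(Box::new(PropType::U8));
    assert_eq!(list.to_string(), "List<U8>");

    // `PropType::map` builds the Map variant from (name, type) pairs;
    // Display sorts field names, so the rendering is deterministic.
    let map = PropType::map([("b", PropType::Str), ("a", PropType::U8)]);
    assert_eq!(map.to_string(), "Map{ a: U8, b: Str }");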
@@ -52,10 +54,16 @@ impl Display for PropType { PropType::F32 => "F32", PropType::F64 => "F64", PropType::Bool => "Bool", - PropType::List => "List", - PropType::Map => "Map", + PropType::List(p_type) => return write!(f, "List<{}>", p_type), + PropType::Map(p_type) => { + let mut types = p_type + .iter() + .map(|(k, v)| format!("{}: {}", k, v)) + .collect::>(); + types.sort(); + return write!(f, "Map{{ {} }}", types.join(", ")); + } PropType::NDTime => "NDTime", - PropType::Document => "Document", PropType::DTime => "DTime", PropType::Array(p_type) => return write!(f, "Array<{}>", p_type), }; @@ -65,6 +73,14 @@ impl Display for PropType { } impl PropType { + pub fn map(fields: impl IntoIterator, PropType)>) -> Self { + PropType::Map( + fields + .into_iter() + .map(|(k, v)| (k.as_ref().to_owned(), v)) + .collect(), + ) + } pub fn is_numeric(&self) -> bool { matches!( self, @@ -104,6 +120,7 @@ impl PropType { } } +use crate::core::entities::properties::PropError; #[cfg(feature = "storage")] use polars_arrow::datatypes::ArrowDataType as DataType; @@ -127,3 +144,190 @@ impl From<&DataType> for PropType { } } } + +// step through these types trees and check they are structurally the same +// if we encounter an empty we replace it with the other type +// the result is the unified type or err if the types are not compatible +pub fn unify_types(l: &PropType, r: &PropType, unified: &mut bool) -> Result { + match (l, r) { + (PropType::Empty, r) => { + *unified = true; + Ok(r.clone()) + } + (l, PropType::Empty) => { + *unified = true; + Ok(l.clone()) + } + (PropType::Str, PropType::Str) => Ok(PropType::Str), + (PropType::U8, PropType::U8) => Ok(PropType::U8), + (PropType::U16, PropType::U16) => Ok(PropType::U16), + (PropType::I32, PropType::I32) => Ok(PropType::I32), + (PropType::I64, PropType::I64) => Ok(PropType::I64), + (PropType::U32, PropType::U32) => Ok(PropType::U32), + (PropType::U64, PropType::U64) => Ok(PropType::U64), + (PropType::F32, PropType::F32) => Ok(PropType::F32), + (PropType::F64, PropType::F64) => Ok(PropType::F64), + (PropType::Bool, PropType::Bool) => Ok(PropType::Bool), + (PropType::NDTime, PropType::NDTime) => Ok(PropType::NDTime), + (PropType::DTime, PropType::DTime) => Ok(PropType::DTime), + (PropType::List(l_type), PropType::List(r_type)) => { + unify_types(l_type, r_type, unified).map(|t| PropType::List(Box::new(t))) + } + (PropType::Array(l_type), PropType::Array(r_type)) => { + unify_types(l_type, r_type, unified).map(|t| PropType::Array(Box::new(t))) + } + (PropType::Map(l_map), PropType::Map(r_map)) => { + // maps need to be merged and only overlapping keys need to be unified + + let mut merged = HashMap::new(); + for (k, v) in l_map.iter() { + if let Some(r_v) = r_map.get(k) { + let merged_prop = unify_types(v, r_v, unified)?; + merged.insert(k.clone(), merged_prop); + } else { + merged.insert(k.clone(), v.clone()); + *unified = true; + } + } + for (k, v) in r_map.iter() { + if !merged.contains_key(k) { + merged.insert(k.clone(), v.clone()); + *unified = true; + } + } + Ok(PropType::Map(merged)) + } + (_, _) => Err(PropError::PropertyTypeError { + name: "unknown".to_string(), + expected: l.clone(), + actual: r.clone(), + }), + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_unify_types_ne() { + let l = PropType::List(Box::new(PropType::U8)); + let r = PropType::List(Box::new(PropType::U16)); + assert!(unify_types(&l, &r, &mut false).is_err()); + + let l = PropType::map([("a".to_string(), PropType::U8)]); + let r = 
PropType::map([("a".to_string(), PropType::U16)]); + assert!(unify_types(&l, &r, &mut false).is_err()); + + let l = PropType::List(Box::new(PropType::U8)); + let r = PropType::List(Box::new(PropType::U16)); + assert!(unify_types(&l, &r, &mut false).is_err()); + } + + #[test] + fn test_unify_types_eq() { + let l = PropType::List(Box::new(PropType::U8)); + let r = PropType::List(Box::new(PropType::U8)); + assert_eq!( + unify_types(&l, &r, &mut false), + Ok(PropType::List(Box::new(PropType::U8))) + ); + + let l = PropType::map([("a".to_string(), PropType::U8)]); + let r = PropType::map([("a".to_string(), PropType::U8)]); + assert_eq!( + unify_types(&l, &r, &mut false), + Ok(PropType::map([("a".to_string(), PropType::U8)])) + ); + } + + #[test] + fn test_unify_maps() { + let l = PropType::map([("a".to_string(), PropType::U8)]); + let r = PropType::map([("a".to_string(), PropType::U16)]); + assert!(unify_types(&l, &r, &mut false).is_err()); + + let l = PropType::map([("a".to_string(), PropType::U8)]); + let r = PropType::map([("b".to_string(), PropType::U16)]); + let mut unify = false; + assert_eq!( + unify_types(&l, &r, &mut unify), + Ok(PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::U16) + ])) + ); + assert!(unify); + + let l = PropType::map([("a".to_string(), PropType::U8)]); + let r = PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::U16), + ]); + let mut unify = false; + assert_eq!( + unify_types(&l, &r, &mut unify), + Ok(PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::U16) + ])) + ); + assert!(unify); + + let l = PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::U16), + ]); + let r = PropType::map([("a".to_string(), PropType::U8)]); + let mut unify = false; + assert_eq!( + unify_types(&l, &r, &mut unify), + Ok(PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::U16) + ])) + ); + assert!(unify); + } + + #[test] + fn test_unify() { + let l = PropType::Empty; + let r = PropType::U8; + let mut unify = false; + assert_eq!(unify_types(&l, &r, &mut unify), Ok(PropType::U8)); + assert!(unify); + + let l = PropType::Str; + let r = PropType::Empty; + let mut unify = false; + assert_eq!(unify_types(&l, &r, &mut unify), Ok(PropType::Str)); + assert!(unify); + + let l = PropType::List(Box::new(PropType::List(Box::new(PropType::U8)))); + let r = PropType::List(Box::new(PropType::Empty)); + let mut unify = false; + assert_eq!( + unify_types(&l, &r, &mut unify), + Ok(PropType::List(Box::new(PropType::List(Box::new( + PropType::U8 + ))))) + ); + assert!(unify); + + let l = PropType::Array(Box::new(PropType::map([("a".to_string(), PropType::U8)]))); + let r = PropType::Array(Box::new(PropType::map([ + ("a".to_string(), PropType::Empty), + ("b".to_string(), PropType::Str), + ]))); + let mut unify = false; + assert_eq!( + unify_types(&l, &r, &mut unify), + Ok(PropType::Array(Box::new(PropType::map([ + ("a".to_string(), PropType::U8), + ("b".to_string(), PropType::Str) + ])))) + ); + assert!(unify); + } +} diff --git a/raphtory-api/src/core/storage/dict_mapper.rs b/raphtory-api/src/core/storage/dict_mapper.rs index a1af880ec..bb96dc16d 100644 --- a/raphtory-api/src/core/storage/dict_mapper.rs +++ b/raphtory-api/src/core/storage/dict_mapper.rs @@ -141,10 +141,9 @@ impl DictMapper { pub fn get_name(&self, id: usize) -> ArcStr { let guard = self.reverse_map.read(); - guard - .get(id) - .cloned() - .expect("internal ids should always be mapped to a name") + 
guard.get(id).cloned().expect(&format!( + "internal ids should always be mapped to a name {id}" + )) } pub fn get_keys(&self) -> ArcReadLockedVec { diff --git a/raphtory-api/src/lib.rs b/raphtory-api/src/lib.rs index a5bb02383..3b6c3cbf9 100644 --- a/raphtory-api/src/lib.rs +++ b/raphtory-api/src/lib.rs @@ -5,3 +5,9 @@ pub mod core; pub mod python; pub mod iter; + +#[derive(PartialOrd, PartialEq, Debug)] +pub enum GraphType { + EventGraph, + PersistentGraph, +} diff --git a/raphtory-graphql/src/model/graph/property.rs b/raphtory-graphql/src/model/graph/property.rs index 08e8ecb21..3e845dd39 100644 --- a/raphtory-graphql/src/model/graph/property.rs +++ b/raphtory-graphql/src/model/graph/property.rs @@ -77,7 +77,6 @@ fn prop_to_gql(prop: &Prop) -> GqlValue { Prop::DTime(t) => GqlValue::Number(t.timestamp_millis().into()), Prop::NDTime(t) => GqlValue::Number(t.and_utc().timestamp_millis().into()), Prop::Array(a) => GqlValue::List(a.iter_prop().map(|p| prop_to_gql(&p)).collect()), - Prop::Document(d) => GqlValue::String(d.content.to_owned()), // TODO: return GqlValue::Object ?? } } diff --git a/raphtory-graphql/src/model/schema/node_schema.rs b/raphtory-graphql/src/model/schema/node_schema.rs index a82e34df8..e214a41be 100644 --- a/raphtory-graphql/src/model/schema/node_schema.rs +++ b/raphtory-graphql/src/model/schema/node_schema.rs @@ -127,6 +127,10 @@ mod test { "list_prop", Prop::List(vec![Prop::F64(1.1), Prop::F64(2.2), Prop::F64(3.3)].into()), ), + ( + "map_prop", + Prop::map([("a", Prop::F64(1.0)), ("b", Prop::F64(2.0))]), + ), ("cost_b", Prop::F64(76.0)), ], Some("b"), @@ -171,10 +175,15 @@ mod test { ( "b".to_string(), vec![ - (("list_prop", "List"), ["[1.1, 2.2, 3.3]"]).into(), + (("list_prop", "List"), ["[1.1, 2.2, 3.3]"]).into(), (("cost_b", "F64"), ["76"]).into(), (("str_prop", "Str"), ["hello"]).into(), (("bool_prop", "Bool"), ["true"]).into(), + ( + ("map_prop", "Map{ a: F64, b: F64 }"), + ["{\"a\": 1, \"b\": 2}"], + ) + .into(), ], ), ]; diff --git a/raphtory-graphql/src/python/client/mod.rs b/raphtory-graphql/src/python/client/mod.rs index 8f8b29880..e863d8caf 100644 --- a/raphtory-graphql/src/python/client/mod.rs +++ b/raphtory-graphql/src/python/client/mod.rs @@ -3,7 +3,7 @@ use pyo3::{pyclass, pymethods}; use raphtory::{ core::{ utils::{errors::GraphError, time::IntoTime}, - DocumentInput, Prop, + Prop, }, python::utils::PyTime, }; @@ -257,7 +257,6 @@ fn inner_collection(value: &Prop) -> String { } Prop::DTime(value) => format!("\"{}\"", value.to_string()), Prop::NDTime(value) => format!("\"{}\"", value.to_string()), - Prop::Document(DocumentInput { content, .. }) => content.to_owned().to_string(), // TODO: return Value::Object ?? } } @@ -298,7 +297,6 @@ fn to_graphql_valid(key: &String, value: &Prop) -> String { } Prop::DTime(value) => format!("{{ key: \"{}\", value: \"{}\" }}", key, value.to_string()), Prop::NDTime(value) => format!("{{ key: \"{}\", value: \"{}\" }}", key, value.to_string()), - Prop::Document(_) => "Document cannot be converted to JSON".to_string(), // TODO: return Value::Object ?? 
} } diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index a70cc6799..f26b921c3 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -17,7 +17,7 @@ homepage.workspace = true [dependencies] raphtory-api = { path = "../raphtory-api", version = "0.14.0" } arrow-ipc = {workspace = true} -arrow-array = {workspace = true} +arrow-array = {workspace = true, features = ["chrono-tz"]} arrow-schema = {workspace = true} arrow-buffer = {workspace = true} arrow-data ={ workspace = true } @@ -77,6 +77,8 @@ num = { workspace = true, optional = true } display-error-chain = { workspace = true, optional = true } polars-arrow = { workspace = true, optional = true } polars-parquet = { workspace = true, optional = true } +parquet = { workspace = true, optional = true } +arrow-json = { workspace = true, optional = true } memmap2 = { workspace = true, optional = true } tempfile = { workspace = true, optional = true } pometry-storage = { workspace = true, optional = true } @@ -115,6 +117,8 @@ io = [ "dep:csv", "dep:reqwest", "dep:tokio", + "dep:parquet", + "dep:arrow-json", "proto", ] @@ -168,4 +172,6 @@ proto = [ "dep:prost-build", "dep:memmap2", "dep:zip", -] \ No newline at end of file + "arrow", + "io", +] diff --git a/raphtory/src/core/entities/properties/graph_meta.rs b/raphtory/src/core/entities/properties/graph_meta.rs index 57abbcf78..c16207240 100644 --- a/raphtory/src/core/entities/properties/graph_meta.rs +++ b/raphtory/src/core/entities/properties/graph_meta.rs @@ -5,10 +5,7 @@ use crate::core::{ Prop, PropType, }; use raphtory_api::core::storage::{ - arc_str::ArcStr, - dict_mapper::{DictMapper, MaybeNew}, - locked_vec::ArcReadLockedVec, - FxDashMap, + arc_str::ArcStr, dict_mapper::MaybeNew, locked_vec::ArcReadLockedVec, FxDashMap, }; use serde::{Deserialize, Serialize}; #[cfg(feature = "proto")] @@ -17,7 +14,7 @@ use std::ops::DerefMut; #[derive(Serialize, Deserialize, Debug)] pub struct GraphMeta { - constant_mapper: DictMapper, + constant_mapper: PropMapper, temporal_mapper: PropMapper, constant: FxDashMap>, temporal: FxDashMap, @@ -26,7 +23,7 @@ pub struct GraphMeta { impl GraphMeta { pub(crate) fn new() -> Self { Self { - constant_mapper: DictMapper::default(), + constant_mapper: PropMapper::default(), temporal_mapper: PropMapper::default(), constant: FxDashMap::default(), temporal: FxDashMap::default(), @@ -43,7 +40,7 @@ impl GraphMeta { } #[inline] - pub fn const_prop_meta(&self) -> &DictMapper { + pub fn const_prop_meta(&self) -> &PropMapper { &self.constant_mapper } @@ -59,13 +56,15 @@ impl GraphMeta { dtype: PropType, is_static: bool, ) -> Result, GraphError> { - if is_static { - Ok(self.constant_mapper.get_or_create_id(name)) + let mapper = if is_static { + &self.constant_mapper } else { - self.temporal_mapper - .get_or_create_and_validate(name, dtype) - .map_err(|e| e.into()) - } + &self.temporal_mapper + }; + + mapper + .get_or_create_and_validate(name, dtype) + .map_err(|e| e.into()) } pub(crate) fn add_constant_prop( @@ -147,6 +146,10 @@ impl GraphMeta { self.temporal_mapper.get_dtype(prop_id) } + pub fn get_const_dtype(&self, prop_id: usize) -> Option { + self.constant_mapper.get_dtype(prop_id) + } + pub(crate) fn constant_names(&self) -> ArcReadLockedVec { self.constant_mapper.get_keys() } diff --git a/raphtory/src/core/entities/properties/tprop.rs b/raphtory/src/core/entities/properties/tprop.rs index cab2c9aaf..031a30abe 100644 --- a/raphtory/src/core/entities/properties/tprop.rs +++ b/raphtory/src/core/entities/properties/tprop.rs @@ -3,14 +3,15 @@ use crate::{ 
entities::properties::tcell::TCell, storage::{timeindex::TimeIndexEntry, TPropColumn}, utils::errors::GraphError, - DocumentInput, Prop, PropArray, + Prop, PropArray, }, db::api::storage::graph::tprop_storage_ops::TPropOps, }; use chrono::{DateTime, NaiveDateTime, Utc}; use raphtory_api::{core::storage::arc_str::ArcStr, iter::BoxedLIter}; +use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, iter, ops::Range, sync::Arc}; +use std::{iter, ops::Range, sync::Arc}; #[derive(Debug, Default, PartialEq, Clone, Serialize, Deserialize)] pub enum TProp { @@ -29,9 +30,8 @@ pub enum TProp { DTime(TCell>), Array(TCell), NDTime(TCell), - Document(TCell), List(TCell>>), - Map(TCell>>), + Map(TCell>>), } #[derive(Copy, Clone, Debug)] @@ -101,7 +101,6 @@ impl TProp { Prop::DTime(value) => TProp::DTime(TCell::new(t, value)), Prop::NDTime(value) => TProp::NDTime(TCell::new(t, value)), Prop::Array(value) => TProp::Array(TCell::new(t, value)), - Prop::Document(value) => TProp::Document(TCell::new(t, value)), Prop::List(value) => TProp::List(TCell::new(t, value)), Prop::Map(value) => TProp::Map(TCell::new(t, value)), } @@ -153,9 +152,6 @@ impl TProp { (TProp::Array(cell), Prop::Array(a)) => { cell.set(t, a); } - (TProp::Document(cell), Prop::Document(a)) => { - cell.set(t, a); - } (TProp::List(cell), Prop::List(a)) => { cell.set(t, a); } @@ -191,10 +187,6 @@ impl TProp { cell.iter() .map(|(t, value)| (*t, Prop::Array(value.clone()))), ), - TProp::Document(cell) => Box::new( - cell.iter() - .map(|(t, value)| (*t, Prop::Document(value.clone()))), - ), TProp::List(cell) => Box::new( cell.iter() .map(|(t, value)| (*t, Prop::List(value.clone()))), @@ -231,10 +223,6 @@ impl TProp { cell.iter_t() .map(|(t, value)| (t, Prop::Array(value.clone()))), ), - TProp::Document(cell) => Box::new( - cell.iter_t() - .map(|(t, value)| (t, Prop::Document(value.clone()))), - ), TProp::List(cell) => Box::new( cell.iter_t() .map(|(t, value)| (t, Prop::List(value.clone()))), @@ -303,10 +291,6 @@ impl TProp { cell.iter_window(r) .map(|(t, value)| (*t, Prop::Array(value.clone()))), ), - TProp::Document(cell) => Box::new( - cell.iter_window(r) - .map(|(t, value)| (*t, Prop::Document(value.clone()))), - ), TProp::List(cell) => Box::new( cell.iter_window(r) .map(|(t, value)| (*t, Prop::List(value.clone()))), @@ -338,9 +322,6 @@ impl<'a> TPropOps<'a> for &'a TProp { TProp::Array(cell) => cell .last_before(t) .map(|(t, v)| (t, Prop::Array(v.clone()))), - TProp::Document(cell) => cell - .last_before(t) - .map(|(t, v)| (t, Prop::Document(v.clone()))), TProp::List(cell) => cell.last_before(t).map(|(t, v)| (t, Prop::List(v.clone()))), TProp::Map(cell) => cell.last_before(t).map(|(t, v)| (t, Prop::Map(v.clone()))), } @@ -373,7 +354,6 @@ impl<'a> TPropOps<'a> for &'a TProp { TProp::DTime(cell) => cell.at(ti).map(|v| Prop::DTime(*v)), TProp::NDTime(cell) => cell.at(ti).map(|v| Prop::NDTime(*v)), TProp::Array(cell) => cell.at(ti).map(|v| Prop::Array(v.clone())), - TProp::Document(cell) => cell.at(ti).map(|v| Prop::Document(v.clone())), TProp::List(cell) => cell.at(ti).map(|v| Prop::List(v.clone())), TProp::Map(cell) => cell.at(ti).map(|v| Prop::Map(v.clone())), } diff --git a/raphtory/src/core/mod.rs b/raphtory/src/core/mod.rs index 40892b786..0525ffa3c 100644 --- a/raphtory/src/core/mod.rs +++ b/raphtory/src/core/mod.rs @@ -40,6 +40,7 @@ use std::{ use utils::errors::GraphError; use arrow_schema::{DataType, Field}; +use rustc_hash::FxHashMap; #[cfg(test)] extern crate core; @@ -94,11 +95,10 @@ pub enum 
Prop { F64(f64), Bool(bool), List(Arc>), - Map(Arc>), + Map(Arc>), NDTime(NaiveDateTime), DTime(DateTime), Array(PropArray), - Document(DocumentInput), } impl Hash for Prop { @@ -134,7 +134,6 @@ impl Hash for Prop { prop.hash(state); } } - Prop::Document(d) => d.hash(state), } } } @@ -164,7 +163,7 @@ impl PartialOrd for Prop { impl Prop { pub fn map(vals: impl IntoIterator, Prop)>) -> Self { - let h_map: HashMap<_, _> = vals.into_iter().map(|(k, v)| (k.into(), v)).collect(); + let h_map: FxHashMap<_, _> = vals.into_iter().map(|(k, v)| (k.into(), v)).collect(); Prop::Map(h_map.into()) } @@ -188,8 +187,22 @@ impl Prop { Prop::F32(_) => PropType::F32, Prop::F64(_) => PropType::F64, Prop::Bool(_) => PropType::Bool, - Prop::List(_) => PropType::List, - Prop::Map(_) => PropType::Map, + Prop::List(list) => { + let list_type = list + .iter() + .map(|p| Ok(p.dtype())) + .reduce(|a, b| unify_types(&a?, &b?, &mut false)) + .transpose() + .map(|e| e.unwrap_or(PropType::Empty)) + .expect(&format!("Cannot unify types for list {:?}", list)); + PropType::List(Box::new(list_type)) + } + Prop::Map(map) => PropType::Map( + map.iter() + .map(|(k, prop)| (k.to_string(), prop.dtype())) + .sorted_by(|(k1, _), (k2, _)| k1.cmp(k2)) + .collect(), + ), Prop::NDTime(_) => PropType::NDTime, Prop::Array(arr) => { let arrow_dtype = arr @@ -198,7 +211,6 @@ impl Prop { .data_type(); PropType::Array(Box::new(prop_type_from_arrow_dtype(arrow_dtype))) } - Prop::Document(_) => PropType::Document, Prop::DTime(_) => PropType::DTime, } } @@ -265,15 +277,38 @@ pub fn arrow_dtype_from_prop_type(prop_type: &PropType) -> Result Ok(DataType::Float32), PropType::F64 => Ok(DataType::Float64), PropType::Bool => Ok(DataType::Boolean), + PropType::NDTime => Ok(DataType::Timestamp( + arrow_schema::TimeUnit::Millisecond, + None, + )), + PropType::DTime => Ok(DataType::Timestamp( + arrow_schema::TimeUnit::Millisecond, + Some("UTC".into()), + )), PropType::Array(d_type) => Ok(DataType::List( Field::new("data", arrow_dtype_from_prop_type(&d_type)?, true).into(), )), - PropType::Empty - | PropType::List - | PropType::Map - | PropType::NDTime - | PropType::Document - | PropType::DTime => Err(GraphError::UnsupportedArrowDataType(prop_type.clone())), //panic!("{prop_type:?} not supported as disk_graph property"), + + PropType::List(d_type) => Ok(DataType::List( + Field::new("data", arrow_dtype_from_prop_type(&d_type)?, true).into(), + )), + PropType::Map(d_type) => { + let fields = d_type + .iter() + .map(|(k, v)| { + Ok::<_, GraphError>(Field::new( + k.to_string(), + arrow_dtype_from_prop_type(v)?, + true, + )) + }) + .collect::, _>>()?; + Ok(DataType::Struct(fields.into())) + } + PropType::Empty => { + // this is odd, we'll just pick one and hope for the best + Ok(DataType::Null) + } } } @@ -353,8 +388,8 @@ pub trait PropUnwrap: Sized { self.into_list().unwrap() } - fn into_map(self) -> Option>>; - fn unwrap_map(self) -> Arc> { + fn into_map(self) -> Option>>; + fn unwrap_map(self) -> Arc> { self.into_map().unwrap() } @@ -363,11 +398,6 @@ pub trait PropUnwrap: Sized { self.into_ndtime().unwrap() } - fn into_document(self) -> Option; - fn unwrap_document(self) -> DocumentInput { - self.into_document().unwrap() - } - fn into_array(self) -> Option; fn unwrap_array(self) -> ArrayRef { self.into_array().unwrap() @@ -421,7 +451,7 @@ impl PropUnwrap for Option
<P>
{ self.and_then(|p| p.into_list()) } - fn into_map(self) -> Option<Arc<HashMap<ArcStr, Prop>>> { + fn into_map(self) -> Option<Arc<FxHashMap<ArcStr, Prop>>> { self.and_then(|p| p.into_map()) } @@ -429,10 +459,6 @@ impl PropUnwrap for Option<P>
{ self.and_then(|p| p.into_ndtime()) } - fn into_document(self) -> Option { - self.and_then(|p| p.into_document()) - } - fn into_array(self) -> Option { self.and_then(|p| p.into_array()) } @@ -531,7 +557,7 @@ impl PropUnwrap for Prop { } } - fn into_map(self) -> Option>> { + fn into_map(self) -> Option>> { if let Prop::Map(v) = self { Some(v) } else { @@ -547,14 +573,6 @@ impl PropUnwrap for Prop { } } - fn into_document(self) -> Option { - if let Prop::Document(d) = self { - Some(d) - } else { - None - } - } - fn into_array(self) -> Option { if let Prop::Array(v) = self { v.into_array_ref() @@ -632,7 +650,6 @@ impl Display for Prop { .join(", ") ) } - Prop::Document(value) => write!(f, "{}", value), } } } @@ -740,6 +757,12 @@ impl From for Prop { impl From> for Prop { fn from(value: HashMap) -> Self { + Prop::Map(Arc::new(value.into_iter().collect())) + } +} + +impl From> for Prop { + fn from(value: FxHashMap) -> Self { Prop::Map(Arc::new(value)) } } @@ -817,10 +840,6 @@ impl From for Value { } Prop::NDTime(value) => Value::String(value.to_string()), Prop::DTime(value) => Value::String(value.to_string()), - Prop::Document(doc) => json!({ - "content": doc.content, - "life": Value::from(doc.life), - }), _ => Value::Null, } } diff --git a/raphtory/src/core/storage/mod.rs b/raphtory/src/core/storage/mod.rs index f9f8f6bf3..509042a65 100644 --- a/raphtory/src/core/storage/mod.rs +++ b/raphtory/src/core/storage/mod.rs @@ -1,3 +1,4 @@ +use super::{utils::errors::GraphError, Prop, PropArray}; use crate::core::entities::nodes::node_store::NodeStore; use lazy_vec::LazyVec; use lock_api; @@ -8,9 +9,9 @@ use raphtory_api::core::{ storage::arc_str::ArcStr, }; use rayon::prelude::*; +use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; use std::{ - collections::HashMap, fmt::Debug, marker::PhantomData, ops::{Deref, DerefMut, Index, IndexMut}, @@ -20,8 +21,6 @@ use std::{ }, }; -use super::{utils::errors::GraphError, DocumentInput, Prop, PropArray}; - pub mod lazy_vec; pub mod locked_view; pub mod node_entry; @@ -132,10 +131,9 @@ pub(crate) enum TPropColumn { Str(LazyVec), Array(LazyVec), List(LazyVec>>), - Map(LazyVec>>), + Map(LazyVec>>), NDTime(LazyVec), DTime(LazyVec>), - Document(LazyVec), } impl Default for TPropColumn { @@ -175,7 +173,6 @@ impl TPropColumn { (TPropColumn::Map(col), Prop::Map(v)) => col.set(index, v)?, (TPropColumn::NDTime(col), Prop::NDTime(v)) => col.set(index, v)?, (TPropColumn::DTime(col), Prop::DTime(v)) => col.set(index, v)?, - (TPropColumn::Document(col), Prop::Document(v)) => col.set(index, v)?, _ => return Err(GraphError::IncorrectPropertyType), } Ok(()) @@ -185,6 +182,7 @@ impl TPropColumn { self.init_empty_col(&prop)?; match (self, prop) { (TPropColumn::Bool(col), Prop::Bool(v)) => col.push(Some(v)), + (TPropColumn::U8(col), Prop::U8(v)) => col.push(Some(v)), (TPropColumn::I64(col), Prop::I64(v)) => col.push(Some(v)), (TPropColumn::U32(col), Prop::U32(v)) => col.push(Some(v)), (TPropColumn::U64(col), Prop::U64(v)) => col.push(Some(v)), @@ -198,7 +196,6 @@ impl TPropColumn { (TPropColumn::Map(col), Prop::Map(v)) => col.push(Some(v)), (TPropColumn::NDTime(col), Prop::NDTime(v)) => col.push(Some(v)), (TPropColumn::DTime(col), Prop::DTime(v)) => col.push(Some(v)), - (TPropColumn::Document(col), Prop::Document(v)) => col.push(Some(v)), _ => return Err(GraphError::IncorrectPropertyType), } Ok(()) @@ -222,7 +219,6 @@ impl TPropColumn { Prop::Map(_) => *self = TPropColumn::Map(LazyVec::with_len(*len)), Prop::NDTime(_) => *self = 
TPropColumn::NDTime(LazyVec::with_len(*len)), Prop::DTime(_) => *self = TPropColumn::DTime(LazyVec::with_len(*len)), - Prop::Document(_) => *self = TPropColumn::Document(LazyVec::with_len(*len)), }, _ => {} } @@ -250,7 +246,6 @@ impl TPropColumn { TPropColumn::Map(col) => col.push(None), TPropColumn::NDTime(col) => col.push(None), TPropColumn::DTime(col) => col.push(None), - TPropColumn::Document(col) => col.push(None), TPropColumn::Empty(count) => { *count += 1; } @@ -274,9 +269,6 @@ impl TPropColumn { TPropColumn::Map(col) => col.get_opt(index).map(|prop| Prop::Map(prop.clone())), TPropColumn::NDTime(col) => col.get_opt(index).map(|prop| Prop::NDTime(prop.clone())), TPropColumn::DTime(col) => col.get_opt(index).map(|prop| Prop::DTime(prop.clone())), - TPropColumn::Document(col) => { - col.get_opt(index).map(|prop| Prop::Document(prop.clone())) - } TPropColumn::Empty(_) => None, } } @@ -298,7 +290,6 @@ impl TPropColumn { TPropColumn::Map(col) => col.len(), TPropColumn::NDTime(col) => col.len(), TPropColumn::DTime(col) => col.len(), - TPropColumn::Document(col) => col.len(), TPropColumn::Empty(count) => *count, } } diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs index e392c6483..42cf0485d 100644 --- a/raphtory/src/core/utils/errors.rs +++ b/raphtory/src/core/utils/errors.rs @@ -1,4 +1,6 @@ use crate::core::{storage::lazy_vec::IllegalSet, utils::time::error::ParseTimeError, Prop}; +#[cfg(feature = "io")] +use parquet::errors::ParquetError; #[cfg(feature = "arrow")] use polars_arrow::{datatypes::ArrowDataType, legacy::error}; #[cfg(feature = "storage")] @@ -8,7 +10,7 @@ use pyo3::PyErr; #[cfg(feature = "arrow")] use raphtory_api::core::entities::GidType; use raphtory_api::core::{ - entities::{properties::PropError, GID}, + entities::{properties::PropError, GID, VID}, storage::arc_str::ArcStr, PropType, }; @@ -61,6 +63,8 @@ pub enum LoadError { MissingNodeError, #[error("Missing value for timestamp")] MissingTimeError, + #[error("Missing value for edge id {0:?} -> {1:?}")] + MissingEdgeError(VID, VID), #[error("Node IDs have the wrong type, expected {existing}, got {new}")] NodeIdTypeError { existing: GidType, new: GidType }, #[error("Fatal load error, graph may be in a dirty state.")] @@ -90,6 +94,10 @@ pub enum GraphError { Arrow(#[from] error::PolarsError), #[error("Arrow-rs error: {0}")] ArrowRs(#[from] arrow_schema::ArrowError), + + #[cfg(feature = "io")] + #[error("Arrow-rs parquet error: {0}")] + ParquetError(#[from] ParquetError), #[error("Invalid path: {source}")] InvalidPath { #[from] diff --git a/raphtory/src/db/api/properties/temporal_props.rs b/raphtory/src/db/api/properties/temporal_props.rs index 1f2383728..85f8193a5 100644 --- a/raphtory/src/db/api/properties/temporal_props.rs +++ b/raphtory/src/db/api/properties/temporal_props.rs @@ -1,10 +1,11 @@ use crate::{ - core::{DocumentInput, Prop, PropType, PropUnwrap}, + core::{Prop, PropType, PropUnwrap}, db::api::{properties::internal::PropertiesOps, view::BoxedLIter}, }; use arrow_array::ArrayRef; use chrono::{DateTime, NaiveDateTime, Utc}; use raphtory_api::core::storage::arc_str::ArcStr; +use rustc_hash::FxHashMap; use std::{ collections::{HashMap, HashSet}, iter::Zip, @@ -229,7 +230,7 @@ impl PropUnwrap for TemporalPropertyView
<P>
{ self.latest().into_list() } - fn into_map(self) -> Option<Arc<HashMap<ArcStr, Prop>>> { + fn into_map(self) -> Option<Arc<FxHashMap<ArcStr, Prop>>> { self.latest().into_map() } @@ -241,10 +242,6 @@ impl PropUnwrap for TemporalPropertyView<P>
{ self.latest().into_array() } - fn into_document(self) -> Option { - self.latest().into_document() - } - fn as_f64(&self) -> Option { self.latest().as_f64() } diff --git a/raphtory/src/db/api/storage/graph/locked.rs b/raphtory/src/db/api/storage/graph/locked.rs index 122d42c2d..01e01624c 100644 --- a/raphtory/src/db/api/storage/graph/locked.rs +++ b/raphtory/src/db/api/storage/graph/locked.rs @@ -82,6 +82,15 @@ impl<'a> WriteLockedGraph<'a> { .get_or_init(gid, || self.graph.storage.nodes.next_id()) } + pub fn resolve_node_type( + &self, + node_type: Option<&str>, + ) -> Result, GraphError> { + node_type + .map(|node_type| Ok(self.graph.node_meta.get_or_create_node_type_id(node_type))) + .unwrap_or_else(|| Ok(MaybeNew::Existing(0))) + } + pub fn num_shards(&self) -> usize { self.nodes.num_shards().max(self.edges.num_shards()) } diff --git a/raphtory/src/db/api/storage/graph/storage_ops/additions.rs b/raphtory/src/db/api/storage/graph/storage_ops/additions.rs index 4accccc17..78f066440 100644 --- a/raphtory/src/db/api/storage/graph/storage_ops/additions.rs +++ b/raphtory/src/db/api/storage/graph/storage_ops/additions.rs @@ -120,7 +120,7 @@ impl InternalAdditionOps for TemporalGraph { is_static: bool, ) -> Result, GraphError> { self.node_meta - .resolve_prop_id(prop, dtype, is_static) + .resolve_prop_id(prop, dtype.clone(), is_static) .map_err(|e| e.into()) } diff --git a/raphtory/src/db/api/storage/graph/storage_ops/materialize.rs b/raphtory/src/db/api/storage/graph/storage_ops/materialize.rs index b94b818ff..9578efa64 100644 --- a/raphtory/src/db/api/storage/graph/storage_ops/materialize.rs +++ b/raphtory/src/db/api/storage/graph/storage_ops/materialize.rs @@ -1,5 +1,6 @@ use super::GraphStorage; -use crate::db::api::view::internal::{GraphType, InternalMaterialize}; +use crate::db::api::view::internal::InternalMaterialize; +use raphtory_api::GraphType; impl InternalMaterialize for GraphStorage { fn graph_type(&self) -> GraphType { diff --git a/raphtory/src/db/api/view/edge_property_filter.rs b/raphtory/src/db/api/view/edge_property_filter.rs index b04109341..d3ce136c3 100644 --- a/raphtory/src/db/api/view/edge_property_filter.rs +++ b/raphtory/src/db/api/view/edge_property_filter.rs @@ -1,11 +1,12 @@ use crate::{ core::utils::errors::GraphError, db::{ - api::view::internal::{GraphType, InternalMaterialize, OneHopFilter}, + api::view::internal::{InternalMaterialize, OneHopFilter}, graph::views::property_filter::internal::InternalEdgeFilterOps, }, prelude::GraphViewOps, }; +use raphtory_api::GraphType; pub trait EdgePropertyFilterOps<'graph>: OneHopFilter<'graph> { fn filter_edges( diff --git a/raphtory/src/db/api/view/exploded_edge_property_filter.rs b/raphtory/src/db/api/view/exploded_edge_property_filter.rs index 5a3bc0d67..34ef3a254 100644 --- a/raphtory/src/db/api/view/exploded_edge_property_filter.rs +++ b/raphtory/src/db/api/view/exploded_edge_property_filter.rs @@ -1,11 +1,12 @@ use crate::{ core::utils::errors::GraphError, db::{ - api::view::internal::{GraphType, InternalMaterialize, OneHopFilter}, + api::view::internal::{InternalMaterialize, OneHopFilter}, graph::views::property_filter::internal::InternalExplodedEdgeFilterOps, }, prelude::GraphViewOps, }; +use raphtory_api::GraphType; pub trait ExplodedEdgePropertyFilterOps<'graph>: OneHopFilter<'graph> { fn filter_exploded_edges( diff --git a/raphtory/src/db/api/view/internal/materialize.rs b/raphtory/src/db/api/view/internal/materialize.rs index 2c9349538..2fd54bf43 100644 --- a/raphtory/src/db/api/view/internal/materialize.rs 
+++ b/raphtory/src/db/api/view/internal/materialize.rs @@ -37,9 +37,12 @@ use crate::{ }; use chrono::{DateTime, Utc}; use enum_dispatch::enum_dispatch; -use raphtory_api::core::{ - entities::GidType, - storage::{arc_str::ArcStr, dict_mapper::MaybeNew}, +use raphtory_api::{ + core::{ + entities::GidType, + storage::{arc_str::ArcStr, dict_mapper::MaybeNew}, + }, + GraphType, }; use serde::{Deserialize, Serialize}; @@ -61,11 +64,6 @@ pub enum MaterializedGraph { PersistentGraph(PersistentGraph), } -pub enum GraphType { - EventGraph, - PersistentGraph, -} - impl Static for MaterializedGraph {} impl MaterializedGraph { diff --git a/raphtory/src/db/api/view/time.rs b/raphtory/src/db/api/view/time.rs index 40cb6e222..664ea732c 100644 --- a/raphtory/src/db/api/view/time.rs +++ b/raphtory/src/db/api/view/time.rs @@ -9,13 +9,12 @@ use crate::{ }, }; use chrono::{DateTime, Utc}; +use raphtory_api::GraphType; use std::{ cmp::{max, min}, marker::PhantomData, }; -use super::internal::GraphType; - pub(crate) mod internal { use crate::{ db::{api::view::internal::OneHopFilter, graph::views::window_graph::WindowedGraph}, diff --git a/raphtory/src/db/graph/graph.rs b/raphtory/src/db/graph/graph.rs index 2d76b6496..50fcb4072 100644 --- a/raphtory/src/db/graph/graph.rs +++ b/raphtory/src/db/graph/graph.rs @@ -2382,12 +2382,11 @@ mod db_tests { ); let data = vec![ - ("key1".into(), Prop::I64(10)), - ("key2".into(), Prop::I64(20)), - ("key3".into(), Prop::I64(30)), + ("key1", Prop::I64(10)), + ("key2", Prop::I64(20)), + ("key3", Prop::I64(30)), ]; - let props_map = data.into_iter().collect::>(); - let as_props: Vec<(&str, Prop)> = vec![("mylist2", Prop::Map(Arc::from(props_map)))]; + let as_props: Vec<(&str, Prop)> = vec![("mylist2", Prop::map(data))]; g.add_constant_properties(as_props.clone()).unwrap(); diff --git a/raphtory/src/db/graph/views/deletion_graph.rs b/raphtory/src/db/graph/views/deletion_graph.rs index f9b3e497d..edc62aa62 100644 --- a/raphtory/src/db/graph/views/deletion_graph.rs +++ b/raphtory/src/db/graph/views/deletion_graph.rs @@ -25,6 +25,7 @@ use crate::{ prelude::*, }; use itertools::Itertools; +use raphtory_api::GraphType; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::{ diff --git a/raphtory/src/disk_graph/graph_impl/prop_conversion.rs b/raphtory/src/disk_graph/graph_impl/prop_conversion.rs index 3a579fb88..aff993f97 100644 --- a/raphtory/src/disk_graph/graph_impl/prop_conversion.rs +++ b/raphtory/src/disk_graph/graph_impl/prop_conversion.rs @@ -139,11 +139,10 @@ pub fn arrow_array_from_props( array.iter().any(|v| v.is_some()).then_some(array.boxed()) } PropType::Empty - | PropType::List - | PropType::Map + | PropType::List(_) + | PropType::Map(_) | PropType::NDTime | PropType::Array(_) - | PropType::Document | PropType::DTime => panic!("{prop_type:?} not supported as disk_graph property"), } } @@ -185,11 +184,10 @@ pub fn schema_from_prop_meta(prop_map: &PropMapper) -> Schema { schema.push(Field::new(key, DataType::Boolean, true)); } prop_type @ (PropType::Empty - | PropType::List - | PropType::Map + | PropType::List(_) + | PropType::Map(_) | PropType::NDTime | PropType::Array(_) - | PropType::Document | PropType::DTime) => panic!("{:?} not supported as disk_graph property", prop_type), } } diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index 6eae6a36b..7eef62f0e 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -1,6 +1,6 @@ use crate::{ core::{ - entities::LayerIds, + 
entities::{nodes::node_ref::AsNodeRef, LayerIds}, utils::errors::{GraphError, LoadError}, PropType, }, @@ -8,7 +8,6 @@ use crate::{ io::arrow::{ dataframe::{DFChunk, DFView}, layer_col::{lift_layer_col, lift_node_type_col}, - node_col::lift_node_col, prop_handler::*, }, prelude::*, @@ -53,21 +52,18 @@ fn process_shared_properties( } pub(crate) fn load_nodes_from_df< - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( df_view: DFView>>, time: &str, node_id: &str, - properties: Option<&[&str]>, - constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, node_type: Option<&str>, node_type_col: Option<&str>, graph: &G, ) -> Result<(), GraphError> { - let properties = properties.unwrap_or(&[]); - let constant_properties = constant_properties.unwrap_or(&[]); - let properties_indices = properties .iter() .map(|name| df_view.get_index(name)) @@ -95,6 +91,17 @@ pub(crate) fn load_nodes_from_df< #[cfg(feature = "python")] let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; + let mut node_col_resolved = vec![]; + let mut node_type_col_resolved = vec![]; + + let cache = graph.get_cache(); + let mut write_locked_graph = graph.write_lock()?; + let cache_shards = cache.map(|cache| { + (0..write_locked_graph.num_shards()) + .map(|_| cache.fork()) + .collect::>() + }); + let mut start_id = graph.reserve_event_ids(df_view.num_rows)?; for chunk in df_view.chunks { let df = chunk?; @@ -108,38 +115,98 @@ pub(crate) fn load_nodes_from_df< |key, dtype| graph.resolve_node_property(key, dtype, true), )?; let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + let time_col = df.time_col(time_index)?; let node_col = df.node_col(node_id_index)?; + node_col_resolved.resize_with(df.len(), Default::default); + node_type_col_resolved.resize_with(df.len(), Default::default); + node_col .par_iter() - .zip(time_col.par_iter()) + .zip(node_col_resolved.par_iter_mut()) .zip(node_type_col.par_iter()) - .zip(prop_cols.par_rows()) - .zip(const_prop_cols.par_rows()) - .enumerate() - .try_for_each(|(id, ((((node, time), node_type), t_props), c_props))| { - let node = node.ok_or(LoadError::MissingNodeError)?; - let time = time.ok_or(LoadError::MissingTimeError)?; - let node_id = match node_type { - None => graph.resolve_node(node)?.inner(), - Some(node_type) => graph - .resolve_node_and_type(node, node_type)? - .inner() - .0 - .inner(), - }; - let t = TimeIndexEntry(time, start_id + id); - let t_props: Vec<_> = t_props.collect(); - graph.internal_add_node(t, node_id, &t_props)?; - let c_props: Vec<_> = c_props - .chain(shared_constant_properties.iter().cloned()) - .collect(); - if !c_props.is_empty() { - graph.internal_add_constant_node_properties(node_id, &c_props)?; + .zip(node_type_col_resolved.par_iter_mut()) + .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; + let vid = write_locked_graph + .resolve_node(gid) + .map_err(|_| LoadError::FatalError)?; + let node_type_res = write_locked_graph + .resolve_node_type(node_type) + .map_err(|_| LoadError::FatalError)? 
+ .inner(); + *node_type_resolved = node_type_res; + if let Some(cache) = cache { + cache.resolve_node(vid, gid); } - Ok::<(), GraphError>(()) + *resolved = vid.inner(); + Ok::<(), LoadError>(()) })?; + + let g = write_locked_graph.graph; + let update_time = |time| g.update_time(time); + + write_locked_graph + .nodes + .resize(write_locked_graph.num_nodes()); + + write_locked_graph + .nodes + .par_iter_mut() + .try_for_each(|mut shard| { + let mut t_props = vec![]; + let mut c_props = vec![]; + + for (idx, (((vid, time), node_type), gid)) in node_col_resolved + .iter() + .zip(time_col.iter()) + .zip(node_type_col_resolved.iter()) + .zip(node_col.iter()) + .enumerate() + { + let shard_id = shard.shard_id(); + let node_exists = if let Some(mut_node) = shard.get_mut(*vid) { + mut_node.init(*vid, gid); + mut_node.node_type = *node_type; + t_props.clear(); + t_props.extend(prop_cols.iter_row(idx)); + + c_props.clear(); + c_props.extend(const_prop_cols.iter_row(idx)); + c_props.extend_from_slice(&shared_constant_properties); + + if let Some(caches) = cache_shards.as_ref() { + let cache = &caches[shard_id]; + cache.add_node_update( + TimeIndexEntry(time, start_id + idx), + *vid, + &t_props, + ); + cache.add_node_cprops(*vid, &c_props); + } + + for (id, prop) in c_props.drain(..) { + mut_node.add_constant_prop(id, prop)?; + } + + true + } else { + false + }; + + if node_exists { + let t = TimeIndexEntry(time, start_id + idx); + update_time(t); + let prop_i = shard.t_prop_log_mut().push(t_props.drain(..))?; + if let Some(mut_node) = shard.get_mut(*vid) { + mut_node.update_t_prop_time(t, prop_i); + } + } + } + Ok::<_, GraphError>(()) + })?; + #[cfg(feature = "python")] let _ = pb.update(df.len()); start_id += df.len(); @@ -148,23 +215,19 @@ pub(crate) fn load_nodes_from_df< } pub(crate) fn load_edges_from_df< - 'a, G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( df_view: DFView>>, time: &str, src: &str, dst: &str, - properties: Option<&[&str]>, - constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, graph: &G, ) -> Result<(), GraphError> { - let properties = properties.unwrap_or(&[]); - let constant_properties = constant_properties.unwrap_or(&[]); - let properties_indices = properties .iter() .map(|name| df_view.get_index(name)) @@ -207,11 +270,11 @@ pub(crate) fn load_edges_from_df< for chunk in df_view.chunks { let df = chunk?; - let prop_cols = combine_properties(properties, &properties_indices, &df, |key, dtype| { + let prop_cols = combine_properties(&properties, &properties_indices, &df, |key, dtype| { graph.resolve_edge_property(key, dtype, false) })?; let const_prop_cols = combine_properties( - constant_properties, + &constant_properties, &constant_properties_indices, &df, |key, dtype| graph.resolve_edge_property(key, dtype, true), @@ -450,74 +513,123 @@ pub(crate) fn load_edge_deletions_from_df< pub(crate) fn load_node_props_from_df< 'a, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( df_view: DFView>>, node_id: &str, node_type: Option<&str>, node_type_col: Option<&str>, - constant_properties: Option<&[&str]>, + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, graph: &G, ) -> Result<(), GraphError> { - let constant_properties = constant_properties.unwrap_or(&[]); let 
constant_properties_indices = constant_properties .iter() .map(|name| df_view.get_index(name)) .collect::, GraphError>>()?; - let node_id_index = df_view.get_index(node_id)?; + let node_type_index = if let Some(node_type_col) = node_type_col { - Some(df_view.get_index(node_type_col.as_ref())?) + Some(df_view.get_index(node_type_col.as_ref())) } else { None }; - let shared_constant_properties = match shared_constant_properties { - Some(props) => props - .iter() - .map(|(name, prop)| { - Ok(( - graph - .resolve_node_property(name, prop.dtype(), true)? - .inner(), - prop.clone(), - )) - }) - .collect::, GraphError>>()?, - None => vec![], - }; + let node_type_index = node_type_index.transpose()?; + + let node_id_index = df_view.get_index(node_id)?; + + let shared_constant_properties = + process_shared_properties(shared_constant_properties, |key, dtype| { + graph.resolve_node_property(key, dtype, true) + })?; + #[cfg(feature = "python")] let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; + + let mut node_col_resolved = vec![]; + let mut node_type_col_resolved = vec![]; + + let cache = graph.get_cache(); + let mut write_locked_graph = graph.write_lock()?; + let cache_shards = cache.map(|cache| { + (0..write_locked_graph.num_shards()) + .map(|_| cache.fork()) + .collect::>() + }); + for chunk in df_view.chunks { let df = chunk?; - let const_props = combine_properties( + let const_prop_cols = combine_properties( constant_properties, &constant_properties_indices, &df, - |name, dtype| graph.resolve_node_property(name, dtype, true), + |key, dtype| graph.resolve_node_property(key, dtype, true), )?; - let node_col = df.node_col(node_id_index)?; let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + let node_col = df.node_col(node_id_index)?; + + node_col_resolved.resize_with(df.len(), Default::default); + node_type_col_resolved.resize_with(df.len(), Default::default); node_col .par_iter() + .zip(node_col_resolved.par_iter_mut()) .zip(node_type_col.par_iter()) - .zip(const_props.par_rows()) - .try_for_each(|((node_id, node_type), cprops)| { - let node_id = node_id.ok_or(LoadError::MissingNodeError)?; - let node = graph - .node(node_id) - .ok_or_else(|| GraphError::NodeMissingError(node_id.to_owned()))?; - if let Some(node_type) = node_type { - node.set_node_type(node_type)?; + .zip(node_type_col_resolved.par_iter_mut()) + .try_for_each(|(((gid, resolved), node_type), node_type_resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; + let vid = write_locked_graph + .resolve_node(gid) + .map_err(|_| LoadError::FatalError)?; + let node_type_res = write_locked_graph + .resolve_node_type(node_type) + .map_err(|_| LoadError::FatalError)? 
+ .inner(); + *node_type_resolved = node_type_res; + if let Some(cache) = cache { + cache.resolve_node(vid, gid); } - let props = cprops - .chain(shared_constant_properties.iter().cloned()) - .collect::>(); - if !props.is_empty() { - graph.internal_add_constant_node_properties(node.node, &props)?; + *resolved = vid.inner(); + Ok::<(), LoadError>(()) + })?; + + write_locked_graph + .nodes + .resize(write_locked_graph.num_nodes()); + + write_locked_graph + .nodes + .par_iter_mut() + .try_for_each(|mut shard| { + let mut c_props = vec![]; + + for (idx, ((vid, node_type), gid)) in node_col_resolved + .iter() + .zip(node_type_col_resolved.iter()) + .zip(node_col.iter()) + .enumerate() + { + let shard_id = shard.shard_id(); + if let Some(mut_node) = shard.get_mut(*vid) { + mut_node.init(*vid, gid); + mut_node.node_type = *node_type; + + c_props.clear(); + c_props.extend(const_prop_cols.iter_row(idx)); + c_props.extend_from_slice(&shared_constant_properties); + + if let Some(caches) = cache_shards.as_ref() { + let cache = &caches[shard_id]; + cache.add_node_cprops(*vid, &c_props); + } + + for (id, prop) in c_props.drain(..) { + mut_node.add_constant_prop(id, prop)?; + } + }; } - Ok::<(), GraphError>(()) + Ok::<_, GraphError>(()) })?; + #[cfg(feature = "python")] let _ = pb.update(df.len()); } @@ -525,85 +637,162 @@ pub(crate) fn load_node_props_from_df< } pub(crate) fn load_edges_props_from_df< - 'a, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( df_view: DFView>>, src: &str, dst: &str, - constant_properties: Option<&[&str]>, - shared_constant_properties: Option<&HashMap>, + constant_properties: &[&str], + shared_const_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, graph: &G, ) -> Result<(), GraphError> { - let constant_properties = constant_properties.unwrap_or(&[]); let constant_properties_indices = constant_properties .iter() .map(|name| df_view.get_index(name)) .collect::, GraphError>>()?; + let src_index = df_view.get_index(src)?; let dst_index = df_view.get_index(dst)?; let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())) + Some(df_view.get_index(layer_col.as_ref())?) } else { None }; - let layer_index = layer_index.transpose()?; + let shared_constant_properties = + process_shared_properties(shared_const_properties, |key, dtype| { + graph.resolve_edge_property(key, dtype, true) + })?; + #[cfg(feature = "python")] let mut pb = build_progress_bar("Loading edge properties".to_string(), df_view.num_rows)?; - let shared_constant_properties = match shared_constant_properties { - None => { - vec![] - } - Some(props) => props - .iter() - .map(|(key, prop)| { - Ok(( - graph - .resolve_edge_property(key, prop.dtype(), true)? 
- .inner(), - prop.clone(), - )) - }) - .collect::, GraphError>>()?, - }; + #[cfg(feature = "python")] + let _ = pb.update(0); + + let mut src_col_resolved = vec![]; + let mut dst_col_resolved = vec![]; + let mut eid_col_resolved = vec![]; + + let cache = graph.get_cache(); + let mut write_locked_graph = graph.write_lock()?; + let cache_shards = cache.map(|cache| { + (0..write_locked_graph.num_shards()) + .map(|_| cache.fork()) + .collect::>() + }); + + let g = write_locked_graph.graph; for chunk in df_view.chunks { let df = chunk?; - let const_prop_iter = combine_properties( - constant_properties, + let const_prop_cols = combine_properties( + &constant_properties, &constant_properties_indices, &df, - |name, dtype| graph.resolve_edge_property(name, dtype, true), + |key, dtype| graph.resolve_edge_property(key, dtype, true), )?; - let layer = lift_layer_col(layer, layer_index, &df)?; - let src_col = lift_node_col(src_index, &df)?; - let dst_col = lift_node_col(dst_index, &df)?; + let layer_col_resolved = layer.resolve(graph)?; + + let src_col = df.node_col(src_index)?; + src_col.validate(graph, LoadError::MissingSrcError)?; + + let dst_col = df.node_col(dst_index)?; + dst_col.validate(graph, LoadError::MissingDstError)?; + + // It's our graph, no one else can change it + src_col_resolved.resize_with(df.len(), Default::default); src_col .par_iter() - .zip(dst_col.par_iter()) - .zip(layer.par_iter()) - .zip(const_prop_iter.par_rows()) - .try_for_each(|(((src, dst), layer), cprops)| { - let src = src.ok_or(LoadError::MissingSrcError)?; - let dst = dst.ok_or(LoadError::MissingDstError)?; - let e = graph - .edge(src, dst) - .ok_or_else(|| GraphError::EdgeMissingError { - src: src.to_owned(), - dst: dst.to_owned(), - })?; - let layer_id = graph.resolve_layer(layer)?.inner(); - let props = cprops - .chain(shared_constant_properties.iter().cloned()) - .collect::>(); - if !props.is_empty() { - graph.internal_add_constant_edge_properties(e.edge.pid(), layer_id, &props)?; + .zip(src_col_resolved.par_iter_mut()) + .try_for_each(|(gid, resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; + let vid = g + .resolve_node_ref(gid.as_node_ref()) + .ok_or(LoadError::MissingNodeError)?; + *resolved = vid; + Ok::<(), LoadError>(()) + })?; + + dst_col_resolved.resize_with(df.len(), Default::default); + dst_col + .par_iter() + .zip(dst_col_resolved.par_iter_mut()) + .try_for_each(|(gid, resolved)| { + let gid = gid.ok_or(LoadError::FatalError)?; + let vid = g + .resolve_node_ref(gid.as_node_ref()) + .ok_or(LoadError::MissingNodeError)?; + *resolved = vid; + Ok::<(), LoadError>(()) + })?; + + // resolve all the edges + eid_col_resolved.resize_with(df.len(), Default::default); + let eid_col_shared = atomic_usize_from_mut_slice(cast_slice_mut(&mut eid_col_resolved)); + write_locked_graph + .nodes + .par_iter_mut() + .try_for_each(|shard| { + for (row, (src, dst)) in src_col_resolved + .iter() + .zip(dst_col_resolved.iter()) + .enumerate() + { + if let Some(src_node) = shard.get(*src) { + // we know this is here + let EID(eid) = src_node + .find_edge_eid(*dst, &LayerIds::All) + .ok_or(LoadError::MissingEdgeError(*src, *dst))?; + eid_col_shared[row].store(eid, Ordering::Relaxed); + } + } + Ok::<_, LoadError>(()) + })?; + + write_locked_graph + .edges + .par_iter_mut() + .try_for_each(|mut shard| { + let mut c_props = vec![]; + for (idx, (eid, layer)) in eid_col_resolved + .iter() + .zip(layer_col_resolved.iter()) + .enumerate() + { + let shard_id = shard.shard_id(); + if let Some(mut edge) = shard.get_mut(*eid) { 
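+ // Rebuild this row's constant properties (the column values plus the
+ // shared ones), mirror them into the per-shard cache when caching is
+ // enabled, then apply them to the edge's per-layer store.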
+ c_props.clear(); + c_props.extend(const_prop_cols.iter_row(idx)); + c_props.extend_from_slice(&shared_constant_properties); + + if let Some(caches) = cache_shards.as_ref() { + let cache = &caches[shard_id]; + cache.add_edge_cprops(*eid, *layer, &c_props); + } + + if !c_props.is_empty() { + let edge_layer = edge.layer_mut(*layer); + + for (id, prop) in c_props.drain(..) { + edge_layer.update_constant_prop(id, prop)?; + } + } + } } Ok::<(), GraphError>(()) })?; + + if let Some(cache) = cache { + cache.write()?; + } + if let Some(cache_shards) = cache_shards.as_ref() { + for cache in cache_shards { + cache.write()?; + } + } + #[cfg(feature = "python")] let _ = pb.update(df.len()); } @@ -716,8 +905,8 @@ mod tests { "time", "src", "dst", - Some(&["int_prop", "str_prop"]), - None, + &["int_prop", "str_prop"], + &[], None, Some(edge_list.layer), None, @@ -843,7 +1032,7 @@ mod tests { let df_view = build_df(chunk_size, &edges); let g = Graph::new(); let props = ["str_prop", "int_prop"]; - load_edges_from_df(df_view, "time", "src", "dst", Some(&props), None, None, None, None, &g).unwrap(); + load_edges_from_df(df_view, "time", "src", "dst", &props, &[], None, None, None, &g).unwrap(); let g2 = Graph::new(); for (src, dst, time, str_prop, int_prop) in edges { g2.add_edge(time, src, dst, [("str_prop", str_prop.clone().into_prop()), ("int_prop", int_prop.into_prop())], None).unwrap(); @@ -863,7 +1052,7 @@ mod tests { let cache_file = TempDir::new().unwrap(); g.cache(cache_file.path()).unwrap(); let props = ["str_prop", "int_prop"]; - load_edges_from_df(df_view, "time", "src", "dst", Some(&props), None, None, None, None, &g).unwrap(); + load_edges_from_df(df_view, "time", "src", "dst", &props, &[], None, None, None, &g).unwrap(); let g = Graph::load_cached(cache_file.path()).unwrap(); let g2 = Graph::new(); for (src, dst, time, str_prop, int_prop) in edges { @@ -876,3 +1065,71 @@ mod tests { }) } } + +pub(crate) fn load_graph_props_from_df< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + df_view: DFView>>, + time: &str, + properties: Option<&[&str]>, + constant_properties: Option<&[&str]>, + graph: &G, +) -> Result<(), GraphError> { + let properties = properties.unwrap_or(&[]); + let constant_properties = constant_properties.unwrap_or(&[]); + + let properties_indices = properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + let constant_properties_indices = constant_properties + .iter() + .map(|name| df_view.get_index(name)) + .collect::, GraphError>>()?; + + let time_index = df_view.get_index(time)?; + + #[cfg(feature = "python")] + let mut pb = build_progress_bar("Loading graph properties".to_string(), df_view.num_rows)?; + + let mut start_id = graph.reserve_event_ids(df_view.num_rows)?; + + for chunk in df_view.chunks { + let df = chunk?; + let prop_cols = combine_properties(properties, &properties_indices, &df, |key, dtype| { + graph.resolve_graph_property(key, dtype, false) + })?; + let const_prop_cols = combine_properties( + constant_properties, + &constant_properties_indices, + &df, + |key, dtype| graph.resolve_graph_property(key, dtype, true), + )?; + let time_col = df.time_col(time_index)?; + + time_col + .par_iter() + .zip(prop_cols.par_rows()) + .zip(const_prop_cols.par_rows()) + .enumerate() + .try_for_each(|(id, ((time, t_props), c_props))| { + let time = time.ok_or(LoadError::MissingTimeError)?; + let t = TimeIndexEntry(time, start_id + id); + let t_props: Vec<_> = t_props.collect(); + if !t_props.is_empty() { + 
graph.internal_add_properties(t, &t_props)?; + } + + let c_props: Vec<_> = c_props.collect(); + + if !c_props.is_empty() { + graph.internal_add_constant_properties(&c_props)?; + } + Ok::<(), GraphError>(()) + })?; + #[cfg(feature = "python")] + let _ = pb.update(df.len()); + start_id += df.len(); + } + Ok(()) +} diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 89597f528..911dfb87a 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -54,8 +54,8 @@ mod test { "time", "src", "dst", - Some(&*vec!["prop1", "prop2"]), - None, + &["prop1", "prop2"], + &[], None, layer_name, layer_col, @@ -149,8 +149,8 @@ mod test { df, "time", "id", - Some(&*vec!["name"]), - None, + &*vec!["name"], + &[], None, Some("node_type"), None, diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs index 54f5507ea..f3ad58a21 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ b/raphtory/src/io/arrow/prop_handler.rs @@ -9,13 +9,15 @@ use crate::{ use chrono::{DateTime, Utc}; use polars_arrow::{ array::{ - Array, BooleanArray, FixedSizeListArray, ListArray, PrimitiveArray, StaticArray, Utf8Array, + Array, BooleanArray, FixedSizeListArray, ListArray, PrimitiveArray, StaticArray, + StructArray, Utf8Array, Utf8ViewArray, }, datatypes::{ArrowDataType as DataType, TimeUnit}, offset::Offset, }; -use raphtory_api::core::storage::dict_mapper::MaybeNew; +use raphtory_api::core::storage::{arc_str::ArcStr, dict_mapper::MaybeNew}; use rayon::prelude::*; +use rustc_hash::FxHashMap; pub struct PropCols { prop_ids: Vec, @@ -43,7 +45,7 @@ impl PropCols { } pub(crate) fn combine_properties( - props: &[&str], + props: &[impl AsRef], indices: &[usize], df: &DFChunk, prop_id_resolver: impl Fn(&str, PropType) -> Result, GraphError>, @@ -59,7 +61,7 @@ pub(crate) fn combine_properties( let prop_ids = props .iter() .zip(dtypes.into_iter()) - .map(|(name, dtype)| Ok(prop_id_resolver(name, dtype)?.inner())) + .map(|(name, dtype)| Ok(prop_id_resolver(name.as_ref(), dtype)?.inner())) .collect::, GraphError>>()?; Ok(PropCols { @@ -115,6 +117,10 @@ fn arr_as_prop(arr: Box) -> Prop { let arr = arr.as_any().downcast_ref::>().unwrap(); arr.iter().flatten().into_prop_list() } + DataType::Utf8View => { + let arr = arr.as_any().downcast_ref::().unwrap(); + arr.iter().flatten().into_prop_list() + } DataType::List(_) => { let arr = arr.as_any().downcast_ref::>().unwrap(); arr.iter() @@ -136,7 +142,59 @@ fn arr_as_prop(arr: Box) -> Prop { .map(|elem| arr_as_prop(elem)) .into_prop_list() } - _ => panic!("Data type not recognized"), + DataType::Timestamp(TimeUnit::Millisecond, Some(_)) => { + let arr = arr + .as_any() + .downcast_ref::>() + .expect(&format!( + "Expected TimestampMillisecondArray, got {:?}", + arr + )); + arr.iter() + .flatten() + .map(|elem| Prop::DTime(DateTime::::from_timestamp_millis(*elem).unwrap())) + .into_prop_list() + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + let arr = arr + .as_any() + .downcast_ref::>() + .expect(&format!( + "Expected TimestampMillisecondArray, got {:?}", + arr + )); + arr.iter() + .flatten() + .map(|elem| { + Prop::NDTime(DateTime::from_timestamp_millis(*elem).unwrap().naive_utc()) + }) + .into_prop_list() + } + DataType::Struct(_) => { + let arr = arr.as_any().downcast_ref::().unwrap(); + let cols = arr + .values() + .into_iter() + .map(|arr| lift_property_col(arr.as_ref())) + .collect::>(); + + let mut props = Vec::with_capacity(arr.len()); + for i in 0..arr.len() { + let fields = cols + .iter() + 
.zip(arr.fields()) + .filter_map(|(col, field)| { + col.get(i) + .map(|prop| (ArcStr::from(field.name.as_str()), prop)) + }) + .collect::>(); + props.push(Prop::Map(fields.into())); + } + + props.into_prop_list() + } + DataType::Null => Prop::List(vec![].into()), + dt => panic!("Data type not recognized {dt:?}"), } } @@ -153,41 +211,30 @@ fn data_type_as_prop_type(dt: &DataType) -> Result { DataType::Float64 => Ok(PropType::F64), DataType::Utf8 => Ok(PropType::Str), DataType::LargeUtf8 => Ok(PropType::Str), - DataType::List(v) => is_data_type_supported(v.data_type()).map(|_| PropType::List), - DataType::FixedSizeList(v, _) => { - is_data_type_supported(v.data_type()).map(|_| PropType::List) - } - DataType::LargeList(v) => is_data_type_supported(v.data_type()).map(|_| PropType::List), + DataType::Utf8View => Ok(PropType::Str), + DataType::Struct(fields) => Ok(PropType::map(fields.iter().filter_map(|f| { + data_type_as_prop_type(f.data_type()) + .ok() + .map(move |pt| (&f.name, pt)) + }))), + DataType::List(v) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), + DataType::FixedSizeList(v, _) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), + DataType::LargeList(v) => Ok(PropType::List(Box::new(data_type_as_prop_type( + v.data_type(), + )?))), DataType::Timestamp(_, v) => match v { None => Ok(PropType::NDTime), Some(_) => Ok(PropType::DTime), }, + DataType::Null => Ok(PropType::Empty), _ => Err(LoadError::InvalidPropertyType(dt.clone()).into()), } } -fn is_data_type_supported(dt: &DataType) -> Result<(), GraphError> { - match dt { - DataType::Boolean => {} - DataType::Int32 => {} - DataType::Int64 => {} - DataType::UInt8 => {} - DataType::UInt16 => {} - DataType::UInt32 => {} - DataType::UInt64 => {} - DataType::Float32 => {} - DataType::Float64 => {} - DataType::Utf8 => {} - DataType::LargeUtf8 => {} - DataType::List(v) => is_data_type_supported(v.data_type())?, - DataType::FixedSizeList(v, _) => is_data_type_supported(v.data_type())?, - DataType::LargeList(v) => is_data_type_supported(v.data_type())?, - DataType::Timestamp(_, _) => {} - _ => return Err(LoadError::InvalidPropertyType(dt.clone()).into()), - } - Ok(()) -} - trait PropCol: Send + Sync { fn get(&self, i: usize) -> Option; } @@ -211,6 +258,23 @@ impl PropCol for Wrap> { } } +impl PropCol for Wrap { + fn get(&self, i: usize) -> Option { + let fields = self + .0 + .values() + .iter() + .zip(self.0.fields()) + .filter_map(|(arr, field)| { + let prop = lift_property_col(arr.as_ref()).get(i)?; + Some((ArcStr::from(field.name.as_str()), prop)) + }) + .collect::>(); + + (!fields.is_empty()).then(|| Prop::Map(fields.into())) + } +} + impl PropCol for Wrap> { fn get(&self, i: usize) -> Option { if i >= self.0.len() { @@ -303,6 +367,14 @@ fn lift_property_col(arr: &dyn Array) -> Box { let arr = arr.as_any().downcast_ref::>().unwrap(); Box::new(Wrap(arr.clone())) } + DataType::Struct(_) => { + let arr = arr.as_any().downcast_ref::().unwrap(); + Box::new(Wrap(arr.clone())) + } + DataType::Utf8View => { + let arr = arr.as_any().downcast_ref::().unwrap(); + Box::new(arr.clone()) + } DataType::Timestamp(timeunit, timezone) => { let arr = arr .as_any() diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 878fdcc50..78e4d5f35 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -29,7 +29,7 @@ use std::{ }; pub fn load_nodes_from_parquet< - G: StaticGraphViewOps + InternalPropertyAdditionOps + 
InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, parquet_path: &Path, @@ -37,13 +37,13 @@ pub fn load_nodes_from_parquet< id: &str, node_type: Option<&str>, node_type_col: Option<&str>, - properties: Option<&[&str]>, - constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id, time]; - cols_to_check.extend(properties.unwrap_or(&Vec::new())); - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(&properties); + cols_to_check.extend_from_slice(&constant_properties); if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } @@ -55,8 +55,8 @@ pub fn load_nodes_from_parquet< df_view, time, id, - properties, - constant_properties, + &properties, + &constant_properties, shared_constant_properties, node_type, node_type_col, @@ -76,16 +76,17 @@ pub fn load_edges_from_parquet< time: &str, src: &str, dst: &str, - properties: Option<&[&str]>, - constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { let parquet_path = parquet_path.as_ref(); let mut cols_to_check = vec![src, dst, time]; - cols_to_check.extend(properties.unwrap_or(&Vec::new())); - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(&properties); + cols_to_check.extend_from_slice(&constant_properties); + if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } @@ -98,8 +99,8 @@ pub fn load_edges_from_parquet< time, src, dst, - properties, - constant_properties, + &properties, + &constant_properties, shared_constant_properties, layer, layer_col, @@ -112,18 +113,19 @@ pub fn load_edges_from_parquet< } pub fn load_node_props_from_parquet< - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, parquet_path: &Path, id: &str, node_type: Option<&str>, node_type_col: Option<&str>, - constant_properties: Option<&[&str]>, + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id]; - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(&constant_properties); + if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } @@ -137,7 +139,7 @@ pub fn load_node_props_from_parquet< id, node_type, node_type_col, - constant_properties, + &constant_properties, shared_constant_properties, graph, ) @@ -148,13 +150,13 @@ pub fn load_node_props_from_parquet< } pub fn load_edge_props_from_parquet< - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, parquet_path: &Path, src: &str, dst: &str, - constant_properties: Option<&[&str]>, + constant_properties: &[&str], shared_const_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, @@ -163,7 +165,8 @@ pub fn load_edge_props_from_parquet< if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } - 
cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + + cols_to_check.extend_from_slice(&constant_properties); for path in get_parquet_file_paths(parquet_path)? { let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check))?; @@ -172,11 +175,11 @@ pub fn load_edge_props_from_parquet< df_view, src, dst, - constant_properties, + &constant_properties, shared_const_properties, layer, layer_col, - graph.core_graph(), + graph, ) .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; } @@ -209,6 +212,35 @@ pub fn load_edge_deletions_from_parquet< Ok(()) } +pub fn load_graph_props_from_parquet< + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, +>( + graph: &G, + parquet_path: &Path, + time: &str, + properties: &[&str], + constant_properties: &[&str], +) -> Result<(), GraphError> { + let mut cols_to_check = vec![time]; + cols_to_check.extend_from_slice(&properties); + cols_to_check.extend_from_slice(&constant_properties); + + for path in get_parquet_file_paths(parquet_path)? { + let df_view = process_parquet_file_to_df(path.as_path(), Some(&cols_to_check))?; + df_view.check_cols_exist(&cols_to_check)?; + load_graph_props_from_df( + df_view, + time, + Some(&properties), + Some(&constant_properties), + graph, + ) + .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; + } + + Ok(()) +} + pub(crate) fn process_parquet_file_to_df( parquet_file_path: &Path, col_names: Option<&[&str]>, diff --git a/raphtory/src/lib.rs b/raphtory/src/lib.rs index 5d62ff4d2..344362852 100644 --- a/raphtory/src/lib.rs +++ b/raphtory/src/lib.rs @@ -132,7 +132,10 @@ pub mod prelude { pub use raphtory_api::core::{entities::GID, input::input_node::InputNode}; #[cfg(feature = "proto")] - pub use crate::serialise::{CacheOps, StableDecode, StableEncode}; + pub use crate::serialise::{ + parquet::{ParquetDecoder, ParquetEncoder}, + CacheOps, StableDecode, StableEncode, + }; } #[cfg(feature = "storage")] @@ -143,8 +146,10 @@ pub use raphtory_api::{atomic_extra, core::utils::logging}; #[cfg(test)] mod test_utils { use crate::prelude::*; + use chrono::{DateTime, NaiveDateTime, Utc}; use itertools::Itertools; - use proptest::{arbitrary::any, prelude::Strategy}; + use proptest::{arbitrary::any, prelude::*}; + use raphtory_api::core::PropType; use std::collections::HashMap; #[cfg(feature = "storage")] use tempfile::TempDir; @@ -187,18 +192,344 @@ mod test_utils { ) } + pub(crate) fn prop(p_type: &PropType) -> impl Strategy { + match p_type { + PropType::Str => any::().prop_map(|s| Prop::str(s)).boxed(), + PropType::I64 => any::().prop_map(|i| Prop::I64(i)).boxed(), + PropType::F64 => any::().prop_map(Prop::F64).boxed(), + PropType::U8 => any::().prop_map(Prop::U8).boxed(), + PropType::Bool => any::().prop_map(Prop::Bool).boxed(), + PropType::DTime => (1900..2024, 1..=12, 1..28, 0..24, 0..60, 0..60) + .prop_map(|(year, month, day, h, m, s)| { + Prop::DTime( + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z", + year, month, day, h, m, s + ) + .parse::>() + .unwrap(), + ) + }) + .boxed(), + PropType::NDTime => (1970..2024, 1..=12, 1..28, 0..24, 0..60, 0..60) + .prop_map(|(year, month, day, h, m, s)| { + // 2015-09-18T23:56:04 + Prop::NDTime( + format!( + "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}", + year, month, day, h, m, s + ) + .parse::() + .unwrap(), + ) + }) + .boxed(), + PropType::List(p_type) => proptest::collection::vec(prop(p_type), 0..10) + .prop_map(|props| Prop::List(props.into())) + .boxed(), + PropType::Map(p_types) => { + 
let prop_types: Vec> = p_types + .clone() + .into_iter() + .map(|(name, p_type)| { + let pt_strat = prop(&p_type) + .prop_map(move |prop| (name.clone(), prop.clone())) + .boxed(); + pt_strat + }) + .collect_vec(); + + let props = proptest::sample::select(prop_types).prop_flat_map(|prop| prop); + + proptest::collection::vec(props, 1..10) + .prop_map(|props| Prop::map(props)) + .boxed() + } + _ => todo!(), + } + } + + pub(crate) fn prop_type() -> impl Strategy { + let leaf = proptest::sample::select(&[ + PropType::Str, + PropType::I64, + PropType::F64, + PropType::U8, + PropType::Bool, + PropType::DTime, + PropType::NDTime, + ]); + + leaf.prop_recursive(3, 10, 10, |inner| { + let dict = proptest::collection::hash_map(r"\w{1,10}", inner.clone(), 1..10) + .prop_map(|map| PropType::map(map)); + let list = inner + .clone() + .prop_map(|p_type| PropType::List(Box::new(p_type))); + prop_oneof![dict, list] + }) + } + + #[derive(Debug, Clone)] + pub struct GraphFixture { + pub nodes: NodeFixture, + pub no_props_edges: Vec<(u64, u64, i64)>, + pub edges: Vec<(u64, u64, i64, Vec<(String, Prop)>, Option<&'static str>)>, + pub edge_deletions: Vec<(u64, u64, i64)>, + pub edge_const_props: HashMap<(u64, u64), Vec<(String, Prop)>>, + } + + #[derive(Debug, Default, Clone)] + pub struct NodeFixture { + pub nodes: Vec<(u64, i64, Vec<(String, Prop)>)>, + pub node_const_props: HashMap>, + } + + impl)>> From for NodeFixture + where + u64: TryFrom, + i64: TryFrom, + { + fn from(value: I) -> Self { + Self { + nodes: value + .into_iter() + .filter_map(|(node, time, props)| { + Some((node.try_into().ok()?, time.try_into().ok()?, props)) + }) + .collect(), + node_const_props: HashMap::new(), + } + } + } + + impl From for GraphFixture { + fn from(node_fix: NodeFixture) -> Self { + Self { + nodes: node_fix, + edges: vec![], + edge_deletions: vec![], + no_props_edges: vec![], + edge_const_props: HashMap::new(), + } + } + } + + impl, Option<&'static str>)>> From + for GraphFixture + where + u64: TryFrom, + i64: TryFrom, + { + fn from(edges: I) -> Self { + Self { + edges: edges + .into_iter() + .filter_map(|(src, dst, t, props, layer)| { + Some(( + src.try_into().ok()?, + dst.try_into().ok()?, + t.try_into().ok()?, + props, + layer, + )) + }) + .collect(), + no_props_edges: vec![], + edge_deletions: vec![], + edge_const_props: HashMap::new(), + nodes: Default::default(), + } + } + } + + fn make_props( + schema: HashMap, + ) -> (BoxedStrategy<(String, Prop)>, BoxedStrategy<(String, Prop)>) { + let schema_vec = schema.into_iter().collect_vec(); + // split in half, one temporal one constant + let t_prop_s = schema_vec + .iter() + .take(schema_vec.len() / 2) + .cloned() + .collect::>(); + let c_prop_s = schema_vec + .iter() + .skip(schema_vec.len() / 2) + .cloned() + .collect::>(); + + let make_props = |props: &HashMap| { + props + .clone() + .into_iter() + .map(|(name, p_type)| { + prop(&p_type) + .prop_map(move |prop| (name.clone(), prop)) + .boxed() + }) + .collect_vec() + }; + + let t_props = make_props(&t_prop_s); + let t_props = proptest::sample::select(t_props).prop_flat_map(|prop| prop); + + let c_props = make_props(&c_prop_s); + let c_props = proptest::sample::select(c_props).prop_flat_map(|prop| prop); + (t_props.boxed(), c_props.boxed()) + } + pub(crate) fn build_nodes_dyn( + nodes: Vec, + len: usize, + ) -> impl Strategy { + proptest::collection::hash_map(r"\w{1,10}", prop_type(), 2..10).prop_flat_map( + move |schema| { + let (t_props, c_props) = make_props(schema); + + proptest::collection::vec( + ( + 
proptest::sample::select(nodes.clone()), + i64::MIN..i64::MAX, + proptest::collection::vec(t_props, 1..7), + proptest::collection::vec(c_props, 1..3), + ), + 0..=len, + ) + .prop_map(|edges| { + let const_props = edges + .iter() + .into_group_map_by(|(src, _, _, _)| src) + .iter() + .map(|(&src, &ref b)| { + let c_props = b + .iter() + .flat_map(|(_, _, _, c)| c.clone()) + .collect::>(); + (*src, c_props) + }) + .collect::>(); + + let nodes = edges + .into_iter() + .map(|(node, time, t_props, _)| (node, time, t_props)) + .collect::>(); + + NodeFixture { + nodes, + node_const_props: const_props, + } + }) + }, + ) + } + + pub(crate) fn build_edge_list_dyn( + len: usize, + num_nodes: usize, + ) -> impl Strategy { + let num_nodes = num_nodes as u64; + let edges = proptest::collection::hash_map(r"\w{1,10}", prop_type(), 2..10).prop_flat_map( + move |schema| { + let (t_props, c_props) = make_props(schema); + + proptest::collection::vec( + ( + 0..num_nodes, + 0..num_nodes, + i64::MIN..i64::MAX, + proptest::collection::vec(t_props, 1..7), + proptest::collection::vec(c_props, 1..3), + proptest::sample::select(vec![Some("a"), Some("b"), None]), + ), + 0..=len, + ) + .prop_flat_map(move |edges| { + let no_props = proptest::collection::vec( + (0..num_nodes, 0..num_nodes, i64::MIN..i64::MAX), + 0..=len, + ); + let del_edges = proptest::collection::vec( + (0..num_nodes, 0..num_nodes, i64::MIN..i64::MAX), + 0..=len, + ); + (no_props, del_edges).prop_map(move |(no_prop_edges, del_edges)| { + let edges = edges.clone(); + let const_props = edges + .iter() + .into_group_map_by(|(src, dst, _, _, _, _)| (src, dst)) + .iter() + .map(|(&a, &ref b)| { + let (src, dst) = a; + let c_props = b + .iter() + .flat_map(|(_, _, _, _, c, _)| c.clone()) + .collect::>(); + ((*src, *dst), c_props) + }) + .collect::>(); + + let edges = edges + .into_iter() + .map(|(src, dst, time, t_props, _, layer)| { + (src, dst, time, t_props, layer) + }) + .collect::>(); + + GraphFixture { + edges, + edge_const_props: const_props, + edge_deletions: del_edges, + no_props_edges: no_prop_edges, + nodes: Default::default(), + } + }) + }) + }, + ); + edges + } + + pub(crate) fn build_graph_strat( + len: usize, + num_nodes: usize, + ) -> impl Strategy { + build_edge_list_dyn(len, num_nodes).prop_flat_map(|g_fixture| { + let mut nodes = g_fixture + .edges + .iter() + .flat_map(|(src, dst, _, _, _)| [*src, *dst]) + .collect_vec(); + nodes.sort_unstable(); + nodes.dedup(); + + if nodes.is_empty() { + Just(g_fixture).boxed() + } else { + let GraphFixture { + edges, + edge_const_props, + no_props_edges, + .. 
+ } = g_fixture; + build_nodes_dyn(nodes, 10) + .prop_map(move |nodes_f| GraphFixture { + nodes: nodes_f, + edges: edges.clone(), + edge_deletions: vec![], + no_props_edges: no_props_edges.clone(), + edge_const_props: edge_const_props.clone(), + }) + .boxed() + } + }) + } + pub(crate) fn build_node_props( max_num_nodes: u64, ) -> impl Strategy, Option)>> { (0..max_num_nodes).prop_flat_map(|num_nodes| { (0..num_nodes) - .map(|node| { - ( - proptest::strategy::Just(node), - any::>(), - any::>(), - ) - }) + .map(|node| (Just(node), any::>(), any::>())) .collect_vec() }) } @@ -227,6 +558,49 @@ mod test_utils { g } + pub(crate) fn build_graph<'a>(graph_fix: impl Into) -> Graph { + let g = Graph::new(); + let graph_fix = graph_fix.into(); + for (src, dst, time) in &graph_fix.no_props_edges { + g.add_edge(*time, *src, *dst, NO_PROPS, None).unwrap(); + } + for (src, dst, time, props, layer) in &graph_fix.edges { + g.add_edge(*time, src, dst, props.clone(), *layer).unwrap(); + } + for ((src, dst), props) in graph_fix.edge_const_props { + if let Some(edge) = g.edge(src, dst) { + edge.update_constant_properties(props, None).unwrap(); + } else { + g.add_edge(0, src, dst, NO_PROPS, None) + .unwrap() + .update_constant_properties(props, None) + .unwrap(); + } + } + for (node, t, t_props) in &graph_fix.nodes.nodes { + if let Some(n) = g.node(*node) { + n.add_updates(*t, t_props.clone()).unwrap(); + } else { + g.add_node(0, *node, t_props.clone(), None).unwrap(); + } + } + for (node, c_props) in &graph_fix.nodes.node_const_props { + if let Some(n) = g.node(*node) { + n.update_constant_properties(c_props.clone()).unwrap(); + } else { + let node = g.add_node(0, *node, NO_PROPS, None).unwrap(); + node.update_constant_properties(c_props.clone()).unwrap(); + } + } + + for (src, dst, time) in &graph_fix.edge_deletions { + if let Some(edge) = g.edge(*src, *dst) { + edge.delete(*time, None).unwrap(); + } + } + g + } + pub(crate) fn add_node_props<'a>( graph: &'a Graph, nodes: impl IntoIterator, Option)>, diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 1b81298c1..bbe01b6e8 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -20,7 +20,10 @@ use crate::{ types::iterable::FromIterable, utils::{PyNodeRef, PyTime}, }, - serialise::{StableDecode, StableEncode}, + serialise::{ + parquet::{ParquetDecoder, ParquetEncoder}, + StableDecode, StableEncode, + }, }; use pyo3::{prelude::*, pybacked::PyBackedStr, types::PyDict}; use raphtory_api::core::{entities::GID, storage::arc_str::ArcStr}; @@ -179,6 +182,28 @@ impl PyGraph { Ok(graph) } + /// Persist graph to parquet files + /// + /// Arguments: + /// graph_dir (str | PathLike): the folder where the graph will be persisted as parquet + /// + pub fn to_parquet(&self, graph_dir: PathBuf) -> Result<(), GraphError> { + self.graph.encode_parquet(graph_dir) + } + + /// Read graph from parquet files + /// + /// Arguments: + /// graph_dir (str | PathLike): the folder where the graph is stored as parquet + /// + /// Returns: + /// Graph: a view of the graph + /// + #[staticmethod] + pub fn from_parquet(graph_dir: PathBuf) -> Result { + Graph::decode_parquet(graph_dir) + } + /// Adds a new node with the given id and properties to the graph. 
/// /// Arguments: @@ -636,8 +661,9 @@ impl PyGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_nodes_from_pandas( &self.graph, df, @@ -645,8 +671,8 @@ impl PyGraph { id, node_type, node_type_col, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -682,8 +708,9 @@ impl PyGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_nodes_from_parquet( &self.graph, parquet_path.as_path(), @@ -691,8 +718,8 @@ impl PyGraph { id, node_type, node_type_col, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -730,16 +757,17 @@ impl PyGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edges_from_pandas( &self.graph, df, time, src, dst, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -779,16 +807,17 @@ impl PyGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edges_from_parquet( &self.graph, parquet_path.as_path(), time, src, dst, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -822,14 +851,15 @@ impl PyGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_node_props_from_pandas( &self.graph, df, id, node_type, node_type_col, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -861,14 +891,15 @@ impl PyGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + 
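+ // convert_py_prop_args lowers the optional Python argument to a Vec,
+ // and unwrap_or_default maps None to an empty list, matching the
+ // loaders' new non-optional slice parameters.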
convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_node_props_from_parquet( &self.graph, parquet_path.as_path(), id, node_type, node_type_col, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -902,13 +933,14 @@ impl PyGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edge_props_from_pandas( &self.graph, df, src, dst, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -944,13 +976,14 @@ impl PyGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edge_props_from_parquet( &self.graph, parquet_path.as_path(), src, dst, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 9e6caf0b8..76a478e41 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -591,8 +591,9 @@ impl PyPersistentGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_nodes_from_pandas( &self.graph, df, @@ -600,8 +601,8 @@ impl PyPersistentGraph { id, node_type, node_type_col, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -635,8 +636,9 @@ impl PyPersistentGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_nodes_from_parquet( &self.graph, parquet_path.as_path(), @@ -644,8 +646,8 @@ impl PyPersistentGraph { id, node_type, node_type_col, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -681,16 +683,17 @@ impl PyPersistentGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edges_from_pandas( &self.graph, df, time, src, dst, - properties.as_deref(), - constant_properties.as_deref(), + 
&properties, + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -728,16 +731,17 @@ impl PyPersistentGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let properties = convert_py_prop_args(properties.as_deref()); - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let properties = convert_py_prop_args(properties.as_deref()).unwrap_or_default(); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edges_from_parquet( &self.graph, parquet_path.as_path(), time, src, dst, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -833,14 +837,15 @@ impl PyPersistentGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_node_props_from_pandas( &self.graph, df, id, node_type, node_type_col, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -870,14 +875,15 @@ impl PyPersistentGraph { constant_properties: Option>, shared_constant_properties: Option>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_node_props_from_parquet( &self.graph, parquet_path.as_path(), id, node_type, node_type_col, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), ) } @@ -909,13 +915,14 @@ impl PyPersistentGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edge_props_from_pandas( &self.graph, df, src, dst, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, @@ -949,13 +956,14 @@ impl PyPersistentGraph { layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { - let constant_properties = convert_py_prop_args(constant_properties.as_deref()); + let constant_properties = + convert_py_prop_args(constant_properties.as_deref()).unwrap_or_default(); load_edge_props_from_parquet( &self.graph, parquet_path.as_path(), src, dst, - constant_properties.as_deref(), + &constant_properties, shared_constant_properties.as_ref(), layer, layer_col, diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 70b5e97db..6735c8d4e 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -24,7 +24,7 @@ pub(crate) fn convert_py_prop_args(properties: Option<&[PyBackedStr]>) -> Option pub(crate) fn load_nodes_from_pandas< 'py, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, df: &Bound<'py, PyAny>, @@ -32,13 +32,13 @@ pub(crate) fn load_nodes_from_pandas< id: &str, node_type: Option<&str>, node_type_col: Option<&str>, - properties: Option<&[&str]>, - 
constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id, time]; - cols_to_check.extend(properties.iter().flat_map(|v| v.iter())); - cols_to_check.extend(constant_properties.iter().flat_map(|v| v.iter())); + cols_to_check.extend_from_slice(properties); + cols_to_check.extend_from_slice(constant_properties); if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } @@ -49,8 +49,8 @@ pub(crate) fn load_nodes_from_pandas< df_view, time, id, - properties.as_deref(), - constant_properties.as_deref(), + &properties, + &constant_properties, shared_constant_properties, node_type, node_type_col, @@ -67,15 +67,15 @@ pub(crate) fn load_edges_from_pandas< time: &str, src: &str, dst: &str, - properties: Option<&[&str]>, - constant_properties: Option<&[&str]>, + properties: &[&str], + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![src, dst, time]; - cols_to_check.extend(properties.unwrap_or(&Vec::new())); - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(properties); + cols_to_check.extend_from_slice(constant_properties); if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } @@ -98,18 +98,18 @@ pub(crate) fn load_edges_from_pandas< pub(crate) fn load_node_props_from_pandas< 'py, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, df: &Bound<'py, PyAny>, id: &str, node_type: Option<&str>, node_type_col: Option<&str>, - constant_properties: Option<&[&str]>, + constant_properties: &[&str], shared_constant_properties: Option<&HashMap>, ) -> Result<(), GraphError> { let mut cols_to_check = vec![id]; - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(constant_properties); if let Some(ref node_type_col) = node_type_col { cols_to_check.push(node_type_col.as_ref()); } @@ -128,14 +128,14 @@ pub(crate) fn load_node_props_from_pandas< pub(crate) fn load_edge_props_from_pandas< 'py, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, + G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps + InternalCache, >( graph: &G, df: &Bound<'py, PyAny>, src: &str, dst: &str, - constant_properties: Option<&[&str]>, - shared_constant_properties: Option<&HashMap>, + constant_properties: &[&str], + shared_const_properties: Option<&HashMap>, layer: Option<&str>, layer_col: Option<&str>, ) -> Result<(), GraphError> { @@ -143,7 +143,7 @@ pub(crate) fn load_edge_props_from_pandas< if let Some(ref layer_col) = layer_col { cols_to_check.push(layer_col.as_ref()); } - cols_to_check.extend(constant_properties.unwrap_or(&Vec::new())); + cols_to_check.extend_from_slice(constant_properties); let df_view = process_pandas_py_df(df, cols_to_check.clone())?; df_view.check_cols_exist(&cols_to_check)?; load_edges_props_from_df( @@ -151,7 +151,7 @@ pub(crate) fn load_edge_props_from_pandas< src, dst, constant_properties, - shared_constant_properties, + shared_const_properties, layer, layer_col, graph, diff --git a/raphtory/src/python/types/repr.rs b/raphtory/src/python/types/repr.rs index a9d859571..c3da9edd3 100644 --- 
a/raphtory/src/python/types/repr.rs +++ b/raphtory/src/python/types/repr.rs @@ -228,7 +228,7 @@ impl Repr for Arc<[T]> { } } -impl Repr for HashMap { +impl Repr for HashMap { fn repr(&self) -> String { let repr = self .iter() diff --git a/raphtory/src/python/types/wrappers/prop.rs b/raphtory/src/python/types/wrappers/prop.rs index f25c3c243..bada687db 100644 --- a/raphtory/src/python/types/wrappers/prop.rs +++ b/raphtory/src/python/types/wrappers/prop.rs @@ -1,6 +1,5 @@ -use super::document::PyDocument; use crate::{ - core::{prop_array::PropArray, utils::errors::GraphError, DocumentInput, Prop}, + core::{prop_array::PropArray, utils::errors::GraphError, Prop}, db::graph::views::property_filter::internal::{ InternalEdgeFilterOps, InternalExplodedEdgeFilterOps, InternalNodePropertyFilterOps, }, @@ -36,7 +35,6 @@ impl<'py> IntoPyObject<'py> for Prop { py.None().into_bound(py) } } - Prop::Document(d) => PyDocument::from(d).into_pyobject(py)?.into_any(), Prop::I32(v) => v.into_pyobject(py)?.into_any(), Prop::U32(v) => v.into_pyobject(py)?.into_any(), Prop::F32(v) => v.into_pyobject(py)?.into_any(), @@ -67,12 +65,6 @@ impl<'source> FromPyObject<'source> for Prop { if let Ok(s) = ob.extract::() { return Ok(Prop::Str(s.into())); } - if let Ok(d) = ob.extract::() { - return Ok(Prop::Document(DocumentInput { - content: d.content, - life: d.life, - })); - } if let Ok(list) = ob.extract() { return Ok(Prop::List(Arc::new(list))); } @@ -103,7 +95,6 @@ impl Repr for Prop { Prop::DTime(v) => v.repr(), Prop::NDTime(v) => v.repr(), Prop::Array(v) => format!("{:?}", v), - Prop::Document(d) => d.content.repr(), // We can't reuse the __repr__ defined for PyDocument because it needs to run python code Prop::I32(v) => v.repr(), Prop::U32(v) => v.repr(), Prop::F32(v) => v.repr(), diff --git a/raphtory/src/serialise/mod.rs b/raphtory/src/serialise/mod.rs index 28918ff4a..f1adf6628 100644 --- a/raphtory/src/serialise/mod.rs +++ b/raphtory/src/serialise/mod.rs @@ -2,6 +2,7 @@ use memmap2::Mmap; use zip::{write::FileOptions, ZipArchive, ZipWriter}; pub(crate) mod incremental; +pub(crate) mod parquet; mod proto_ext; mod serialise; diff --git a/raphtory/src/serialise/parquet/edges.rs b/raphtory/src/serialise/parquet/edges.rs new file mode 100644 index 000000000..9fc367ba2 --- /dev/null +++ b/raphtory/src/serialise/parquet/edges.rs @@ -0,0 +1,176 @@ +use std::path::Path; + +use super::*; +use crate::{ + core::utils::iter::GenLockedIter, + db::{ + api::{ + storage::graph::edges::edge_storage_ops::EdgeStorageOps, + view::internal::{CoreGraphOps, TimeSemantics}, + }, + graph::edge::EdgeView, + }, + serialise::parquet::model::ParquetDelEdge, +}; +use arrow_schema::{DataType, Field}; +use model::ParquetCEdge; +use raphtory_api::{ + core::{ + entities::{LayerIds, EID}, + storage::timeindex::TimeIndexIntoOps, + }, + iter::IntoDynBoxed, +}; + +pub(crate) fn encode_edge_tprop( + g: &GraphStorage, + path: impl AsRef, +) -> Result<(), GraphError> { + run_encode( + g, + g.edge_meta().temporal_prop_meta(), + g.unfiltered_num_edges(), + path, + EDGES_T_PATH, + |id_type| { + vec![ + Field::new(TIME_COL, DataType::Int64, false), + Field::new(SRC_COL, id_type.clone(), false), + Field::new(DST_COL, id_type.clone(), false), + Field::new(LAYER_COL, DataType::Utf8, true), + ] + }, + |edges, g, decoder, writer| { + let row_group_size = 100_000; + let all_layers = LayerIds::All; + + for edge_rows in edges + .into_iter() + .map(EID) + .flat_map(|eid| { + let edge_ref = g.core_edge(eid).out_ref(); + g.edge_exploded(edge_ref, &all_layers) + }) + 
.map(|edge| ParquetTEdge(EdgeView::new(g, edge))) + .chunks(row_group_size) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? { + writer.write(&rb)?; + writer.flush()?; + } + } + Ok(()) + }, + ) +} + +pub(crate) fn encode_edge_deletions( + g: &GraphStorage, + path: impl AsRef, +) -> Result<(), GraphError> { + run_encode( + g, + g.edge_meta().temporal_prop_meta(), + g.unfiltered_num_edges(), + path, + EDGES_D_PATH, + |id_type| { + vec![ + Field::new(TIME_COL, DataType::Int64, false), + Field::new(SRC_COL, id_type.clone(), false), + Field::new(DST_COL, id_type.clone(), false), + Field::new(LAYER_COL, DataType::Utf8, true), + ] + }, + |edges, g, decoder, writer| { + let row_group_size = 100_000; + let g = g.lock(); + let g = &g; + let g_edges = g.edges(); + let layers = g + .unique_layers() + .map(|s| s.to_string().to_owned()) + .collect::>(); + let layers = &layers; + + for edge_rows in edges + .into_iter() + .map(EID) + .flat_map(|eid| { + (0..g.unfiltered_num_layers()).flat_map(move |layer_id| { + let edge = g_edges.edge(eid); + let edge_ref = edge.out_ref(); + GenLockedIter::from(edge, |edge| { + edge.deletions(layer_id).into_iter().into_dyn_boxed() + }) + .map(move |deletions| ParquetDelEdge { + del: deletions, + layer: &layers[layer_id], + edge: EdgeView::new(g, edge_ref), + }) + }) + }) + .chunks(row_group_size) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? { + writer.write(&rb)?; + writer.flush()?; + } + } + Ok(()) + }, + ) +} + +pub(crate) fn encode_edge_cprop( + g: &GraphStorage, + path: impl AsRef, +) -> Result<(), GraphError> { + run_encode( + g, + g.edge_meta().const_prop_meta(), + g.unfiltered_num_edges(), + path, + EDGES_C_PATH, + |id_type| { + vec![ + Field::new(SRC_COL, id_type.clone(), false), + Field::new(DST_COL, id_type.clone(), false), + Field::new(LAYER_COL, DataType::Utf8, true), + ] + }, + |edges, g, decoder, writer| { + let row_group_size = 100_000.min(edges.len()); + let layers = 0..g.unfiltered_num_layers(); + + for edge_rows in edges + .into_iter() + .map(EID) + .flat_map(|eid| { + let edge_ref = g.core_edge(eid).out_ref(); + layers + .clone() + .into_iter() + .map(move |l_id| edge_ref.at_layer(l_id)) + }) + .map(|edge| ParquetCEdge(EdgeView::new(g, edge))) + .chunks(row_group_size) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? 
{ + writer.write(&rb)?; + writer.flush()?; + } + } + Ok(()) + }, + ) +} diff --git a/raphtory/src/serialise/parquet/graph.rs b/raphtory/src/serialise/parquet/graph.rs new file mode 100644 index 000000000..c82474d68 --- /dev/null +++ b/raphtory/src/serialise/parquet/graph.rs @@ -0,0 +1,125 @@ +use crate::{ + core::utils::errors::GraphError, + db::api::storage::graph::storage_ops::GraphStorage, + prelude::{GraphViewOps, Prop}, + serialise::parquet::{ + model::ParquetProp, run_encode, EVENT_GRAPH_TYPE, GRAPH_C_PATH, GRAPH_TYPE, GRAPH_T_PATH, + PERSISTENT_GRAPH_TYPE, TIME_COL, + }, +}; +use arrow_schema::{DataType, Field}; +use itertools::Itertools; +use parquet::format::KeyValue; +use raphtory_api::{core::storage::arc_str::ArcStr, GraphType}; +use serde::{ser::SerializeMap, Serialize}; +use std::{collections::HashMap, path::Path}; + +pub fn encode_graph_tprop(g: &GraphStorage, path: impl AsRef) -> Result<(), GraphError> { + run_encode( + g, + g.graph_meta().temporal_prop_meta(), + 1, + path, + GRAPH_T_PATH, + |_| vec![Field::new(TIME_COL, DataType::Int64, false)], + |_, g, decoder, writer| { + let merged_props = g + .properties() + .temporal() + .into_iter() + .map(|(k, view)| view.into_iter().map(move |(t, prop)| (k.clone(), t, prop))) + .kmerge_by(|(_, t1, _), (_, t2, _)| t1 < t2); + + let mut row = HashMap::::new(); + let mut rows = vec![]; + let mut last_t: Option = None; + for (key, t1, prop) in merged_props { + if let Some(last_t) = last_t { + if last_t != t1 { + let mut old = HashMap::::new(); + std::mem::swap(&mut row, &mut old); + rows.push(Row { + t: last_t, + row: old, + }); + } + } + + row.insert(key, prop); + last_t = Some(t1); + } + if !row.is_empty() { + rows.push(Row { + t: last_t.unwrap(), + row, + }); + } + + decoder.serialize(&rows)?; + if let Some(rb) = decoder.flush()? { + writer.write(&rb)?; + writer.flush()?; + } + + Ok(()) + }, + ) +} + +#[derive(Debug)] +struct Row { + t: i64, + row: HashMap, +} + +impl Serialize for Row { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(Some(self.row.len()))?; + for (k, v) in self.row.iter() { + state.serialize_entry(k, &ParquetProp(v))?; + } + state.serialize_entry(TIME_COL, &self.t)?; + state.end() + } +} + +pub fn encode_graph_cprop( + g: &GraphStorage, + graph_type: GraphType, + path: impl AsRef, +) -> Result<(), GraphError> { + run_encode( + g, + g.graph_meta().const_prop_meta(), + 1, + path, + GRAPH_C_PATH, + |_| vec![Field::new(TIME_COL, DataType::Int64, true)], + |_, g, decoder, writer| { + let row = g.properties().constant().as_map(); + + let rows = vec![Row { t: 0, row }]; + decoder.serialize(&rows)?; + if let Some(rb) = decoder.flush()? 
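+ // Constant graph properties fit in a single row at t = 0; the graph
+ // type is then recorded as parquet key-value metadata so that decoding
+ // can verify it against the expected type.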
{ + writer.write(&rb)?; + writer.flush()?; + } + + match graph_type { + GraphType::EventGraph => writer.append_key_value_metadata(KeyValue::new( + GRAPH_TYPE.to_string(), + Some(EVENT_GRAPH_TYPE.to_string()), + )), + GraphType::PersistentGraph => writer.append_key_value_metadata(KeyValue::new( + GRAPH_TYPE.to_string(), + Some(PERSISTENT_GRAPH_TYPE.to_string()), + )), + }; + + Ok(()) + }, + ) +} diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs new file mode 100644 index 000000000..57b6157dd --- /dev/null +++ b/raphtory/src/serialise/parquet/mod.rs @@ -0,0 +1,739 @@ +use crate::{ + core::{arrow_dtype_from_prop_type, utils::errors::GraphError}, + db::{ + api::{ + mutation::internal::InternalAdditionOps, storage::graph::storage_ops::GraphStorage, + view::internal::CoreGraphOps, + }, + graph::views::deletion_graph::PersistentGraph, + }, + io::parquet_loaders::{ + load_edge_deletions_from_parquet, load_edge_props_from_parquet, load_edges_from_parquet, + load_graph_props_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet, + }, + prelude::*, + serialise::parquet::{ + edges::encode_edge_deletions, + graph::{encode_graph_cprop, encode_graph_tprop}, + model::get_id_type, + nodes::{encode_nodes_cprop, encode_nodes_tprop}, + }, +}; +use arrow_json::{reader::Decoder, ReaderBuilder}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use edges::{encode_edge_cprop, encode_edge_tprop}; +use itertools::Itertools; +use model::ParquetTEdge; +use parquet::{ + arrow::{arrow_reader::ArrowReaderMetadata, ArrowWriter}, + basic::Compression, + file::properties::WriterProperties, +}; +use raphtory_api::{ + core::entities::{properties::props::PropMapper, GidType}, + GraphType, +}; +use rayon::iter::{ParallelBridge, ParallelIterator}; +use std::{ + fs::File, + ops::Range, + path::{Path, PathBuf}, +}; + +mod edges; +mod model; +mod nodes; + +mod graph; + +pub trait ParquetEncoder { + fn encode_parquet(&self, path: impl AsRef) -> Result<(), GraphError>; +} + +pub trait ParquetDecoder { + fn decode_parquet(path: impl AsRef) -> Result + where + Self: Sized; +} + +const NODE_ID: &str = "rap_node_id"; +const TYPE_COL: &str = "rap_node_type"; +const TIME_COL: &str = "rap_time"; +const SRC_COL: &str = "rap_src"; +const DST_COL: &str = "rap_dst"; +const LAYER_COL: &str = "rap_layer"; +const EDGES_T_PATH: &str = "edges_t"; +const EDGES_D_PATH: &str = "edges_d"; // deletions +const EDGES_C_PATH: &str = "edges_c"; +const NODES_T_PATH: &str = "nodes_t"; +const NODES_C_PATH: &str = "nodes_c"; + +const GRAPH_T_PATH: &str = "graph_t"; +const GRAPH_C_PATH: &str = "graph_c"; + +const GRAPH_TYPE: &str = "graph_type"; + +const EVENT_GRAPH_TYPE: &str = "rap_event_graph"; + +const PERSISTENT_GRAPH_TYPE: &str = "rap_persistent_graph"; + +impl ParquetEncoder for Graph { + fn encode_parquet(&self, path: impl AsRef) -> Result<(), GraphError> { + let gs = self.core_graph().clone(); + encode_graph_storage(&gs, path, GraphType::EventGraph) + } +} + +impl ParquetEncoder for PersistentGraph { + fn encode_parquet(&self, path: impl AsRef) -> Result<(), GraphError> { + let gs = self.core_graph().clone(); + encode_graph_storage(&gs, path, GraphType::PersistentGraph) + } +} + +fn encode_graph_storage( + g: &GraphStorage, + path: impl AsRef, + graph_type: GraphType, +) -> Result<(), GraphError> { + encode_edge_tprop(g, path.as_ref())?; + encode_edge_cprop(g, path.as_ref())?; + encode_edge_deletions(g, path.as_ref())?; + encode_nodes_tprop(g, path.as_ref())?; + encode_nodes_cprop(g, 
diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet/mod.rs
new file mode 100644
index 000000000..57b6157dd
--- /dev/null
+++ b/raphtory/src/serialise/parquet/mod.rs
@@ -0,0 +1,739 @@
+use crate::{
+    core::{arrow_dtype_from_prop_type, utils::errors::GraphError},
+    db::{
+        api::{
+            mutation::internal::InternalAdditionOps, storage::graph::storage_ops::GraphStorage,
+            view::internal::CoreGraphOps,
+        },
+        graph::views::deletion_graph::PersistentGraph,
+    },
+    io::parquet_loaders::{
+        load_edge_deletions_from_parquet, load_edge_props_from_parquet, load_edges_from_parquet,
+        load_graph_props_from_parquet, load_node_props_from_parquet, load_nodes_from_parquet,
+    },
+    prelude::*,
+    serialise::parquet::{
+        edges::encode_edge_deletions,
+        graph::{encode_graph_cprop, encode_graph_tprop},
+        model::get_id_type,
+        nodes::{encode_nodes_cprop, encode_nodes_tprop},
+    },
+};
+use arrow_json::{reader::Decoder, ReaderBuilder};
+use arrow_schema::{DataType, Field, Schema, SchemaRef};
+use edges::{encode_edge_cprop, encode_edge_tprop};
+use itertools::Itertools;
+use model::ParquetTEdge;
+use parquet::{
+    arrow::{arrow_reader::ArrowReaderMetadata, ArrowWriter},
+    basic::Compression,
+    file::properties::WriterProperties,
+};
+use raphtory_api::{
+    core::entities::{properties::props::PropMapper, GidType},
+    GraphType,
+};
+use rayon::iter::{ParallelBridge, ParallelIterator};
+use std::{
+    fs::File,
+    ops::Range,
+    path::{Path, PathBuf},
+};
+
+mod edges;
+mod model;
+mod nodes;
+
+mod graph;
+
+pub trait ParquetEncoder {
+    fn encode_parquet(&self, path: impl AsRef<Path>) -> Result<(), GraphError>;
+}
+
+pub trait ParquetDecoder {
+    fn decode_parquet(path: impl AsRef<Path>) -> Result<Self, GraphError>
+    where
+        Self: Sized;
+}
+
+const NODE_ID: &str = "rap_node_id";
+const TYPE_COL: &str = "rap_node_type";
+const TIME_COL: &str = "rap_time";
+const SRC_COL: &str = "rap_src";
+const DST_COL: &str = "rap_dst";
+const LAYER_COL: &str = "rap_layer";
+const EDGES_T_PATH: &str = "edges_t";
+const EDGES_D_PATH: &str = "edges_d"; // deletions
+const EDGES_C_PATH: &str = "edges_c";
+const NODES_T_PATH: &str = "nodes_t";
+const NODES_C_PATH: &str = "nodes_c";
+
+const GRAPH_T_PATH: &str = "graph_t";
+const GRAPH_C_PATH: &str = "graph_c";
+
+const GRAPH_TYPE: &str = "graph_type";
+
+const EVENT_GRAPH_TYPE: &str = "rap_event_graph";
+
+const PERSISTENT_GRAPH_TYPE: &str = "rap_persistent_graph";
+
+impl ParquetEncoder for Graph {
+    fn encode_parquet(&self, path: impl AsRef<Path>) -> Result<(), GraphError> {
+        let gs = self.core_graph().clone();
+        encode_graph_storage(&gs, path, GraphType::EventGraph)
+    }
+}
+
+impl ParquetEncoder for PersistentGraph {
+    fn encode_parquet(&self, path: impl AsRef<Path>) -> Result<(), GraphError> {
+        let gs = self.core_graph().clone();
+        encode_graph_storage(&gs, path, GraphType::PersistentGraph)
+    }
+}
+
+fn encode_graph_storage(
+    g: &GraphStorage,
+    path: impl AsRef<Path>,
+    graph_type: GraphType,
+) -> Result<(), GraphError> {
+    encode_edge_tprop(g, path.as_ref())?;
+    encode_edge_cprop(g, path.as_ref())?;
+    encode_edge_deletions(g, path.as_ref())?;
+    encode_nodes_tprop(g, path.as_ref())?;
+    encode_nodes_cprop(g, path.as_ref())?;
+    encode_graph_tprop(g, path.as_ref())?;
+    encode_graph_cprop(g, graph_type, path.as_ref())?;
+    Ok(())
+}
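
Taken together, encode_parquet lays the graph out as one directory per component (edges_t, edges_c, edges_d, nodes_t, nodes_c, graph_t, graph_c) under the target path, and decode_parquet replays those directories through the existing parquet loaders. A round-trip sketch mirroring the tests further down (the raphtory::serialise::parquet import path is an assumption; the traits may be re-exported elsewhere):

    use raphtory::prelude::*;
    use raphtory::serialise::parquet::{ParquetDecoder, ParquetEncoder};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let g = Graph::new();
        g.add_edge(0, "a", "b", [("weight", Prop::F64(1.0))], None)?;

        let dir = tempfile::tempdir()?; // scratch directory for the parquet output
        g.encode_parquet(&dir)?; // writes edges_t/, nodes_t/, graph_c/, ...
        let g2 = Graph::decode_parquet(&dir)?;
        assert_eq!(g.count_edges(), g2.count_edges());
        Ok(())
    }
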
+
+pub(crate) fn run_encode(
+    g: &GraphStorage,
+    meta: &PropMapper,
+    size: usize,
+    path: impl AsRef<Path>,
+    suffix: &str,
+    default_fields_fn: impl Fn(&DataType) -> Vec<Field>,
+    encode_fn: impl Fn(
+            Range<usize>,
+            &GraphStorage,
+            &mut Decoder,
+            &mut ArrowWriter<File>,
+        ) -> Result<(), GraphError>
+        + Sync,
+) -> Result<(), GraphError> {
+    let schema = derive_schema(meta, g.id_type(), default_fields_fn)?;
+    let root_dir = path.as_ref().join(suffix);
+    std::fs::create_dir_all(&root_dir)?;
+
+    if size > 0 {
+        let chunk_size = (size / rayon::current_num_threads()).max(128);
+        let iter = (0..size).step_by(chunk_size);
+
+        let num_digits = iter.len().to_string().len();
+
+        iter.enumerate()
+            .par_bridge()
+            .try_for_each(|(chunk, first)| {
+                let props = WriterProperties::builder()
+                    .set_compression(Compression::SNAPPY)
+                    .build();
+                let items = first..(first + chunk_size).min(size);
+
+                let node_file =
+                    File::create(root_dir.join(format!("{chunk:0num_digits$}.parquet")))?;
+                let mut writer = ArrowWriter::try_new(node_file, schema.clone(), Some(props))?;
+
+                let mut decoder = ReaderBuilder::new(schema.clone()).build_decoder()?;
+
+                encode_fn(items, g, &mut decoder, &mut writer)?;
+
+                writer.close()?;
+                Ok::<_, GraphError>(())
+            })?;
+    }
+    Ok(())
+}
+
+pub(crate) fn derive_schema(
+    prop_meta: &PropMapper,
+    id_type: Option<GidType>,
+    default_fields_fn: impl Fn(&DataType) -> Vec<Field>,
+) -> Result<SchemaRef, GraphError> {
+    let fields = arrow_fields(prop_meta)?;
+    let id_type = get_id_type(id_type);
+
+    let make_schema = |id_type: DataType, prop_columns: Vec<Field>| {
+        let default_fields = default_fields_fn(&id_type);
+
+        Schema::new(
+            default_fields
+                .into_iter()
+                .chain(prop_columns.into_iter())
+                .collect::<Vec<_>>(),
+        )
+        .into()
+    };
+
+    let schema = if let Ok(id_type) = id_type {
+        make_schema(id_type, fields)
+    } else {
+        make_schema(DataType::UInt64, fields)
+    };
+    Ok(schema)
+}
+
+fn arrow_fields(meta: &PropMapper) -> Result<Vec<Field>, GraphError> {
+    meta.get_keys()
+        .into_iter()
+        .filter_map(|name| {
+            let prop_id = meta.get_id(&name)?;
+            meta.get_dtype(prop_id)
+                .map(move |prop_type| (name, prop_type))
+        })
+        .map(|(name, prop_type)| {
+            arrow_dtype_from_prop_type(&prop_type).map(|d_type| Field::new(name, d_type, true))
+        })
+        .collect()
+}
+
+fn ls_parquet_files(dir: &Path) -> Result<impl Iterator<Item = PathBuf>, GraphError> {
+    Ok(std::fs::read_dir(dir)?
+        .filter_map(Result::ok)
+        .map(|entry| entry.path())
+        .filter(|path| path.is_file() && path.extension().map_or(false, |ext| ext == "parquet")))
+}
+
+fn collect_prop_columns(
+    path: &Path,
+    exclude: &[&str],
+) -> Result<(Vec<String>, Option<GraphType>), GraphError> {
+    let prop_columns_fn =
+        |path: &Path, exclude: &[&str]| -> Result<(Vec<String>, Option<GraphType>), GraphError> {
+            let reader = ArrowReaderMetadata::load(&File::open(path)?, Default::default())?;
+            let cols = reader
+                .schema()
+                .fields()
+                .iter()
+                .map(|f| f.name().to_string())
+                .filter(|f_name| !exclude.iter().any(|ex| ex == f_name))
+                .collect_vec();
+            let graph_type = reader
+                .metadata()
+                .file_metadata()
+                .key_value_metadata()
+                .and_then(|meta| {
+                    meta.iter()
+                        .find(|kv| &kv.key == GRAPH_TYPE)
+                        .and_then(|kv| kv.value.as_ref())
+                        .and_then(|v| match v.as_ref() {
+                            EVENT_GRAPH_TYPE => Some(GraphType::EventGraph),
+                            PERSISTENT_GRAPH_TYPE => Some(GraphType::PersistentGraph),
+                            _ => None,
+                        })
+                });
+            Ok((cols, graph_type))
+        };
+    let mut prop_columns = vec![];
+    let mut g_type: Option<GraphType> = None;
+    for path in ls_parquet_files(path)? {
+        let (columns, tpe) = prop_columns_fn(&path, exclude)?;
+        if g_type.is_none() {
+            g_type = tpe;
+        }
+        prop_columns.extend_from_slice(&columns);
+    }
+    prop_columns.sort();
+    prop_columns.dedup();
+    Ok((prop_columns, g_type))
+}
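
run_encode fans the entity range out across rayon threads in fixed-size chunks, one zero-padded part file per chunk so a directory listing sorts in chunk order. A sketch of just the partitioning arithmetic used above (thread count is illustrative; the 128 floor matches run_encode):

    use std::ops::Range;

    // At least 128 items per chunk, otherwise size / threads.
    fn chunk_ranges(size: usize, threads: usize) -> Vec<Range<usize>> {
        let chunk_size = (size / threads).max(128);
        (0..size)
            .step_by(chunk_size)
            .map(|first| first..(first + chunk_size).min(size))
            .collect()
    }

    fn main() {
        // 300 nodes on 2 threads -> two part files: 0..150 and 150..300
        assert_eq!(chunk_ranges(300, 2), vec![0..150, 150..300]);
    }
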
+
+fn decode_graph_storage(
+    path: impl AsRef<Path>,
+    expected_gt: GraphType,
+) -> Result<GraphStorage, GraphError> {
+    let g = Graph::new();
+
+    let c_graph_path = path.as_ref().join(GRAPH_C_PATH);
+
+    let g_type = {
+        let exclude = vec![TIME_COL];
+        let (c_props, g_type) = collect_prop_columns(&c_graph_path, &exclude)?;
+        let c_props = c_props.iter().map(|s| s.as_str()).collect::<Vec<_>>();
+        load_graph_props_from_parquet(&g, &c_graph_path, TIME_COL, &[], &c_props)?;
+
+        g_type.ok_or_else(|| GraphError::LoadFailure("Graph type not found".to_string()))?
+    };
+
+    if g_type != expected_gt {
+        return Err(GraphError::LoadFailure(format!(
+            "Expected graph type {:?}, got {:?}",
+            expected_gt, g_type
+        )));
+    }
+
+    let t_graph_path = path.as_ref().join(GRAPH_T_PATH);
+
+    if std::fs::exists(&t_graph_path)? {
+        let exclude = vec![TIME_COL];
+        let (t_props, _) = collect_prop_columns(&t_graph_path, &exclude)?;
+        let t_props = t_props.iter().map(|s| s.as_str()).collect::<Vec<_>>();
+        load_graph_props_from_parquet(&g, &t_graph_path, TIME_COL, &t_props, &[])?;
+    }
+
+    let exclude = vec![TIME_COL, SRC_COL, DST_COL, LAYER_COL];
+    let t_edge_path = path.as_ref().join(EDGES_T_PATH);
+
+    if std::fs::exists(&t_edge_path)? {
+        let (t_prop_columns, _) = collect_prop_columns(&t_edge_path, &exclude)?;
+        let t_prop_columns = t_prop_columns
+            .iter()
+            .map(|s| s.as_str())
+            .collect::<Vec<_>>();
+
+        load_edges_from_parquet(
+            &g,
+            &t_edge_path,
+            TIME_COL,
+            SRC_COL,
+            DST_COL,
+            &t_prop_columns,
+            &[],
+            None,
+            None,
+            Some(LAYER_COL),
+        )?;
+    }
+
+    let c_edge_path = path.as_ref().join(EDGES_C_PATH);
+    if std::fs::exists(&c_edge_path)? {
+        let (c_prop_columns, _) = collect_prop_columns(&c_edge_path, &exclude)?;
+        let constant_properties = c_prop_columns
+            .iter()
+            .map(|s| s.as_str())
+            .collect::<Vec<_>>();
+
+        load_edge_props_from_parquet(
+            &g,
+            &c_edge_path,
+            SRC_COL,
+            DST_COL,
+            &constant_properties,
+            None,
+            None,
+            Some(LAYER_COL),
+        )?;
+    }
+
+    let d_edge_path = path.as_ref().join(EDGES_D_PATH);
+    if std::fs::exists(&d_edge_path)? {
+        load_edge_deletions_from_parquet(
+            g.core_graph(),
+            &d_edge_path,
+            TIME_COL,
+            SRC_COL,
+            DST_COL,
+            None,
+            Some(LAYER_COL),
+        )?;
+    }
+
+    let t_node_path = path.as_ref().join(NODES_T_PATH);
+
+    if std::fs::exists(&t_node_path)? {
+        let exclude = vec![NODE_ID, TIME_COL, TYPE_COL];
+        let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?;
+        let t_prop_columns = t_prop_columns
+            .iter()
+            .map(|s| s.as_str())
+            .collect::<Vec<_>>();
+
+        load_nodes_from_parquet(
+            &g,
+            &t_node_path,
+            TIME_COL,
+            NODE_ID,
+            None,
+            Some(TYPE_COL),
+            &t_prop_columns,
+            &[],
+            None,
+        )?;
+    }
+
+    let c_node_path = path.as_ref().join(NODES_C_PATH);
+    if std::fs::exists(&c_node_path)? {
+        let exclude = vec![NODE_ID, TYPE_COL];
+        let (c_prop_columns, _) = collect_prop_columns(&c_node_path, &exclude)?;
+        let c_prop_columns = c_prop_columns
+            .iter()
+            .map(|s| s.as_str())
+            .collect::<Vec<_>>();
+
+        load_node_props_from_parquet(
+            &g,
+            &c_node_path,
+            NODE_ID,
+            None,
+            Some(TYPE_COL),
+            &c_prop_columns,
+            None,
+        )?;
+    }
+
+    Ok(g.core_graph().clone())
+}
+impl ParquetDecoder for Graph {
+    fn decode_parquet(path: impl AsRef<Path>) -> Result<Self, GraphError>
+    where
+        Self: Sized,
+    {
+        let gs = decode_graph_storage(path, GraphType::EventGraph)?;
+        Ok(Graph::from_internal_graph(gs))
+    }
+}
+
+impl ParquetDecoder for PersistentGraph {
+    fn decode_parquet(path: impl AsRef<Path>) -> Result<Self, GraphError>
+    where
+        Self: Sized,
+    {
+        let gs = decode_graph_storage(path, GraphType::PersistentGraph)?;
+        Ok(PersistentGraph::from_internal_graph(gs))
+    }
+}
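
Because the graph type is read from the graph_c metadata before anything else, decoding a directory written by the other graph flavour fails fast instead of silently reinterpreting deletions. A sketch of that guard (PersistentGraph::new() and the import paths are assumptions):

    use raphtory::core::utils::errors::GraphError;
    use raphtory::prelude::*;
    use raphtory::serialise::parquet::{ParquetDecoder, ParquetEncoder};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let dir = tempfile::tempdir()?;
        PersistentGraph::new().encode_parquet(&dir)?;

        // graph_c carries rap_persistent_graph, so decoding as an event graph
        // is rejected with a LoadFailure rather than producing a wrong graph.
        assert!(matches!(
            Graph::decode_parquet(&dir),
            Err(GraphError::LoadFailure(_))
        ));
        Ok(())
    }
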
vec![("a".to_string(), Prop::map([("b", Prop::str("x"))]))], + None, + ), + ] + .into(), + ); + } + + #[test] + fn edges_maps3() { + check_parquet_encoding( + [ + (0, 0, 0, vec![("a".to_string(), Prop::U8(5))], None), + ( + 0, + 0, + 0, + vec![("b".to_string(), Prop::map([("c", Prop::U8(66))]))], + None, + ), + ] + .into(), + ); + } + + #[test] + fn edges_map4() { + let g_fix = GraphFixture { + edges: vec![(0, 0, 0, vec![("a".to_string(), Prop::U8(5))], None)], + edge_deletions: vec![(0, 0, 1)], + no_props_edges: vec![], + edge_const_props: vec![( + (0u64, 0u64), + vec![( + "x".to_string(), + Prop::List( + vec![ + Prop::map([("n", Prop::I64(23))]), + Prop::map([("b", Prop::F64(0.2))]), + ] + .into(), + ), + )], + )] + .into_iter() + .collect(), + nodes: Default::default(), + }; + + check_parquet_encoding(g_fix) + } + + // proptest + fn check_parquet_encoding(edges: GraphFixture) { + let g = build_graph(edges); + let temp_dir = tempfile::tempdir().unwrap(); + g.encode_parquet(&temp_dir).unwrap(); + let g2 = Graph::decode_parquet(&temp_dir).unwrap(); + assert_graph_equal(&g, &g2); + } + + #[test] + fn nodes_props_1() { + let dt = "2012-12-12 12:12:12+00:00" + .parse::>() + .unwrap(); + let node_fixtures = NodeFixture { + nodes: vec![( + 0, + 0, + vec![ + ("a".to_string(), Prop::U8(5)), + ("a".to_string(), Prop::U8(5)), + ], + )], + node_const_props: vec![(0, vec![("b".to_string(), Prop::DTime(dt))])] + .into_iter() + .collect(), + }; + + check_parquet_encoding(node_fixtures.into()); + } + + fn check_graph_props(nf: NodeFixture) { + let g = Graph::new(); + let temp_dir = tempfile::tempdir().unwrap(); + for (_, t, props) in nf.nodes { + g.add_properties(t, props).unwrap(); + } + + let const_props = nf + .node_const_props + .into_iter() + .flat_map(|(_, props)| props) + .collect::>(); + g.add_constant_properties(const_props).unwrap(); + + g.encode_parquet(&temp_dir).unwrap(); + let g2 = Graph::decode_parquet(&temp_dir).unwrap(); + assert_graph_equal(&g, &g2); + } + + #[test] + fn graph_props() { + let mut nf: NodeFixture = [(0, 1, vec![("a".to_string(), Prop::U8(5))])].into(); + nf.node_const_props = vec![(1, vec![("b".to_string(), Prop::str("baa"))])] + .into_iter() + .collect(); + check_graph_props(nf) + } + + #[test] + fn edge_props_1() { + let gp_fix = GraphFixture { + edge_const_props: vec![((0u64, 0u64), vec![("a".to_string(), Prop::I64(5))])] + .into_iter() + .collect(), + edge_deletions: vec![(6, 2, 4444)], + edges: vec![( + 0, + 0, + -67, + vec![ + ("x".to_string(), Prop::I64(5)), + ("b".to_string(), Prop::Bool(false)), + ], + Some("a"), + )], + no_props_edges: vec![(7, 0, 469)], + nodes: NodeFixture::default(), + }; + check_parquet_encoding(gp_fix); + } + + #[test] + fn graph_const_props() { + let mut nf: NodeFixture = NodeFixture::default(); + nf.node_const_props = vec![(1, vec![("b".to_string(), Prop::str("baa"))])] + .into_iter() + .collect(); + check_parquet_encoding(nf.into()) + } + + #[test] + fn write_graph_props_to_parquet() { + proptest!(|(nodes in build_nodes_dyn(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10))| { + check_graph_props(nodes); + }); + } + + #[test] + fn write_nodes_any_props_to_parquet() { + proptest!(|(nodes in build_nodes_dyn(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10))| { + check_parquet_encoding(nodes.into()); + }); + } + #[test] + fn write_edges_any_props_to_parquet() { + proptest!(|(edges in build_edge_list_dyn(10, 10))| { + check_parquet_encoding(edges); + }); + } + + #[test] + fn write_graph_to_parquet() { + proptest!(|(edges in build_graph_strat(10, 10))| { + 
check_parquet_encoding(edges); + }) + } +} diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs new file mode 100644 index 000000000..b07bbb0df --- /dev/null +++ b/raphtory/src/serialise/parquet/model.rs @@ -0,0 +1,207 @@ +use super::{Prop, DST_COL, LAYER_COL, NODE_ID, SRC_COL, TIME_COL, TYPE_COL}; +use crate::{ + db::{ + api::{storage::graph::storage_ops::GraphStorage, view::StaticGraphViewOps}, + graph::{edge::EdgeView, node::NodeView}, + }, + prelude::*, +}; +use arrow_schema::DataType; +use raphtory_api::core::{ + entities::GidType, + storage::{arc_str::ArcStr, timeindex::TimeIndexEntry}, +}; +use serde::{ + ser::{Error, SerializeMap, SerializeSeq}, + Serialize, +}; + +pub(crate) struct ParquetProp<'a>(pub &'a Prop); + +impl<'a> Serialize for ParquetProp<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self.0 { + Prop::I32(i) => serializer.serialize_i32(*i), + Prop::I64(i) => serializer.serialize_i64(*i), + Prop::F32(f) => serializer.serialize_f32(*f), + Prop::F64(f) => serializer.serialize_f64(*f), + Prop::U8(u) => serializer.serialize_u8(*u), + Prop::U16(u) => serializer.serialize_u16(*u), + Prop::U32(u) => serializer.serialize_u32(*u), + Prop::U64(u) => serializer.serialize_u64(*u), + Prop::Str(s) => serializer.serialize_str(s), + Prop::Bool(b) => serializer.serialize_bool(*b), + Prop::DTime(dt) => serializer.serialize_i64(dt.timestamp_millis()), + Prop::NDTime(dt) => serializer.serialize_i64(dt.and_utc().timestamp_millis()), + Prop::List(l) => { + let mut state = serializer.serialize_seq(Some(l.len()))?; + for prop in l.iter() { + state.serialize_element(&ParquetProp(prop))?; + } + state.end() + } + Prop::Map(m) => { + let mut state = serializer.serialize_map(Some(m.len()))?; + for (k, v) in m.iter() { + state.serialize_entry(k, &ParquetProp(v))?; + } + state.end() + } + _ => todo!(), + } + } +} + +#[derive(Debug)] +struct ParquetGID(GID); + +impl Serialize for ParquetGID { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match &self.0 { + GID::U64(id) => serializer.serialize_u64(*id), + GID::Str(id) => serializer.serialize_str(id), + } + } +} + +#[derive(Debug)] +pub(crate) struct ParquetTEdge<'a, G: StaticGraphViewOps>(pub(crate) EdgeView<&'a G>); + +impl<'a, G: StaticGraphViewOps> Serialize for ParquetTEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let edge = &self.0; + let mut state = serializer.serialize_map(None)?; + let t = edge + .edge + .time() + .ok_or(S::Error::custom("Edge has no time"))?; + let layer = edge + .layer_name() + .map_err(|_| S::Error::custom("Edge has no layer"))?; + + state.serialize_entry(TIME_COL, &t.0)?; + state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; + state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(LAYER_COL, &layer)?; + + for (name, prop) in edge.properties().temporal().iter_latest() { + state.serialize_entry(&name, &ParquetProp(&prop))?; + } + + state.end() + } +} + +#[derive(Debug)] +pub(crate) struct ParquetCEdge<'a, G: StaticGraphViewOps>(pub(crate) EdgeView<&'a G>); + +impl<'a, G: StaticGraphViewOps> Serialize for ParquetCEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let edge = &self.0; + let mut state = serializer.serialize_map(None)?; + let layer = edge + .layer_name() + .map_err(|_| S::Error::custom("Edge has no layer"))?; + + state.serialize_entry(SRC_COL, 
&ParquetGID(edge.src().id()))?; + state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(LAYER_COL, &layer)?; + + for (name, prop) in edge.properties().constant().iter() { + state.serialize_entry(&name, &ParquetProp(&prop))?; + } + + state.end() + } +} + +pub(crate) struct ParquetDelEdge<'a, G> { + pub layer: &'a str, + pub edge: EdgeView<&'a G>, + pub del: TimeIndexEntry, +} + +impl<'a, G: StaticGraphViewOps> Serialize for ParquetDelEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let edge = &self.edge; + let mut state = serializer.serialize_map(None)?; + + state.serialize_entry(TIME_COL, &self.del.0)?; + state.serialize_entry(SRC_COL, &ParquetGID(edge.src().id()))?; + state.serialize_entry(DST_COL, &ParquetGID(edge.dst().id()))?; + state.serialize_entry(LAYER_COL, &self.layer)?; + + state.end() + } +} + +pub(crate) struct ParquetTNode<'a> { + pub node: NodeView<&'a GraphStorage>, + pub cols: &'a [ArcStr], + pub t: TimeIndexEntry, + pub props: Vec<(usize, Prop)>, +} + +impl<'a> Serialize for ParquetTNode<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(None)?; + + state.serialize_entry(NODE_ID, &ParquetGID(self.node.id()))?; + state.serialize_entry(TIME_COL, &self.t.0)?; + state.serialize_entry(TYPE_COL, &self.node.node_type())?; + + for (name, prop) in self.props.iter() { + state.serialize_entry(&self.cols[*name], &ParquetProp(&prop))?; + } + + state.end() + } +} + +pub(crate) struct ParquetCNode<'a> { + pub node: NodeView<&'a GraphStorage>, +} + +impl<'a> Serialize for ParquetCNode<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(None)?; + + state.serialize_entry(NODE_ID, &ParquetGID(self.node.id()))?; + state.serialize_entry(TYPE_COL, &self.node.node_type())?; + + for (name, prop) in self.node.properties().constant().iter() { + state.serialize_entry(&name, &ParquetProp(&prop))?; + } + + state.end() + } +} + +pub(crate) fn get_id_type(id_type: Option) -> Result { + match id_type { + Some(GidType::Str) => Ok(DataType::Utf8), + Some(GidType::U64) => Ok(DataType::UInt64), + None => Err(DataType::UInt64), // The graph is empty what now? 
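
These serializers emit plain serde maps and sequences, which arrow_json's Decoder then maps onto the arrow schema, so the intermediate row shape is ordinary JSON. A sketch of what ParquetProp produces for a map property, written as a would-be in-module test (serde_json as a dev-dependency is an assumption):

    #[cfg(test)]
    mod prop_shape_test {
        use super::*;
        use crate::prelude::Prop;

        #[test]
        fn map_props_serialize_as_json_objects() {
            let prop = Prop::map([("a", Prop::U8(1)), ("b", Prop::str("x"))]);
            // Maps become JSON objects (arrow Struct columns), lists become
            // JSON arrays (arrow List columns).
            let v = serde_json::to_value(ParquetProp(&prop)).unwrap();
            assert_eq!(v, serde_json::json!({"a": 1, "b": "x"}));
        }
    }
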
diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs
new file mode 100644
index 000000000..5bf153592
--- /dev/null
+++ b/raphtory/src/serialise/parquet/nodes.rs
@@ -0,0 +1,116 @@
+use crate::{
+    core::utils::{errors::GraphError, iter::GenLockedIter},
+    db::{
+        api::{
+            properties::internal::TemporalPropertiesRowView,
+            storage::graph::storage_ops::GraphStorage, view::internal::CoreGraphOps,
+        },
+        graph::node::NodeView,
+    },
+    serialise::parquet::{
+        model::{ParquetCNode, ParquetTNode},
+        run_encode, NODES_C_PATH, NODES_T_PATH, NODE_ID, TIME_COL, TYPE_COL,
+    },
+};
+use arrow_schema::{DataType, Field};
+use itertools::Itertools;
+use raphtory_api::{core::entities::VID, iter::IntoDynBoxed};
+use std::path::Path;
+
+pub(crate) fn encode_nodes_tprop(
+    g: &GraphStorage,
+    path: impl AsRef<Path>,
+) -> Result<(), GraphError> {
+    run_encode(
+        g,
+        g.node_meta().temporal_prop_meta(),
+        g.unfiltered_num_nodes(),
+        path,
+        NODES_T_PATH,
+        |id_type| {
+            vec![
+                Field::new(NODE_ID, id_type.clone(), false),
+                Field::new(TIME_COL, DataType::Int64, false),
+                Field::new(TYPE_COL, DataType::Utf8, true),
+            ]
+        },
+        |nodes, g, decoder, writer| {
+            let row_group_size = 100_000;
+
+            let cols = g
+                .node_meta()
+                .temporal_prop_meta()
+                .get_keys()
+                .into_iter()
+                .collect_vec();
+            let cols = &cols;
+            for node_rows in nodes
+                .into_iter()
+                .map(VID)
+                .map(|vid| NodeView::new_internal(g, vid))
+                .flat_map(move |node| {
+                    GenLockedIter::from(node, |node| {
+                        node.rows()
+                            .into_iter()
+                            .map(|(t, props)| ParquetTNode {
+                                node: *node,
+                                cols,
+                                t,
+                                props,
+                            })
+                            .into_dyn_boxed()
+                    })
+                })
+                .chunks(row_group_size)
+                .into_iter()
+                .map(|chunk| chunk.collect_vec())
+            {
+                decoder.serialize(&node_rows)?;
+                if let Some(rb) = decoder.flush()? {
+                    writer.write(&rb)?;
+                    writer.flush()?;
+                }
+            }
+            Ok(())
+        },
+    )
+}
+
+pub(crate) fn encode_nodes_cprop(
+    g: &GraphStorage,
+    path: impl AsRef<Path>,
+) -> Result<(), GraphError> {
+    run_encode(
+        g,
+        g.node_meta().const_prop_meta(),
+        g.unfiltered_num_nodes(),
+        path,
+        NODES_C_PATH,
+        |id_type| {
+            vec![
+                Field::new(NODE_ID, id_type.clone(), false),
+                Field::new(TYPE_COL, DataType::Utf8, true),
+            ]
+        },
+        |nodes, g, decoder, writer| {
+            let row_group_size = 100_000;
+
+            for node_rows in nodes
+                .into_iter()
+                .map(VID)
+                .map(|vid| NodeView::new_internal(g, vid))
+                .map(move |node| ParquetCNode { node })
+                .chunks(row_group_size)
+                .into_iter()
+                .map(|chunk| chunk.collect_vec())
+            {
+                decoder.serialize(&node_rows)?;
+                if let Some(rb) = decoder.flush()? {
+                    writer.write(&rb)?;
+                    writer.flush()?;
+                }
+            }
+            Ok(())
+        },
+    )
+}
diff --git a/raphtory/src/serialise/proto_ext.rs b/raphtory/src/serialise/proto_ext.rs
index ee5893d0c..eb83a2c1d 100644
--- a/raphtory/src/serialise/proto_ext.rs
+++ b/raphtory/src/serialise/proto_ext.rs
@@ -3,9 +3,7 @@ use super::proto::{
     prop_type::{Array as ArrayType, Scalar as ScalarType},
 };
 use crate::{
-    core::{
-        prop_array::PropArray, utils::errors::GraphError, DocumentInput, Lifespan, Prop, PropType,
-    },
+    core::{prop_array::PropArray, utils::errors::GraphError, Prop, PropType},
     serialise::proto::{
         self,
         graph_update::{
@@ -29,7 +27,7 @@ use raphtory_api::core::{
         timeindex::{AsTime, TimeIndexEntry},
     },
 };
-use std::{borrow::Borrow, sync::Arc};
+use std::{borrow::Borrow, collections::HashMap, sync::Arc};
 
 fn as_proto_prop_type(p_type: &PropType) -> Option<SPropType> {
     let val = match p_type {
@@ -43,11 +41,10 @@ fn as_proto_prop_type(p_type: &PropType) -> Option<SPropType> {
         PropType::F32 => SPropType::F32,
         PropType::F64 => SPropType::F64,
         PropType::Bool => SPropType::Bool,
-        PropType::List => SPropType::List,
-        PropType::Map => SPropType::Map,
         PropType::NDTime => SPropType::NdTime,
         PropType::DTime => SPropType::DTime,
-        PropType::Document => SPropType::Document,
+        PropType::Map(_) => SPropType::Map,
+        PropType::List(_) => SPropType::List,
         _ => {
             return None;
         }
@@ -96,11 +93,11 @@ pub fn as_prop_type(p_type: SPropType) -> Option<PropType> {
         SPropType::F32 => Some(PropType::F32),
         SPropType::F64 => Some(PropType::F64),
         SPropType::Bool => Some(PropType::Bool),
-        SPropType::List => Some(PropType::List),
-        SPropType::Map => Some(PropType::Map),
+        SPropType::List => Some(PropType::List(Box::new(PropType::Empty))),
+        SPropType::Map => Some(PropType::Map(HashMap::new())),
         SPropType::NdTime => Some(PropType::NDTime),
         SPropType::DTime => Some(PropType::DTime),
-        SPropType::Document => Some(PropType::Document),
+        SPropType::Document => None,
         SPropType::Graph => None,
         SPropType::PersistentGraph => None,
     }
@@ -666,23 +663,6 @@ fn as_prop_value(value: Option<&prop::Value>) -> Result<Option<Prop>, GraphError> {
         prop::Value::DTime(dt) => Some(Prop::DTime(
             DateTime::parse_from_rfc3339(dt).unwrap().into(),
         )),
-        prop::Value::DocumentInput(doc) => Some(Prop::Document(DocumentInput {
-            content: doc.content.clone(),
-            life: doc
-                .life
-                .as_ref()
-                .map(|l| match l.l_type {
-                    Some(prop::lifespan::LType::Interval(prop::lifespan::Interval {
-                        start,
-                        end,
-                    })) => Lifespan::Interval { start, end },
-                    Some(prop::lifespan::LType::Event(prop::lifespan::Event { time })) => {
-                        Lifespan::Event { time }
-                    }
-                    None => Lifespan::Inherited,
-                })
-                .unwrap_or(Lifespan::Inherited),
-        })),
         prop::Value::Array(blob) => Some(Prop::Array(PropArray::from_vec_u8(&blob.data)?)),
         _ => None,
     };
@@ -755,24 +735,6 @@ fn as_proto_prop(prop: &Prop) -> proto::Prop {
         Prop::Array(blob) => prop::Value::Array(Array {
             data: blob.to_vec_u8(),
         }),
-        Prop::Document(doc) => {
-            let life = match doc.life {
-                Lifespan::Interval { start, end } => {
-                    Some(prop::lifespan::LType::Interval(prop::lifespan::Interval {
-                        start,
-                        end,
-                    }))
-                }
-                Lifespan::Event { time } => {
-                    Some(prop::lifespan::LType::Event(prop::lifespan::Event { time }))
-                }
-                Lifespan::Inherited => None,
-            };
-            prop::Value::DocumentInput(prop::DocumentInput {
-                content: doc.content.clone(),
-                life: Some(prop::Lifespan { l_type: life }),
-            })
-        }
     };
 
     proto::Prop { value: Some(value) }
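
With Document gone and the container tags no longer self-describing, as_prop_type can only return placeholder element types; the decode path in serialise.rs below refines them against the values actually seen. A sketch of the new mapping (assuming as_prop_type and the SPropType alias are in scope as above):

    use std::collections::HashMap;

    fn demo() {
        // Container tags decode to placeholders, to be unified later...
        assert_eq!(
            as_prop_type(SPropType::List),
            Some(PropType::List(Box::new(PropType::Empty)))
        );
        assert_eq!(
            as_prop_type(SPropType::Map),
            Some(PropType::Map(HashMap::new()))
        );
        // ...and Document is no longer a decodable property type.
        assert_eq!(as_prop_type(SPropType::Document), None);
    }
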
diff --git a/raphtory/src/serialise/serialise.rs b/raphtory/src/serialise/serialise.rs
index 615bea918..118f12074 100644
--- a/raphtory/src/serialise/serialise.rs
+++ b/raphtory/src/serialise/serialise.rs
@@ -1,3 +1,4 @@
+use super::{proto_ext::PropTypeExt, GraphFolder};
 use crate::{
     core::{
         entities::{graph::tgraph::TemporalGraph, LayerIds},
@@ -24,15 +25,13 @@ use crate::{
 use itertools::Itertools;
 use prost::Message;
 use raphtory_api::core::{
-    entities::{GidRef, EID, VID},
+    entities::{properties::props::PropMapper, GidRef, EID, VID},
     storage::timeindex::TimeIndexEntry,
-    Direction,
+    unify_types, Direction, PropType,
 };
 use rayon::prelude::*;
 use std::{iter, sync::Arc};
 
-use super::{proto_ext::PropTypeExt, GraphFolder};
-
 macro_rules! zip_tprop_updates {
     ($iter:expr) => {
         &$iter
@@ -83,14 +82,6 @@ pub trait CacheOps: Sized {
 
 impl StableEncode for GraphStorage {
     fn encode_to_proto(&self) -> proto::Graph {
-        #[cfg(feature = "storage")]
-        if let GraphStorage::Disk(storage) = self {
-            assert!(
-                storage.inner.layers().len() <= 1,
-                "Disk based storage not supported right now because it doesn't have aligned edges"
-            );
-        }
-
         let storage = self.lock();
         let mut graph = proto::Graph::default();
 
@@ -334,10 +325,15 @@ impl StableDecode for TemporalGraph {
                 }
             });
 
-        storage
+        let new_edge_property_types = storage
             .write_lock_edges()?
             .into_par_iter_mut()
-            .try_for_each(|mut shard| {
+            .map(|mut shard| {
+                let mut const_prop_types =
+                    vec![PropType::Empty; storage.edge_meta.const_prop_meta().len()];
+                let mut temporal_prop_types =
+                    vec![PropType::Empty; storage.edge_meta.temporal_prop_meta().len()];
+
                 for edge in graph.edges.iter() {
                     if let Some(mut new_edge) = shard.get_mut(edge.eid()) {
                         let edge_store = new_edge.edge_store_mut();
@@ -363,6 +359,13 @@ impl StableDecode for TemporalGraph {
                             for prop_update in update.props() {
                                 let (id, prop) = prop_update?;
                                 let prop = storage.process_prop_value(&prop);
+                                if let Ok(new_type) = unify_types(
+                                    &const_prop_types[id],
+                                    &prop.dtype(),
+                                    &mut false,
+                                ) {
+                                    const_prop_types[id] = new_type; // the types saved in protos are incomplete, so refine them here
+                                }
                                 edge_layer.update_constant_prop(id, prop)?;
                             }
                         }
@@ -377,6 +380,14 @@ impl StableDecode for TemporalGraph {
                             for prop_update in update.props() {
                                 let (id, prop) = prop_update?;
                                 let prop = storage.process_prop_value(&prop);
+                                if let Ok(new_type) = unify_types(
+                                    &temporal_prop_types[id],
+                                    &prop.dtype(),
+                                    &mut false,
+                                ) {
+                                    // the types saved in protos are incomplete, so refine them here
+                                    temporal_prop_types[id] = new_type;
+                                }
                                 edge_layer.add_prop(update.time(), id, prop)?;
                             }
                         }
@@ -387,12 +398,31 @@ impl StableDecode for TemporalGraph {
                     }
                 }
-                Ok::<(), GraphError>(())
-            })?;
-        storage
+                Ok::<_, GraphError>((const_prop_types, temporal_prop_types))
+            })
+            .try_reduce_with(|(l_const, l_temp), (r_const, r_temp)| {
+                unify_property_types(&l_const, &r_const, &l_temp, &r_temp)
+            })
+            .transpose()?;
+
+        if let Some((const_prop_types, temp_prop_types)) = new_edge_property_types {
+            update_meta(
+                const_prop_types,
+                temp_prop_types,
+                &storage.edge_meta.const_prop_meta(),
+                &storage.edge_meta.temporal_prop_meta(),
+            );
+        }
+
+        let new_nodes_property_types = storage
             .write_lock_nodes()?
             .into_par_iter_mut()
-            .try_for_each(|mut shard| {
+            .map(|mut shard| {
+                let mut const_prop_types =
+                    vec![PropType::Empty; storage.node_meta.const_prop_meta().len()];
+                let mut temporal_prop_types =
+                    vec![PropType::Empty; storage.node_meta.temporal_prop_meta().len()];
+
                 for node in graph.nodes.iter() {
                     let vid = VID(node.vid as usize);
                     let gid = match node.gid.as_ref().unwrap() {
@@ -437,6 +467,13 @@ impl StableDecode for TemporalGraph {
                             for prop_update in update.props() {
                                 let (id, prop) = prop_update?;
                                 let prop = storage.process_prop_value(&prop);
+                                if let Ok(new_type) = unify_types(
+                                    &const_prop_types[id],
+                                    &prop.dtype(),
+                                    &mut false,
+                                ) {
+                                    const_prop_types[id] = new_type; // the types saved in protos are incomplete, so refine them here
+                                }
                                 node.update_constant_prop(id, prop)?;
                             }
                         }
@@ -447,6 +484,13 @@ impl StableDecode for TemporalGraph {
                             for prop_update in update.props() {
                                 let (id, prop) = prop_update?;
                                 let prop = storage.process_prop_value(&prop);
+                                if let Ok(new_type) = unify_types(
+                                    &temporal_prop_types[id],
+                                    &prop.dtype(),
+                                    &mut false,
+                                ) {
+                                    temporal_prop_types[id] = new_type; // the types saved in protos are incomplete, so refine them here
+                                }
                                 props.push((id, prop));
                             }
 
@@ -471,33 +515,106 @@ impl StableDecode for TemporalGraph {
                     }
                 }
-                Ok::<(), GraphError>(())
-            })?;
-
-        graph.updates.par_iter().try_for_each(|update| {
-            if let Some(update) = update.update.as_ref() {
-                match update {
-                    Update::UpdateGraphCprops(props) => {
-                        storage.internal_update_constant_properties(&proto_ext::collect_props(
-                            &props.properties,
-                        )?)?;
-                    }
-                    Update::UpdateGraphTprops(props) => {
-                        let time = TimeIndexEntry(props.time, props.secondary as usize);
-                        storage.internal_add_properties(
-                            time,
-                            &proto_ext::collect_props(&props.properties)?,
-                        )?;
+                Ok::<_, GraphError>((const_prop_types, temporal_prop_types))
+            })
+            .try_reduce_with(|(l_const, l_temp), (r_const, r_temp)| {
+                unify_property_types(&l_const, &r_const, &l_temp, &r_temp)
+            })
+            .transpose()?;
+
+        if let Some((const_prop_types, temp_prop_types)) = new_nodes_property_types {
+            update_meta(
+                const_prop_types,
+                temp_prop_types,
+                &storage.node_meta.const_prop_meta(),
+                &storage.node_meta.temporal_prop_meta(),
+            );
+        }
+
+        let graph_prop_new_types = graph
+            .updates
+            .par_iter()
+            .map(|update| {
+                let mut const_prop_types =
+                    vec![PropType::Empty; storage.graph_meta.const_prop_meta().len()];
+                let mut graph_prop_types =
+                    vec![PropType::Empty; storage.graph_meta.temporal_prop_meta().len()];
+
+                if let Some(update) = update.update.as_ref() {
+                    match update {
+                        Update::UpdateGraphCprops(props) => {
+                            let c_props = proto_ext::collect_props(&props.properties)?;
+                            for (id, prop) in &c_props {
+                                const_prop_types[*id] = prop.dtype();
+                            }
+                            storage.internal_update_constant_properties(&c_props)?;
+                        }
+                        Update::UpdateGraphTprops(props) => {
+                            let time = TimeIndexEntry(props.time, props.secondary as usize);
+                            let t_props = proto_ext::collect_props(&props.properties)?;
+                            for (id, prop) in &t_props {
+                                graph_prop_types[*id] = prop.dtype();
+                            }
+                            storage.internal_add_properties(time, &t_props)?;
+                        }
+                        _ => {}
                     }
-                    _ => {}
                 }
-            }
-            Ok::<_, GraphError>(())
-        })?;
+                Ok::<_, GraphError>((const_prop_types, graph_prop_types))
+            })
+            .try_reduce_with(|(l_const, l_temp), (r_const, r_temp)| {
+                unify_property_types(&l_const, &r_const, &l_temp, &r_temp)
+            })
+            .transpose()?;
+
+        if let Some((const_prop_types, temp_prop_types)) = graph_prop_new_types {
+            update_meta(
+                const_prop_types,
+                temp_prop_types,
+                &PropMapper::default(),
+                &storage.graph_meta.temporal_prop_meta(),
+            );
+        }
 
         Ok(storage)
     }
 }
+
+fn update_meta(
+    const_prop_types: Vec<PropType>,
+    temp_prop_types: Vec<PropType>,
+    const_meta: &PropMapper,
+    temp_meta: &PropMapper,
+) {
+    let keys = { const_meta.get_keys().iter().cloned().collect::<Vec<_>>() };
+    for ((id, prop_type), key) in const_prop_types.into_iter().enumerate().zip(keys) {
+        const_meta.set_id_and_dtype(key, id, prop_type);
+    }
+    let keys = { temp_meta.get_keys().iter().cloned().collect::<Vec<_>>() };
+
+    for ((id, prop_type), key) in temp_prop_types.into_iter().enumerate().zip(keys) {
+        temp_meta.set_id_and_dtype(key, id, prop_type);
+    }
+}
+
+fn unify_property_types(
+    l_const: &[PropType],
+    r_const: &[PropType],
+    l_temp: &[PropType],
+    r_temp: &[PropType],
+) -> Result<(Vec<PropType>, Vec<PropType>), GraphError> {
+    let const_pt = l_const
+        .into_iter()
+        .zip(r_const)
+        .map(|(l, r)| unify_types(&l, &r, &mut false))
+        .collect::<Result<Vec<_>, _>>()?;
+    let temp_pt = l_temp
+        .into_iter()
+        .zip(r_temp)
+        .map(|(l, r)| unify_types(&l, &r, &mut false))
+        .collect::<Result<Vec<_>, _>>()?;
+    Ok((const_pt, temp_pt))
+}
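
unify_types is what turns those per-update observations into usable metadata: starting from PropType::Empty (or a placeholder such as List(Empty) from an old proto), each concrete value's dtype widens the running type. A sketch under the signature used above (unify_types(&l, &r, &mut bool) -> Result<PropType, _> is taken from the calls in this diff):

    fn demo() -> Result<(), GraphError> {
        let stored = PropType::List(Box::new(PropType::Empty)); // from the proto tag
        let seen = Prop::List(vec![Prop::str("a")].into()).dtype(); // List(Str)
        let unified = unify_types(&stored, &seen, &mut false)?;
        assert_eq!(unified, PropType::List(Box::new(PropType::Str)));
        Ok(())
    }
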
Prop::List(vec![Prop::str("Bob"), Prop::str("Charlie")].into()), + ), + ("height", Prop::F32(1.75)), + ("name", Prop::str("Alice")), + ("age", Prop::U32(47)), + ("score", Prop::I32(27)), + ])), + Some(Prop::map(vec![ + ("is_adult", Prop::Bool(true)), + ("age", Prop::U32(47)), + ("name", Prop::str("Alice")), + ("score", Prop::I32(27)), + ("height", Prop::F32(1.75)), + ( + "children", + Prop::List(vec![Prop::str("Bob"), Prop::str("Charlie")].into()), + ), + ("weight", Prop::F64(75.5)), + ])), + Some(Prop::map(vec![ + ("weight", Prop::F64(75.5)), + ("name", Prop::str("Alice")), + ("age", Prop::U32(47)), + ("height", Prop::F32(1.75)), + ("score", Prop::I32(27)), + ( + "children", + Prop::List(vec![Prop::str("Bob"), Prop::str("Charlie")].into()), + ), + ("is_adult", Prop::Bool(true)), + ])), ], ), ] .into_iter() .collect(); + let check_prop_mapper = |pm: &PropMapper| { + assert_eq!( + pm.get_id("properties").and_then(|id| pm.get_dtype(id)), + Some(PropType::map([ + ("is_adult", PropType::Bool), + ("weight", PropType::F64), + ("children", PropType::List(Box::new(PropType::Str))), + ("height", PropType::F32), + ("name", PropType::Str), + ("age", PropType::U32), + ("score", PropType::I32), + ])) + ); + assert_eq!( + pm.get_id("children").and_then(|id| pm.get_dtype(id)), + Some(PropType::List(Box::new(PropType::Str))) + ); + }; + + let pm = graph.node_meta().const_prop_meta(); + check_prop_mapper(pm); + + let pm = graph.edge_meta().temporal_prop_meta(); + check_prop_mapper(pm); + + let pm = graph.graph_meta().temporal_prop_meta(); + check_prop_mapper(pm); + let mut vec1 = actual.keys().into_iter().collect::>(); let mut vec2 = expected.keys().into_iter().collect::>(); vec1.sort(); @@ -1326,12 +1427,7 @@ mod proto_test { )); props.push(( "properties", - Prop::Map(Arc::new( - props - .iter() - .map(|(k, v)| (ArcStr::from(*k), v.clone())) - .collect(), - )), + Prop::map(props.iter().map(|(k, v)| (ArcStr::from(*k), v.clone()))), )); let fmt = "%Y-%m-%d %H:%M:%S"; props.push(( @@ -1352,14 +1448,8 @@ mod proto_test { )); props.push(( - "doc", - Prop::Document(DocumentInput { - content: "Hello, World!".into(), - life: Lifespan::Interval { - start: -11i64, - end: 100i64, - }, - }), + "array", + Prop::from_arr::(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), )); } } diff --git a/raphtory/src/vectors/template.rs b/raphtory/src/vectors/template.rs index 3664cbd53..24ac22bec 100644 --- a/raphtory/src/vectors/template.rs +++ b/raphtory/src/vectors/template.rs @@ -152,7 +152,6 @@ impl From for Value { .iter() .map(|(key, value)| (key.to_string(), value.clone())) .collect(), - Prop::Document(value) => Value::from(value.content), } } }