From d5dd4a628ddb5ff1e3969fa96ae694a4668a85ca Mon Sep 17 00:00:00 2001 From: benolt Date: Fri, 28 Jun 2024 07:14:55 +0200 Subject: [PATCH 1/2] Local image parsing --- shinkai-libs/shinkai-ocr/Cargo.toml | 6 +- shinkai-libs/shinkai-ocr/README.md | 6 +- shinkai-libs/shinkai-ocr/build.rs | 91 ++++++++++--------- shinkai-libs/shinkai-ocr/src/lib.rs | 1 + shinkai-libs/shinkai-ocr/src/pdf_parser.rs | 5 +- .../shinkai-vector-resources/Cargo.toml | 6 +- .../local_parsing/image_parsing.rs | 21 +++++ .../local_parsing/local_parsing.rs | 4 +- .../src/file_parser/local_parsing/mod.rs | 1 + .../src/resource_errors.rs | 2 + 10 files changed, 86 insertions(+), 57 deletions(-) create mode 100644 shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/image_parsing.rs diff --git a/shinkai-libs/shinkai-ocr/Cargo.toml b/shinkai-libs/shinkai-ocr/Cargo.toml index e2ec5bcf0..e266265a1 100644 --- a/shinkai-libs/shinkai-ocr/Cargo.toml +++ b/shinkai-libs/shinkai-ocr/Cargo.toml @@ -16,7 +16,8 @@ tokio = { version = "1", features = ["full"] } [features] default = [] -static = ["pdfium-render/static", "pdfium-render/libc++"] +static-pdf-parser = ["pdfium-render/static", "pdfium-render/libc++"] +dynamic-pdf-parser = [] [[test]] name = "image_parser_tests" @@ -24,4 +25,5 @@ path = "tests/image_parser_tests.rs" [[test]] name = "pdf_parser_tests" -path = "tests/pdf_parser_tests.rs" \ No newline at end of file +path = "tests/pdf_parser_tests.rs" +required-features = ["static-pdf-parser"] \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/README.md b/shinkai-libs/shinkai-ocr/README.md index 1066cf794..20516a673 100644 --- a/shinkai-libs/shinkai-ocr/README.md +++ b/shinkai-libs/shinkai-ocr/README.md @@ -18,10 +18,10 @@ RUSTFLAGS=-g cargo build --release ### Static linking PDFium -By default the project binds to the PDFium dynamic library at runtime. 
To statically link PDFium build with feature `static` enabled: +By default the project binds to the PDFium dynamic library at runtime. To statically link PDFium build with feature `static-pdf-parser` enabled: ```sh -cargo build --release --features static +cargo build --release --features static-pdf-parser ``` The project needs to link the PDFium static library which should be available as `libpdfium.a` in the PDFium directory. If you wish to build PDFium from source follow the steps in the *Building PDFium static library from source* section. @@ -77,5 +77,5 @@ PDFIUM_DYNAMIC_LIB_PATH=$(PWD)/pdfium/linux-x64 cargo test -- --test-threads=1 ## Running tests ```sh -cargo test --features static -- --test-threads=1 +cargo test --features static-pdf-parser -- --test-threads=1 ``` \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/build.rs b/shinkai-libs/shinkai-ocr/build.rs index b4cd9fb9f..e3c2446db 100644 --- a/shinkai-libs/shinkai-ocr/build.rs +++ b/shinkai-libs/shinkai-ocr/build.rs @@ -1,65 +1,68 @@ -use std::{env, path::PathBuf}; - fn main() { - #[cfg(target_os = "linux")] - let os = "linux"; - - #[cfg(target_os = "macos")] - let os = "mac"; + #[cfg(any(feature = "dynamic-pdf-parser", feature = "static-pdf-parser"))] + { + use std::{env, path::PathBuf}; - #[cfg(target_os = "windows")] - let os = "win"; + #[cfg(target_os = "linux")] + let os = "linux"; - #[cfg(target_arch = "aarch64")] - let arch = "arm64"; + #[cfg(target_os = "macos")] + let os = "mac"; - #[cfg(target_arch = "x86_64")] - let arch = "x64"; + #[cfg(target_os = "windows")] + let os = "win"; - let current_directory = env::var("CARGO_MANIFEST_DIR").unwrap(); + #[cfg(target_arch = "aarch64")] + let arch = "arm64"; - let pdfium_directory = format!("pdfium/{}-{}", os, arch); - let pdfium_lib_path = PathBuf::from(&current_directory).join(pdfium_directory); + #[cfg(target_arch = "x86_64")] + let arch = "x64"; - #[cfg(feature = "static")] - { - println!("cargo:rustc-link-search=native={}",
pdfium_lib_path.display()); - println!("cargo:rustc-link-lib=static=pdfium"); + let current_directory = env::var("CARGO_MANIFEST_DIR").unwrap(); - #[cfg(target_os = "linux")] - println!("cargo:rustc-link-lib=dylib=stdc++"); + let pdfium_directory = format!("pdfium/{}-{}", os, arch); + let pdfium_lib_path = PathBuf::from(&current_directory).join(pdfium_directory); - #[cfg(target_os = "macos")] + #[cfg(feature = "static-pdf-parser")] { - println!("cargo:rustc-link-lib=dylib=c++"); - println!("cargo:rustc-link-lib=framework=CoreGraphics"); + println!("cargo:rustc-link-search=native={}", pdfium_lib_path.display()); + println!("cargo:rustc-link-lib=static=pdfium"); + + #[cfg(target_os = "linux")] + println!("cargo:rustc-link-lib=dylib=stdc++"); + + #[cfg(target_os = "macos")] + { + println!("cargo:rustc-link-lib=dylib=c++"); + println!("cargo:rustc-link-lib=framework=CoreGraphics"); + } } - } - #[cfg(not(feature = "static"))] - { - let out_dir = env::var("OUT_DIR").unwrap(); - let out_dir = PathBuf::from(&out_dir); - let out_dir = out_dir.iter().collect::<Vec<_>>(); + #[cfg(feature = "dynamic-pdf-parser")] + { + let out_dir = env::var("OUT_DIR").unwrap(); + let out_dir = PathBuf::from(&out_dir); + let out_dir = out_dir.iter().collect::<Vec<_>>(); - let target_dir = out_dir.iter().take(out_dir.len() - 4).collect::<PathBuf>(); - let bin_dir = target_dir.join(env::var("PROFILE").unwrap()); - let pdfium_dest_dir = bin_dir.join(format!("pdfium/{}-{}", os, arch)); + let target_dir = out_dir.iter().take(out_dir.len() - 4).collect::<PathBuf>(); + let bin_dir = target_dir.join(env::var("PROFILE").unwrap()); + let pdfium_dest_dir = bin_dir.join(format!("pdfium/{}-{}", os, arch)); - let _ = std::fs::create_dir_all(&pdfium_dest_dir); + let _ = std::fs::create_dir_all(&pdfium_dest_dir); - #[cfg(target_os = "linux")] - let pdfium_lib = "libpdfium.so"; + #[cfg(target_os = "linux")] + let pdfium_lib = "libpdfium.so"; - #[cfg(target_os = "macos")] - let pdfium_lib = "libpdfium.dylib"; + #[cfg(target_os = "macos")] + let
pdfium_lib = "libpdfium.dylib"; - #[cfg(target_os = "windows")] - let pdfium_lib = "pdfium.dll"; + #[cfg(target_os = "windows")] + let pdfium_lib = "pdfium.dll"; - let pdfium_lib_source = pdfium_lib_path.join(pdfium_lib); - let pdfium_lib_dest = pdfium_dest_dir.join(pdfium_lib); + let pdfium_lib_source = pdfium_lib_path.join(pdfium_lib); + let pdfium_lib_dest = pdfium_dest_dir.join(pdfium_lib); - std::fs::copy(pdfium_lib_source, pdfium_lib_dest).unwrap(); + std::fs::copy(pdfium_lib_source, pdfium_lib_dest).unwrap(); + } } } diff --git a/shinkai-libs/shinkai-ocr/src/lib.rs b/shinkai-libs/shinkai-ocr/src/lib.rs index 65a95da7a..330330c3d 100644 --- a/shinkai-libs/shinkai-ocr/src/lib.rs +++ b/shinkai-libs/shinkai-ocr/src/lib.rs @@ -1,2 +1,3 @@ pub mod image_parser; +#[cfg(any(feature = "dynamic-pdf-parser", feature = "static-pdf-parser"))] pub mod pdf_parser; diff --git a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs b/shinkai-libs/shinkai-ocr/src/pdf_parser.rs index 53b0b13ed..5c87ecb5c 100644 --- a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs +++ b/shinkai-libs/shinkai-ocr/src/pdf_parser.rs @@ -1,5 +1,4 @@ use pdfium_render::prelude::*; -use std::io::Write; use crate::image_parser::ImageParser; @@ -22,7 +21,7 @@ impl PDFParser { pub fn new() -> anyhow::Result<Self> { let image_parser = ImageParser::new()?; - #[cfg(not(feature = "static"))] + #[cfg(feature = "dynamic-pdf-parser")] let pdfium = { let lib_path = match std::env::var("PDFIUM_DYNAMIC_LIB_PATH").ok() { Some(lib_path) => lib_path, @@ -49,7 +48,7 @@ impl PDFParser { Pdfium::new(Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(&lib_path)).unwrap()) }; - #[cfg(feature = "static")] + #[cfg(feature = "static-pdf-parser")] let pdfium = Pdfium::new(Pdfium::bind_to_statically_linked_library().unwrap()); Ok(PDFParser { image_parser, pdfium }) diff --git
a/shinkai-libs/shinkai-vector-resources/Cargo.toml +++ b/shinkai-libs/shinkai-vector-resources/Cargo.toml @@ -36,7 +36,7 @@ base64 = "0.13.0" futures = "0.3.30" urlencoding = "1.1.1" docx-rust = "0.1.7" -shinkai_ocr = { path = "../shinkai-ocr", optional = true } +shinkai_ocr = { path = "../shinkai-ocr" } [build-dependencies] reqwest = { version = "0.11.26", features = ["json", "tokio-native-tls", "blocking", "multipart"] } @@ -44,8 +44,8 @@ reqwest = { version = "0.11.26", features = ["json", "tokio-native-tls", "blocki [features] default = ["desktop-only"] desktop-only = ["reqwest/blocking", "comrak"] -dynamic-pdf-parser = ["shinkai_ocr"] -static-pdf-parser = ["shinkai_ocr/static"] +dynamic-pdf-parser = ["shinkai_ocr/dynamic-pdf-parser"] +static-pdf-parser = ["shinkai_ocr/static-pdf-parser"] wasm-http = [] diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/image_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/image_parsing.rs new file mode 100644 index 000000000..28c043114 --- /dev/null +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/image_parsing.rs @@ -0,0 +1,21 @@ +use shinkai_ocr::image_parser::ImageParser; + +use crate::{ + file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, + resource_errors::VRError, +}; + +use super::LocalFileParser; + +impl LocalFileParser { + pub fn process_image_file(file_buffer: Vec<u8>, max_node_text_size: u64) -> Result<Vec<TextGroup>, VRError> { + let image_parser = ImageParser::new().map_err(|_| VRError::FailedImageParsing)?; + let text = image_parser + .process_image_file(file_buffer) + .map_err(|_| VRError::FailedImageParsing)?; + + let text_groups = ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size); + + Ok(text_groups) + } +} diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs
index d855c871a..5741fc6b4 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs @@ -24,8 +24,7 @@ impl LocalFileParser { VRSourceReference::Standard(source) => match source { SourceReference::Other(_) => Err(VRError::UnsupportedFileType(file_name.to_string())), SourceReference::FileRef(file_source) => match file_source.clone().file_type { - SourceFileType::Image(_) - | SourceFileType::Code(_) + SourceFileType::Code(_) | SourceFileType::ConfigFileType(_) | SourceFileType::Video(_) | SourceFileType::Audio(_) @@ -47,6 +46,7 @@ impl LocalFileParser { _ => Err(VRError::UnsupportedFileType(file_name.to_string())), }, + SourceFileType::Image(_) => LocalFileParser::process_image_file(file_buffer, max_node_text_size), }, SourceReference::ExternalURI(_) => Err(VRError::UnsupportedFileType(file_name.to_string())), }, diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/mod.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/mod.rs index 45184bdb1..770e91387 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/mod.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/mod.rs @@ -1,6 +1,7 @@ pub mod csv_parsing; pub mod docx_parsing; pub mod html_parsing; +pub mod image_parsing; pub mod json_parsing; pub mod local_parsing; pub mod md_parsing; diff --git a/shinkai-libs/shinkai-vector-resources/src/resource_errors.rs b/shinkai-libs/shinkai-vector-resources/src/resource_errors.rs index a1b778caa..c971fca94 100644 --- a/shinkai-libs/shinkai-vector-resources/src/resource_errors.rs +++ b/shinkai-libs/shinkai-vector-resources/src/resource_errors.rs @@ -11,6 +11,7 @@ pub enum VRError { FailedEmbeddingGeneration(String), NoNodeFound, InvalidModelArchitecture, + FailedImageParsing, FailedJSONParsing, FailedCSVParsing, FailedPDFParsing, @@ -54,6 +55,7 @@ 
impl fmt::Display for VRError { VRError::InvalidModelArchitecture => { write!(f, "An unsupported model architecture was specified.") } + VRError::FailedImageParsing => write!(f, "Failed image parsing."), VRError::FailedJSONParsing => write!(f, "Failed JSON parsing."), VRError::FailedCSVParsing => write!(f, "Failed CSV parsing."), VRError::FailedPDFParsing => write!(f, "Failed PDF parsing."), From fa32302cd5c64d5de5a677c91586ebcada8fe0ec Mon Sep 17 00:00:00 2001 From: benolt Date: Tue, 16 Jul 2024 12:50:27 +0200 Subject: [PATCH 2/2] update build script and dynamic binding --- shinkai-libs/shinkai-ocr/build.rs | 5 +---- shinkai-libs/shinkai-ocr/src/pdf_parser.rs | 8 +++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/shinkai-libs/shinkai-ocr/build.rs b/shinkai-libs/shinkai-ocr/build.rs index e3c2446db..c0c21d387 100644 --- a/shinkai-libs/shinkai-ocr/build.rs +++ b/shinkai-libs/shinkai-ocr/build.rs @@ -46,9 +46,6 @@ fn main() { let target_dir = out_dir.iter().take(out_dir.len() - 4).collect::<PathBuf>(); let bin_dir = target_dir.join(env::var("PROFILE").unwrap()); - let pdfium_dest_dir = bin_dir.join(format!("pdfium/{}-{}", os, arch)); - - let _ = std::fs::create_dir_all(&pdfium_dest_dir); #[cfg(target_os = "linux")] let pdfium_lib = "libpdfium.so"; @@ -60,7 +57,7 @@ fn main() { let pdfium_lib = "pdfium.dll"; let pdfium_lib_source = pdfium_lib_path.join(pdfium_lib); - let pdfium_lib_dest = pdfium_dest_dir.join(pdfium_lib); + let pdfium_lib_dest = bin_dir.join(pdfium_lib); std::fs::copy(pdfium_lib_source, pdfium_lib_dest).unwrap(); } diff --git a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs b/shinkai-libs/shinkai-ocr/src/pdf_parser.rs index 82802cb10..f26eaaf07 100644 --- a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs +++ b/shinkai-libs/shinkai-ocr/src/pdf_parser.rs @@ -47,7 +47,13 @@ impl PDFParser { } }; - Pdfium::new(Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(&lib_path)).unwrap()) + // Look for the dynamic library in the specified
path or fall back to the current directory. + let bindings = match Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(&lib_path)) { + Ok(bindings) => bindings, + Err(_) => Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))?, + }; + + Pdfium::new(bindings) }; #[cfg(feature = "static-pdf-parser")]