From d3948ab63fe7bfbb6a1a82a3e452ef02080b6356 Mon Sep 17 00:00:00 2001 From: Hyunbin Kim Date: Wed, 2 Oct 2024 23:05:40 +0900 Subject: [PATCH] [IN PROGRESS] optimizing feature extraction --- Cargo.lock | 119 +++++++++---------------------------- Cargo.toml | 5 +- src/structure/core.rs | 7 ++- src/utils/convert.rs | 134 +++++++++++++++++++++++++++++++++--------- 4 files changed, 143 insertions(+), 122 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bddc19b..d9a2d27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bindgen" @@ -29,7 +29,7 @@ version = "0.69.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" dependencies = [ - "bitflags 2.5.0", + "bitflags", "cexpr", "clang-sys", "itertools", @@ -46,12 +46,6 @@ dependencies = [ "which", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -141,11 +135,12 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "dashmap" -version = "5.5.3" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", + "crossbeam-utils", "hashbrown", "lock_api", "once_cell", @@ -200,6 +195,7 @@ dependencies = [ "cmake", "dashmap", "flate2", + "lazy_static", "libc", "memmap2", "peak_alloc", @@ -265,9 +261,9 @@ dependencies = [ [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lazycell" @@ -288,7 +284,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.5", + "windows-targets", ] [[package]] @@ -299,9 +295,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -361,15 +357,15 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.5", + "windows-targets", ] [[package]] @@ -478,11 +474,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.4.1" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ - "bitflags 1.3.2", + "bitflags", ] [[package]] @@ -526,7 +522,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.5.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -576,9 +572,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "syn" @@ -655,22 +651,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.5", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -679,46 +660,28 @@ version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.5", - "windows_aarch64_msvc 0.52.5", - "windows_i686_gnu 0.52.5", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.5", - "windows_x86_64_gnu 0.52.5", - "windows_x86_64_gnullvm 0.52.5", - "windows_x86_64_msvc 0.52.5", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.5" @@ -731,48 +694,24 @@ version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.5" diff --git a/Cargo.toml b/Cargo.toml index 4d7294a..6e9f62f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ pico-args = { version = "0.5.0", features = ["combined-flags"], path = "lib/pico rayon = "1.8.0" byteorder = "1.5.0" memmap2 = "0.9.0" -dashmap = { version = "5.5.3", features = ["rayon"] } rustc-hash = "1.1.0" peak_alloc = "0.2.1" flate2 = { version = "1.0.28" } @@ -32,6 +31,8 @@ toml = "0.8.12" regex = "1.10.4" petgraph = "0.6.4" libc = "0.2.155" +lazy_static = "1.5.0" +dashmap = { version = "6.1.0", features = ["rayon"] } [build-dependencies] @@ -40,4 +41,4 @@ cmake = { version = "0.1.50", optional = true } [features] -foldcomp = ["bindgen", "cmake"] \ No newline at end of file +foldcomp = ["bindgen", "cmake"] diff --git a/src/structure/core.rs b/src/structure/core.rs index dd71450..66eb7cc 100644 --- a/src/structure/core.rs +++ b/src/structure/core.rs @@ -383,12 +383,13 @@ impl CompactStructure { (ca1, ca2, cb1, cb2, n1, n2) { - let ca_dist = self.get_ca_distance(idx1, idx2).unwrap(); + let ca_dist = ca1.calc_distance(&ca2); if ca_dist > dist_cutoff { return None; } - let cb_dist = self.get_cb_distance(idx1, idx2).unwrap(); - let ca_cb_angle = self.get_ca_cb_angle(idx1, idx2, true).unwrap(); + let cb_dist = cb1.calc_distance(&cb2); + // let ca_cb_angle = self.get_ca_cb_angle(idx1, idx2, true).unwrap(); + let ca_cb_angle = ca1.calc_angle(&cb1, &ca2, &cb2, true); let theta1 = calc_torsion_radian(&n1, &ca1, &cb1, &cb2); let theta2 = calc_torsion_radian(&cb1, &cb2, &ca2, &n2); Some((ca_dist, cb_dist, ca_cb_angle, theta1, theta2)) diff --git a/src/utils/convert.rs b/src/utils/convert.rs index c8621c0..977bfdf 100644 --- a/src/utils/convert.rs +++ b/src/utils/convert.rs @@ -1,3 +1,7 @@ + +use std::collections::HashMap; +use lazy_static::lazy_static; + // Constants // 1. for cb_dist pub const MIN_DIST: f32 = 2.0; @@ -49,35 +53,111 @@ pub fn normalize_f32_value(val: f32, min: f32, max: f32) -> f32 { } +// pub fn map_aa_to_u8(aa: &[u8; 3]) -> u8 { +// // Applied to handle the case of non-standard amino acids +// // reference: gemmi/blob/master/src/resinfo.cpp (https://github.com/project-gemmi/gemmi) +// match aa { +// b"ALA" | b"ABA" | b"ORN" | b"DAL" | b"AIB" | b"ALC" | b"MDO" | b"MAA" | b"DAB" => 0, // ALA, A, total 9 +// b"ARG" | b"DAR" | b"CIR" | b"AGM" => 1, // ARG, R, total 4 +// b"ASN" | b"DSG" | b"MEN" | b"SNN" => 2, // ASN, N, total 4 +// b"ASP" | b"0TD" | b"DAS" | b"IAS" | b"PHD" | b"BFD" | b"ASX" => 3, // ASP, D, total 7, ASX is included here +// b"CYS" | b"CSO" | b"CSD" | b"CME" | b"OCS" | b"CAS" | b"CSX" | b"CSS" | +// b"YCM" | b"DCY" | b"SMC" | b"SCH" | b"SCY" | b"CAF" | b"SNC" | b"SEC" => 4, // CYS, C, total 16, SEC is included here +// b"GLN" | b"DGN" | b"CRQ" | b"MEQ" => 5, // GLN, Q, total 4 +// b"GLU" | b"PCA" | b"DGL" | b"CGU" | b"FGA" | b"B3E" | b"GLX" => 6, // GLU, E, total 7, GLX is included here +// b"GLY" | b"CR2" | b"SAR" | b"GHP" | b"GL3" => 7, // GLY, G, total 5 +// b"HIS" | b"HIC" | b"DHI" | b"NEP" | b"CR8" | b"MHS" => 8, // HIS, H, total 6 +// b"ILE" | b"DIL" => 9, // ILE, I, total 2 +// b"LEU" | b"DLE" | b"NLE" | b"MLE" | b"MK8"=> 10, // LEU, L, total 5 +// b"LYS" | b"KCX" | b"LLP" | b"MLY" | b"M3L" | b"ALY" | b"MLZ" | b"DLY" | +// b"KPI" | b"PYL" => 11, // LYS, K, total 10, PYL is included here +// b"MET" | b"MSE" | b"FME" | b"NRQ" | b"CXM" | b"SME" | b"MHO" | b"MED" => 12, // MET, M, total 8 +// b"PHE" | b"DPN" | b"PHI" | b"MEA" | b"PHL" => 13, // PHE, F, total 5 +// b"PRO" | b"HYP" | b"DPR" => 14, // PRO, P, total 3 +// b"SER" | b"CSH" | b"SEP" | b"DSN" | b"SAC" | b"GYS" | b"DHA" | b"OAS" => 15, // SER, S, total 8 +// b"THR" | b"TPO" | b"CRO" | b"DTH" | b"BMT" | b"CRF" => 16, // THR, T, total 6 +// b"TRP" | b"DTR" | b"TRQ" | b"TOX" | b"0AF" => 17, // TRP, W, total 5 +// b"TYR" | b"PTR" | b"TYS" | b"TPQ" | b"DTY" | b"OMY" => 18, +// b"VAL" | b"DVA" | b"MVA" | b"FVA" => 19, +// _ => 255, +// } +// } + +lazy_static! { + static ref AA_MAP: HashMap<&'static [u8; 3], u8> = { + let mut m: HashMap<&'static [u8; 3], u8> = HashMap::new(); + // ALA group + let ala_codes = [b"ALA", b"ABA", b"ORN", b"DAL", b"AIB", b"ALC", b"MDO", b"MAA", b"DAB"]; + for code in ala_codes.iter() { + m.insert(code, 0); + } + // ARG group + let arg_codes = [b"ARG", b"DAR", b"CIR", b"AGM"]; + for code in arg_codes.iter() { m.insert(code, 1); } + // ASN group + let asn_codes = [b"ASN", b"DSG", b"MEN", b"SNN"]; + for code in asn_codes.iter() { m.insert(code, 2); } + // ASP group + let asp_codes = [b"ASP", b"0TD", b"DAS", b"IAS", b"PHD", b"BFD", b"ASX"]; + for code in asp_codes.iter() { m.insert(code, 3); } + // CYS group (total 16 including SEC) + let cys_codes = [b"CYS", b"CSO", b"CSD", b"CME", b"OCS", b"CAS", b"CSX", b"CSS", b"YCM", b"DCY", b"SMC", b"SCH", b"SCY", b"CAF", b"SNC", b"SEC"]; + for code in cys_codes.iter() { + m.insert(code, 4); + } + // GLN group + let gln_codes = [b"GLN", b"DGN", b"CRQ", b"MEQ"]; + for code in gln_codes.iter() { m.insert(code, 5); } + // GLU group + let glu_codes = [b"GLU", b"PCA", b"DGL", b"CGU", b"FGA", b"B3E", b"GLX"]; + for code in glu_codes.iter() { m.insert(code, 6); } + // GLY group + let gly_codes = [b"GLY", b"CR2", b"SAR", b"GHP", b"GL3"]; + for code in gly_codes.iter() { m.insert(code, 7); } + // HIS group + let his_codes = [b"HIS", b"HIC", b"DHI", b"NEP", b"CR8", b"MHS"]; + for code in his_codes.iter() { m.insert(code, 8); } + // ILE group + let ile_codes = [b"ILE", b"DIL"]; + for code in ile_codes.iter() { m.insert(code, 9); } + // LEU group + let leu_codes = [b"LEU", b"DLE", b"NLE", b"MLE", b"MK8"]; + for code in leu_codes.iter() { m.insert(code, 10); } + // LYS group + let lys_codes = [b"LYS", b"KCX", b"LLP", b"MLY", b"M3L", b"ALY", b"MLZ", b"DLY", b"KPI", b"PYL"]; + for code in lys_codes.iter() { m.insert(code, 11); } + // MET group + let met_codes = [b"MET", b"MSE", b"FME", b"NRQ", b"CXM", b"SME", b"MHO", b"MED"]; + for code in met_codes.iter() { m.insert(code, 12); } + // PHE group + let phe_codes = [b"PHE", b"DPN", b"PHI", b"MEA", b"PHL"]; + for code in phe_codes.iter() { m.insert(code, 13); } + // PRO group + let pro_codes = [b"PRO", b"HYP", b"DPR"]; + for code in pro_codes.iter() { m.insert(code, 14); } + // SER group + let ser_codes = [b"SER", b"CSH", b"SEP", b"DSN", b"SAC", b"GYS", b"DHA", b"OAS"]; + for code in ser_codes.iter() { m.insert(code, 15); } + // THR group + let thr_codes = [b"THR", b"TPO", b"CRO", b"DTH", b"BMT", b"CRF"]; + for code in thr_codes.iter() { m.insert(code, 16); } + // TRP group + let trp_codes = [b"TRP", b"DTR", b"TRQ", b"TOX", b"0AF"]; + for code in trp_codes.iter() { m.insert(code, 17); } + // TYR group + let tyr_codes = [b"TYR", b"PTR", b"TYS", b"TPQ", b"DTY", b"OMY"]; + for code in tyr_codes.iter() { m.insert(code, 18); } + // VAL group + let val_codes = [b"VAL", b"DVA", b"MVA", b"FVA"]; + for code in val_codes.iter() { m.insert(code, 19); } + m + }; +} + pub fn map_aa_to_u8(aa: &[u8; 3]) -> u8 { - // Applied to handle the case of non-standard amino acids - // reference: gemmi/blob/master/src/resinfo.cpp (https://github.com/project-gemmi/gemmi) - match aa { - b"ALA" | b"ABA" | b"ORN" | b"DAL" | b"AIB" | b"ALC" | b"MDO" | b"MAA" | b"DAB" => 0, // ALA, A, total 9 - b"ARG" | b"DAR" | b"CIR" | b"AGM" => 1, // ARG, R, total 4 - b"ASN" | b"DSG" | b"MEN" | b"SNN" => 2, // ASN, N, total 4 - b"ASP" | b"0TD" | b"DAS" | b"IAS" | b"PHD" | b"BFD" | b"ASX" => 3, // ASP, D, total 7, ASX is included here - b"CYS" | b"CSO" | b"CSD" | b"CME" | b"OCS" | b"CAS" | b"CSX" | b"CSS" | - b"YCM" | b"DCY" | b"SMC" | b"SCH" | b"SCY" | b"CAF" | b"SNC" | b"SEC" => 4, // CYS, C, total 16, SEC is included here - b"GLN" | b"DGN" | b"CRQ" | b"MEQ" => 5, // GLN, Q, total 4 - b"GLU" | b"PCA" | b"DGL" | b"CGU" | b"FGA" | b"B3E" | b"GLX" => 6, // GLU, E, total 7, GLX is included here - b"GLY" | b"CR2" | b"SAR" | b"GHP" | b"GL3" => 7, // GLY, G, total 5 - b"HIS" | b"HIC" | b"DHI" | b"NEP" | b"CR8" | b"MHS" => 8, // HIS, H, total 6 - b"ILE" | b"DIL" => 9, // ILE, I, total 2 - b"LEU" | b"DLE" | b"NLE" | b"MLE" | b"MK8"=> 10, // LEU, L, total 5 - b"LYS" | b"KCX" | b"LLP" | b"MLY" | b"M3L" | b"ALY" | b"MLZ" | b"DLY" | - b"KPI" | b"PYL" => 11, // LYS, K, total 10, PYL is included here - b"MET" | b"MSE" | b"FME" | b"NRQ" | b"CXM" | b"SME" | b"MHO" | b"MED" => 12, // MET, M, total 8 - b"PHE" | b"DPN" | b"PHI" | b"MEA" | b"PHL" => 13, // PHE, F, total 5 - b"PRO" | b"HYP" | b"DPR" => 14, // PRO, P, total 3 - b"SER" | b"CSH" | b"SEP" | b"DSN" | b"SAC" | b"GYS" | b"DHA" | b"OAS" => 15, // SER, S, total 8 - b"THR" | b"TPO" | b"CRO" | b"DTH" | b"BMT" | b"CRF" => 16, // THR, T, total 6 - b"TRP" | b"DTR" | b"TRQ" | b"TOX" | b"0AF" => 17, // TRP, W, total 5 - b"TYR" | b"PTR" | b"TYS" | b"TPQ" | b"DTY" | b"OMY" => 18, - b"VAL" | b"DVA" | b"MVA" | b"FVA" => 19, - _ => 255, - } + *AA_MAP.get(aa).unwrap_or(&255) } + pub fn map_u8_to_aa(aa: u8) -> &'static str { match aa { 0 => "ALA",