-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Bellman-Ford-inspired shortest path for a distributed graph. Works better than the approach used in approximated centrality when the graph is sharded.
Some low-hanging optimisations still need to be implemented.
- Loading branch information
1 parent
7633b61
commit 49abc74
Showing
13 changed files
with
746 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
source = "https://www.cdc.gov/healthywater/swimming/" | ||
host = "0.0.0.0:5000" | ||
output_path = "data/shortest_paths" | ||
|
||
[gossip] | ||
addr = "0.0.0.0:5001" | ||
seed_nodes = ["0.0.0.0:3102"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
node_id = 0 | ||
shard = 0 | ||
host = "0.0.0.0:3101" | ||
|
||
[gossip] | ||
addr = "0.0.0.0:3102" | ||
seed_nodes = [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
shard = 0 | ||
graph_path = "data/webgraph" | ||
host = "0.0.0.0:5002" | ||
|
||
[gossip] | ||
addr = "0.0.0.0:5003" | ||
seed_nodes = ["0.0.0.0:3102"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,3 +17,4 @@ | |
pub mod approximated_harmonic_centrality; | ||
pub mod dht; | ||
pub mod harmonic_centrality; | ||
pub mod shortest_path; |
230 changes: 230 additions & 0 deletions
230
crates/core/src/entrypoint/ampc/shortest_path/coordinator.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
// Stract is an open source web search engine. | ||
// Copyright (C) 2024 Stract ApS | ||
// | ||
// This program is free software: you can redistribute it and/or modify | ||
// it under the terms of the GNU Affero General Public License as | ||
// published by the Free Software Foundation, either version 3 of the | ||
// License, or (at your option) any later version. | ||
// | ||
// This program is distributed in the hope that it will be useful, | ||
// but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
// GNU Affero General Public License for more details. | ||
// | ||
// You should have received a copy of the GNU Affero General Public License | ||
// along with this program. If not, see <https://www.gnu.org/licenses/> | ||
|
||
use std::collections::BTreeMap; | ||
use std::net::SocketAddr; | ||
use std::path::Path; | ||
|
||
use itertools::Itertools; | ||
use url::Url; | ||
|
||
use super::mapper::ShortestPathMapper; | ||
use super::{worker::RemoteShortestPathWorker, ShortestPathTables}; | ||
use super::{DhtTable as _, Finisher, Meta, Setup, ShortestPathJob}; | ||
use crate::ampc::{Coordinator, DefaultDhtTable, DhtConn}; | ||
use crate::config::ShortestPathCoordinatorConfig; | ||
use crate::distributed::cluster::Cluster; | ||
use crate::distributed::member::{Member, Service, ShardId}; | ||
use crate::webpage::url_ext::UrlExt; | ||
use crate::{webgraph, Result}; | ||
|
||
pub struct ShortestPathSetup { | ||
dht: DhtConn<ShortestPathTables>, | ||
source: webgraph::NodeID, | ||
} | ||
|
||
impl ShortestPathSetup { | ||
pub async fn new(cluster: &Cluster, source: webgraph::NodeID) -> Result<Self> { | ||
let dht_members: Vec<_> = cluster | ||
.members() | ||
.await | ||
.into_iter() | ||
.filter_map(|member| { | ||
if let Service::Dht { host, shard } = member.service { | ||
Some((shard, host)) | ||
} else { | ||
None | ||
} | ||
}) | ||
.collect(); | ||
|
||
Ok(Self::new_for_dht_members(&dht_members, source)) | ||
} | ||
pub fn new_for_dht_members( | ||
dht_members: &[(ShardId, SocketAddr)], | ||
source: webgraph::NodeID, | ||
) -> Self { | ||
let initial = ShortestPathTables { | ||
distances: DefaultDhtTable::new(dht_members, "distances"), | ||
meta: DefaultDhtTable::new(dht_members, "meta"), | ||
changed_nodes: DefaultDhtTable::new(dht_members, "changed_nodes"), | ||
}; | ||
|
||
let dht = DhtConn::new(initial); | ||
|
||
Self { dht, source } | ||
} | ||
} | ||
|
||
impl Setup for ShortestPathSetup { | ||
type DhtTables = ShortestPathTables; | ||
|
||
fn init_dht(&self) -> DhtConn<Self::DhtTables> { | ||
self.dht.clone() | ||
} | ||
|
||
fn setup_round(&self, dht: &Self::DhtTables) { | ||
dht.meta.set( | ||
(), | ||
Meta { | ||
round_had_changes: false, | ||
}, | ||
); | ||
} | ||
|
||
fn setup_first_round(&self, dht: &Self::DhtTables) { | ||
dht.distances.set(self.source, 0); | ||
dht.meta.set( | ||
(), | ||
Meta { | ||
round_had_changes: true, | ||
}, | ||
); | ||
} | ||
} | ||
|
||
pub struct ShortestPathFinish; | ||
|
||
impl Finisher for ShortestPathFinish { | ||
type Job = ShortestPathJob; | ||
|
||
fn is_finished(&self, dht: &ShortestPathTables) -> bool { | ||
!dht.meta.get(()).unwrap().round_had_changes | ||
} | ||
} | ||
|
||
pub fn build( | ||
dht: &[(ShardId, SocketAddr)], | ||
workers: Vec<RemoteShortestPathWorker>, | ||
source: webgraph::NodeID, | ||
) -> Coordinator<ShortestPathJob> { | ||
let setup = ShortestPathSetup::new_for_dht_members(dht, source); | ||
|
||
Coordinator::new(setup, workers.clone()) | ||
.with_mapper(ShortestPathMapper::RelaxEdges) | ||
.with_mapper(ShortestPathMapper::UpdateChangedNodes) | ||
} | ||
|
||
struct ClusterInfo { | ||
// dropping the handle will leave the cluster | ||
_handle: Cluster, | ||
dht: Vec<(ShardId, SocketAddr)>, | ||
workers: Vec<RemoteShortestPathWorker>, | ||
} | ||
|
||
async fn setup_gossip(config: ShortestPathCoordinatorConfig) -> Result<ClusterInfo> { | ||
let handle = Cluster::join( | ||
Member::new(Service::ShortestPathCoordinator { host: config.host }), | ||
config.gossip.addr, | ||
config.gossip.seed_nodes.unwrap_or_default(), | ||
) | ||
.await?; | ||
|
||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; | ||
|
||
let members = handle.members().await; | ||
|
||
let dht = members | ||
.iter() | ||
.filter_map(|member| { | ||
if let Service::Dht { host, shard } = member.service { | ||
Some((shard, host)) | ||
} else { | ||
None | ||
} | ||
}) | ||
.collect(); | ||
|
||
let workers = members | ||
.iter() | ||
.filter_map(|member| { | ||
if let Service::ShortestPathWorker { host, shard } = member.service { | ||
Some(RemoteShortestPathWorker::new(shard, host)) | ||
} else { | ||
None | ||
} | ||
}) | ||
.collect::<Result<Vec<RemoteShortestPathWorker>>>()?; | ||
|
||
Ok(ClusterInfo { | ||
_handle: handle, | ||
dht, | ||
workers, | ||
}) | ||
} | ||
|
||
pub fn run(config: ShortestPathCoordinatorConfig) -> Result<()> { | ||
let source = webgraph::Node::from(Url::robust_parse(&config.source)?).id(); | ||
let tokio_conf = config.clone(); | ||
let cluster = tokio::runtime::Builder::new_current_thread() | ||
.enable_all() | ||
.build()? | ||
.block_on(setup_gossip(tokio_conf))?; | ||
|
||
let jobs: Vec<_> = cluster | ||
.workers | ||
.iter() | ||
.map(|worker| ShortestPathJob { | ||
shard: worker.shard(), | ||
source, | ||
}) | ||
.collect(); | ||
|
||
tracing::info!("starting {} jobs", jobs.len()); | ||
|
||
let coordinator = build(&cluster.dht, cluster.workers.clone(), source); | ||
let res = coordinator.run(jobs, ShortestPathFinish)?; | ||
|
||
let output_path = Path::new(&config.output_path); | ||
|
||
if !output_path.exists() { | ||
std::fs::create_dir_all(output_path)?; | ||
} | ||
|
||
let mut writer = csv::Writer::from_writer( | ||
std::fs::OpenOptions::new() | ||
.create(true) | ||
.write(true) | ||
.truncate(true) | ||
.open(output_path.join("distances.csv"))?, | ||
); | ||
let mut distances = res.distances.iter().collect::<Vec<_>>(); | ||
distances.sort_by_key(|(_id, distance)| *distance); | ||
|
||
let id2node: BTreeMap<_, _> = cluster | ||
.workers | ||
.iter() | ||
.flat_map(|w| { | ||
distances | ||
.iter() | ||
.chunks(10_000) | ||
.into_iter() | ||
.flat_map(move |c| { | ||
let ids = c.map(|(id, _)| *id).collect::<Vec<_>>(); | ||
w.batch_id2node(ids) | ||
}) | ||
.collect::<Vec<_>>() | ||
}) | ||
.collect(); | ||
|
||
for (id, distance) in distances { | ||
if let Some(node) = id2node.get(&id) { | ||
writer.write_record(&[node.as_str().to_string(), distance.to_string()])?; | ||
} | ||
} | ||
|
||
Ok(()) | ||
} |
Oops, something went wrong.