diff --git a/brood/src/commands.rs b/brood/src/commands.rs
index 6c9bd5e..9885f8a 100644
--- a/brood/src/commands.rs
+++ b/brood/src/commands.rs
@@ -1,5 +1,6 @@
 pub mod export;
 pub mod ingest;
+pub mod longest_path;
 pub mod path;
 pub mod show;
 pub mod stats;
diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs
deleted file mode 100644
index 5f659ea..0000000
--- a/brood/src/commands/list_pages.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-use std::fs::File;
-use std::io::{self, BufReader};
-use std::path::Path;
-
-use crate::data::store;
-
-pub fn run(datafile: &Path) -> io::Result<()> {
-    let mut databuf = BufReader::new(File::open(datafile)?);
-    let data = store::read_adjacency_list(&mut databuf)?;
-
-    for (page_idx, page) in data.pages() {
-        if page.data.redirect {
-            for link_idx in data.link_range(page_idx) {
-                let target_page = data.page(data.link(link_idx).to);
-                println!("{:?} -> {:?}", page.data.title, target_page.data.title);
-            }
-        } else {
-            println!("{:?}", page.data.title);
-        }
-    }
-
-    Ok(())
-}
diff --git a/brood/src/commands/longest_path.rs b/brood/src/commands/longest_path.rs
new file mode 100644
index 0000000..1ac8e40
--- /dev/null
+++ b/brood/src/commands/longest_path.rs
@@ -0,0 +1,70 @@
+use std::io;
+
+use crate::{
+    algo::Dijkstra,
+    data::Data,
+    graph::NodeIdx,
+    util::{self, TitleNormalizer},
+};
+
+/// Find the article with the longest shortest path away from the starting
+/// article.
+#[derive(Debug, clap::Parser)]
+pub struct Cmd {
+    start: String,
+    #[arg(long, short, default_value_t = 1)]
+    top: usize,
+}
+
+fn print_path(data: &Data, start: NodeIdx, goal: NodeIdx, path: Option<(u32, Vec<NodeIdx>)>) {
+    let start = &data.pages[start.usize()].title;
+    let goal = &data.pages[goal.usize()].title;
+
+    let Some((cost, path)) = path else {
+        println!("No path found from {start} to {goal}");
+        return;
+    };
+
+    println!("Path found (cost {cost}, length {}):", path.len());
+
+    for page in path {
+        println!("{}", util::fmt_page(&data.pages[page.usize()]));
+    }
+}
+
+impl Cmd {
+    pub fn run(self, data: Data) -> io::Result<()> {
+        let normalizer = TitleNormalizer::new();
+
+        println!(">> Resolve article");
+        let start = util::resolve_title(&normalizer, &data, &self.start);
+        println!("Start: {}", data.pages[start.usize()].title);
+
+        println!(">> Search paths");
+        println!("> Preparing dijkstra");
+        let mut dijkstra = Dijkstra::new(&data.graph);
+        println!("> Running dijkstra");
+        dijkstra.run(
+            start,
+            |_| false,
+            |source, _edge, _target| !data.pages[source.usize()].redirect as u32,
+        );
+
+        println!(">> Find longest paths");
+        let mut costs = data
+            .graph
+            .nodes()
+            .map(|n| (dijkstra.cost(n), n))
+            .filter(|(c, _)| *c < u32::MAX) // Only reachable nodes please
+            .collect::<Vec<_>>();
+        costs.sort_unstable();
+
+        for (cost, goal) in costs.iter().rev().take(self.top) {
+            let path = dijkstra.path(*goal);
+            println!();
+            print_path(&data, start, *goal, Some((*cost, path)));
+        }
+
+        Ok(())
+    }
+}
diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs
deleted file mode 100644
index e15eb17..0000000
--- a/brood/src/commands/longest_shortest_path.rs
+++ /dev/null
@@ -1,173 +0,0 @@
-use std::collections::BinaryHeap;
-use std::fs::File;
-use std::io::{self, BufReader};
-use std::path::Path;
-
-use crate::data::adjacency_list::AdjacencyList;
-use crate::data::info::{LinkInfo, PageInfo};
-use crate::data::store;
-use crate::util;
-
-struct DijkstraPageInfo {
-    cost: u32,
-    /// Index of the previous page.
-    prev: u32,
-    redirect: bool,
-}
-
-impl DijkstraPageInfo {
-    fn from_page_info(info: PageInfo) -> Self {
-        Self {
-            cost: u32::MAX,
-            prev: u32::MAX,
-            redirect: info.redirect,
-        }
-    }
-}
-
-struct DijkstraLinkInfo {
-    cost: u32,
-}
-
-impl DijkstraLinkInfo {
-    fn from_link_info(info: LinkInfo) -> Self {
-        Self {
-            cost: 1,
-            // cost: 1000 + info.start,
-            // cost: 10000 + info.start,
-            // cost: 1000 + info.start / 10,
-        }
-    }
-}
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-struct Entry {
-    cost: u32,
-    page_idx: u32,
-}
-
-impl Entry {
-    pub fn new(cost: u32, page_idx: u32) -> Self {
-        Self { cost, page_idx }
-    }
-}
-
-// Manual implementation so the queue is a min-heap instead of a max-heap.
-impl Ord for Entry {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        other
-            .cost
-            .cmp(&self.cost)
-            .then_with(|| self.page_idx.cmp(&other.page_idx))
-    }
-}
-
-impl PartialOrd for Entry {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-/// Closely matches the dijkstra example in [std::collections::binary_heap].
-fn full_dijkstra(
-    data: AdjacencyList<PageInfo, LinkInfo>,
-    from: u32,
-) -> AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo> {
-    println!("> Prepare state");
-    let mut data = data
-        .change_page_data(DijkstraPageInfo::from_page_info)
-        .change_link_data(DijkstraLinkInfo::from_link_info);
-    let mut queue = BinaryHeap::new();
-    data.page_mut(from).data.cost = 0;
-    queue.push(Entry::new(0, from));
-
-    println!("> Run dijkstra");
-    while let Some(Entry { cost, page_idx }) = queue.pop() {
-        let page = data.page(page_idx);
-        if cost > page.data.cost {
-            // This queue entry is outdated
-            continue;
-        }
-
-        let redirect = page.data.redirect;
-        for link_idx in data.link_range(page_idx) {
-            let link = data.link(link_idx);
-
-            let next = Entry {
-                cost: cost + if redirect { 0 } else { link.data.cost },
-                page_idx: link.to,
-            };
-
-            let target_page = data.page_mut(link.to);
-            if next.cost < target_page.data.cost {
-                target_page.data.cost = next.cost;
-                target_page.data.prev = page_idx;
-                queue.push(next);
-            }
-        }
-    }
-
-    data
-}
-
-fn find_longest_shortest_path(
-    data: AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo>,
-    from: u32,
-) -> Option<Vec<u32>> {
-    let to = data
-        .pages
-        .iter()
-        .enumerate()
-        .filter(|(_, p)| p.data.cost != u32::MAX)
-        .max_by_key(|(_, p)| p.data.cost)?
-        .0 as u32;
-
-    let mut steps = vec![];
-    let mut at = to;
-    loop {
-        steps.push(at);
-        at = data.page(at).data.prev;
-        if at == u32::MAX {
-            break;
-        };
-    }
-    steps.reverse();
-    if steps.first() == Some(&from) {
-        Some(steps)
-    } else {
-        None
-    }
-}
-
-pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
-    println!(">> Import");
-    let mut databuf = BufReader::new(File::open(datafile)?);
-    let data = store::read_adjacency_list(&mut databuf)?;
-    let pages = data.pages.clone();
-
-    println!(">> Locate from and to");
-    let from_idx = util::resolve_redirects(&data, util::find_index_of_title(&pages, from));
-    println!("From: {:?}", data.page(from_idx).data.title);
-
-    println!(">> Find all shortest paths");
-    let data = full_dijkstra(data, from_idx);
-
-    println!(">> Find longest shortest path");
-    let path = find_longest_shortest_path(data, from_idx);
-
-    if let Some(path) = path {
-        println!("Path found:");
-        for page_idx in path {
-            let page = &pages[page_idx as usize];
-            if page.data.redirect {
-                println!(" v {:?}", page.data.title);
-            } else {
-                println!(" - {:?}", page.data.title);
-            }
-        }
-    } else {
-        println!("No path found");
-    }
-
-    Ok(())
-}
diff --git a/brood/src/main.rs b/brood/src/main.rs
index ba71e52..66b14df 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -14,8 +14,9 @@ enum Command {
     Ingest(commands::ingest::Cmd),
     Export(commands::export::Cmd),
     Show(commands::show::Cmd),
-    Path(commands::path::Cmd),
     Stats(commands::stats::Cmd),
+    Path(commands::path::Cmd),
+    LongestPath(commands::longest_path::Cmd),
 }
 
 #[derive(Debug, Parser)]
@@ -73,7 +74,8 @@ fn main() -> io::Result<()> {
         Command::Ingest(_) => unreachable!(),
         Command::Export(cmd) => cmd.run(data),
         Command::Show(cmd) => cmd.run(data),
-        Command::Path(cmd) => cmd.run(data),
         Command::Stats(cmd) => cmd.run(data),
+        Command::Path(cmd) => cmd.run(data),
+        Command::LongestPath(cmd) => cmd.run(data),
     }
 }