diff --git a/brood/src/algo.rs b/brood/src/algo.rs new file mode 100644 index 0000000..b6bf26a --- /dev/null +++ b/brood/src/algo.rs @@ -0,0 +1,77 @@ +use std::{cmp::Reverse, collections::BinaryHeap}; + +use crate::graph::{EdgeIdx, Graph, NodeIdx}; + +pub struct Dijkstra<'a> { + graph: &'a Graph, + cost: Vec, + pred: Vec, +} + +impl<'a> Dijkstra<'a> { + pub fn new(graph: &'a Graph) -> Self { + Self { + graph, + cost: vec![u32::MAX; graph.nodes.len()], + pred: vec![NodeIdx::NONE; graph.nodes.len()], + } + } + + pub fn run( + &mut self, + start: NodeIdx, + goal: impl Fn(NodeIdx) -> bool, + cost: impl Fn(NodeIdx, EdgeIdx, NodeIdx) -> u32, + ) { + self.cost[start.usize()] = 0; + let mut queue = BinaryHeap::new(); + queue.push((Reverse(0), start)); + + while let Some((Reverse(curr_cost), curr)) = queue.pop() { + if goal(curr) { + break; // We've found the shortest path to our target + } + + // These seem to never actually occur + // if curr_cost > self.cost[curr.usize()] { + // continue; // Outdated entry + // } + + for edge in self.graph.edge_range(curr).map(EdgeIdx::new) { + let next = self.graph.edges[edge.usize()]; + let next_cost = curr_cost + cost(curr, edge, next); + if next_cost < self.cost[next.usize()] { + self.cost[next.usize()] = next_cost; + self.pred[next.usize()] = curr; + queue.push((Reverse(next_cost), next)); + } + } + } + } + + #[inline] + pub fn cost(&self, node: NodeIdx) -> u32 { + self.cost[node.usize()] + } + + #[inline] + pub fn pred(&self, node: NodeIdx) -> NodeIdx { + self.pred[node.usize()] + } + + pub fn path(&self, goal: NodeIdx) -> Vec { + let mut path = vec![]; + let mut at = goal; + + loop { + path.push(at); + at = self.pred(at); + if at == NodeIdx::NONE { + break; + } + } + + path.reverse(); + path + } +} diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6da3050..d4b8155 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -3,5 +3,6 @@ pub mod list_links; pub mod list_pages; pub mod longest_shortest_path; pub 
mod path; +pub mod path2; pub mod philosophy_game; pub mod reexport; diff --git a/brood/src/commands/path2.rs b/brood/src/commands/path2.rs new file mode 100644 index 0000000..55c72ed --- /dev/null +++ b/brood/src/commands/path2.rs @@ -0,0 +1,77 @@ +use std::{ + fs::File, + io::{self, BufReader}, + path::Path, +}; + +use crate::{ + algo::Dijkstra, + data::{info::PageInfo, store}, + graph::{Graph, NodeIdx}, + util, +}; + +pub fn find_index_of_title(pages: &[PageInfo], title: &str) -> NodeIdx { + let title = util::normalize_link(title); + pages + .iter() + .enumerate() + .find(|(_, p)| util::normalize_link(&p.title) == title) + .map(|(i, _)| NodeIdx::new(i)) + .expect("invalid title") +} + +pub fn resolve_redirects(pages: &[PageInfo], graph: &Graph, mut page: NodeIdx) -> NodeIdx { + loop { + if pages[page.usize()].redirect { + if let Some(next) = graph.edges_for(page).first() { + page = *next; + continue; + } + } + + return page; + } +} + +pub fn path(datafile: &Path, start: &str, goal: &str) -> io::Result<()> { + println!(">> Import"); + let mut databuf = BufReader::new(File::open(datafile)?); + let (pages, _links, graph) = store::read_graph(&mut databuf)?; + + println!(">> Locate from and to"); + let start = resolve_redirects(&pages, &graph, find_index_of_title(&pages, start)); + let goal = resolve_redirects(&pages, &graph, find_index_of_title(&pages, goal)); + println!("Start: {:?}", pages[start.usize()].title); + println!("Goal: {:?}", pages[goal.usize()].title); + + println!(">> Find path"); + println!("> Preparing dijkstra"); + let mut dijkstra = Dijkstra::new(&graph); + println!("> Running dijkstra"); + dijkstra.run( + start, + |node| node == goal, + |source, _edge, _target| !pages[source.usize()].redirect as u32, + ); + + if dijkstra.cost(goal) == u32::MAX { + println!("No path found"); + return Ok(()); + } + + println!("> Collecting path"); + let path = dijkstra.path(goal); + let cost = dijkstra.cost(goal); + println!("Path found (cost {cost}, length {}):", 
/// Implements the conversions and arithmetic shared by the index newtypes:
/// `From<u32>`, `From<usize>`, and `+`, `+=`, `-`, `-=` with either another
/// index or a plain `u32` on the right-hand side.
///
/// The arithmetic uses the plain integer operators, so it overflows and
/// underflows exactly like `u32` does.
macro_rules! impl_index_ops {
    ($idx:ident) => {
        impl From<u32> for $idx {
            fn from(value: u32) -> Self {
                Self(value)
            }
        }

        impl From<usize> for $idx {
            fn from(value: usize) -> Self {
                Self::new(value)
            }
        }

        impl Add for $idx {
            type Output = Self;

            fn add(self, rhs: Self) -> Self::Output {
                Self(self.0 + rhs.0)
            }
        }

        impl AddAssign for $idx {
            fn add_assign(&mut self, rhs: Self) {
                self.0 += rhs.0;
            }
        }

        impl Sub for $idx {
            type Output = Self;

            fn sub(self, rhs: Self) -> Self::Output {
                Self(self.0 - rhs.0)
            }
        }

        impl SubAssign for $idx {
            fn sub_assign(&mut self, rhs: Self) {
                self.0 -= rhs.0;
            }
        }

        impl Add<u32> for $idx {
            type Output = Self;

            fn add(self, rhs: u32) -> Self::Output {
                Self(self.0 + rhs)
            }
        }

        impl AddAssign<u32> for $idx {
            fn add_assign(&mut self, rhs: u32) {
                self.0 += rhs;
            }
        }

        impl Sub<u32> for $idx {
            type Output = Self;

            fn sub(self, rhs: u32) -> Self::Output {
                Self(self.0 - rhs)
            }
        }

        impl SubAssign<u32> for $idx {
            fn sub_assign(&mut self, rhs: u32) {
                self.0 -= rhs;
            }
        }
    };
}

/// Position of a node in [`Graph::nodes`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct NodeIdx(pub u32);

impl NodeIdx {
    /// Reserved value `u32::MAX`. It is never a valid position in a graph
    /// accepted by [`Graph::check_consistency`].
    pub const NONE: Self = Self(u32::MAX);

    /// Wraps a `usize` position. The value is converted with an `as` cast, so
    /// anything that does not fit into a `u32` is truncated.
    #[inline]
    pub const fn new(value: usize) -> Self {
        Self(value as u32)
    }

    /// The position as `usize`, ready to be used for indexing.
    #[inline]
    pub const fn usize(self) -> usize {
        self.0 as usize
    }
}

impl_index_ops!(NodeIdx);

/// Position of an edge in [`Graph::edges`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct EdgeIdx(pub u32);

impl EdgeIdx {
    /// Reserved value `u32::MAX`. It is never a valid position in a graph
    /// accepted by [`Graph::check_consistency`].
    pub const NONE: Self = Self(u32::MAX);

    /// Wraps a `usize` position. The value is converted with an `as` cast, so
    /// anything that does not fit into a `u32` is truncated.
    #[inline]
    pub const fn new(value: usize) -> Self {
        Self(value as u32)
    }

    /// The position as `usize`, ready to be used for indexing.
    #[inline]
    pub const fn usize(self) -> usize {
        self.0 as usize
    }
}

impl_index_ops!(EdgeIdx);

/// Directed graph stored as two flat arrays.
///
/// The edges of node `n` are `edges[nodes[n]..nodes[n + 1]]`; for the last
/// node the range ends at `edges.len()`.
#[derive(Default)]
pub struct Graph {
    /// A node points to the first of its edges.
    ///
    /// A special case is that if the subsequent node points to the same edge,
    /// the current node has no edges.
    pub nodes: Vec<EdgeIdx>,

    /// An edge points to a target node.
    ///
    /// The source node is defined implicitly by the graph data structure.
    pub edges: Vec<NodeIdx>,
}

impl Graph {
    /// Creates a graph without nodes and edges.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an empty graph with room reserved for `nodes` nodes and
    /// `edges` edges.
    pub fn with_capacity(nodes: usize, edges: usize) -> Self {
        Self {
            nodes: Vec::with_capacity(nodes),
            edges: Vec::with_capacity(edges),
        }
    }

    /// Verifies the invariants the other methods rely on.
    ///
    /// # Panics
    ///
    /// Panics if
    /// * there are edges but no nodes,
    /// * there are `u32::MAX` or more nodes or edges,
    /// * the first node does not point to edge 0,
    /// * a node points past the end of the edge list,
    /// * the node pointers are not in ascending order, or
    /// * an edge targets a node that does not exist.
    pub fn check_consistency(&self) {
        if self.nodes.is_empty() {
            assert!(self.edges.is_empty(), "edges must belong to existing nodes");
            return;
        }

        assert!(self.nodes.len() < u32::MAX as usize, "too many nodes");
        assert!(self.edges.len() < u32::MAX as usize, "too many edges");

        assert_eq!(
            *self.nodes.first().unwrap(),
            EdgeIdx(0),
            "first node pointer must be 0"
        );

        for (ni, node) in self.nodes.iter().copied().enumerate() {
            // `==` is legal: a node without edges at the end of the list has
            // nothing to point at except the end of the edge list, which is
            // also what `edge_start` reports for the node after the last one.
            assert!(
                node.usize() <= self.edges.len(),
                "node pointers must in range"
            );

            if let Some(succ) = self.nodes.get(ni + 1) {
                assert!(node <= *succ, "node pointers must be well-ordered");
            }
        }

        for edge in &self.edges {
            assert!(
                edge.usize() < self.nodes.len(),
                "edge pointers must be in range"
            );
        }
    }

    /// Iterates over the indices of all nodes in ascending order.
    pub fn nodes(&self) -> impl Iterator<Item = NodeIdx> + '_ {
        (0..self.nodes.len()).map(NodeIdx::new)
    }

    /// Iterates over all edges as `(source, target)` pairs, in the order they
    /// are stored.
    pub fn edges(&self) -> impl Iterator<Item = (NodeIdx, NodeIdx)> + '_ {
        Edges::new(self)
    }

    /// Index of the first edge of `node`.
    ///
    /// For a `node` past the end of the node list this is the number of
    /// edges, which makes it usable as the end of the last node's edge range.
    pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
        self.nodes
            .get(node.usize())
            .copied()
            .unwrap_or_else(|| self.edges.len().into())
    }

    /// Range of positions in [`Graph::edges`] that belong to `node`.
    ///
    /// # Panics
    ///
    /// Panics if `node` does not exist.
    pub fn edge_range(&self, node: NodeIdx) -> Range<usize> {
        let start = self.nodes[node.usize()];
        let end = self.edge_start(node + 1);
        start.usize()..end.usize()
    }

    /// Targets of all edges leaving `node`.
    ///
    /// # Panics
    ///
    /// Panics if `node` does not exist.
    pub fn edges_for(&self, node: NodeIdx) -> &[NodeIdx] {
        &self.edges[self.edge_range(node)]
    }
}

/// Iterator behind [`Graph::edges`].
struct Edges<'a> {
    graph: &'a Graph,
    /// Source node of the edge at `ei`, caught up lazily in `next`.
    ni: NodeIdx,
    /// Edge that is yielded next.
    ei: EdgeIdx,
}

impl<'a> Edges<'a> {
    fn new(graph: &'a Graph) -> Self {
        Self {
            graph,
            ni: NodeIdx(0),
            ei: EdgeIdx(0),
        }
    }
}

impl Iterator for Edges<'_> {
    type Item = (NodeIdx, NodeIdx);

    fn next(&mut self) -> Option<Self::Item> {
        if self.ei.usize() >= self.graph.edges.len() {
            return None;
        }
        let to = self.graph.edges[self.ei.usize()];

        // An `if` would not be sufficient because some nodes may not have any
        // edges, in which case several nodes have to be skipped at once.
        while self.ei >= self.graph.edge_start(self.ni + 1) {
            self.ni += 1;
        }
        let from = self.ni;

        self.ei += 1;
        Some((from, to))
    }
}
page: String, }, + Test, } #[derive(Debug, Parser)] @@ -74,6 +90,13 @@ fn main() -> io::Result<()> { commands::path::path(&args.datafile, &from, &to) } } + Command::Path2 { from, to, flip } => { + if flip { + commands::path2::path(&args.datafile, &to, &from) + } else { + commands::path2::path(&args.datafile, &from, &to) + } + } Command::LongestShortestPath { from } => { commands::longest_shortest_path::run(&args.datafile, &from) } @@ -82,5 +105,21 @@ fn main() -> io::Result<()> { } Command::ListPages => commands::list_pages::run(&args.datafile), Command::ListLinks { page } => commands::list_links::run(&args.datafile, &page), + Command::Test => test(&args.datafile), } } + +fn test(datafile: &Path) -> io::Result<()> { + let a = Instant::now(); + // println!(">> Import adjacency list"); + // let mut databuf = BufReader::new(File::open(datafile)?); + // let adjlist = store::read_adjacency_list(&mut databuf)?; + println!(">> Import graph"); + let mut databuf = BufReader::new(File::open(datafile)?); + let (pages, links, graph) = store::read_graph(&mut databuf)?; + let b = Instant::now(); + + println!("{:?}", b.duration_since(a)); + + Ok(()) +}