diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs
index 7e0f223..42fc13a 100644
--- a/brood/src/commands/ingest.rs
+++ b/brood/src/commands/ingest.rs
@@ -9,8 +9,8 @@ use serde::Deserialize;
 use thousands::Separable;
 
 use crate::{
-    data::{self, Link, Page},
-    graph::{Graph, NodeIdx},
+    data::{Data, Link, Page},
+    graph::NodeIdx,
     util::{Counter, TitleNormalizer},
 };
 
@@ -87,11 +87,9 @@ fn read_page_data(
     normalizer: &TitleNormalizer,
     title_lookup: &HashMap<String, (u32, u32)>,
     r: &mut BufReader<File>,
-) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
+) -> io::Result<Data> {
     let mut counter = Counter::new();
-    let mut pages = vec![];
-    let mut links = vec![];
-    let mut graph = Graph::new();
+    let mut data = Data::new();
 
     for (i, line) in r.lines().enumerate() {
         counter.tick();
@@ -106,8 +104,8 @@ fn read_page_data(
             continue;
         }
 
-        graph.add_node();
-        pages.push(Page {
+        data.graph.add_node();
+        data.pages.push(Page {
             id: page.id,
             title: page.title,
             length: page.length,
@@ -123,14 +121,14 @@ fn read_page_data(
 
         for (target, start, len, flags) in page_links {
             if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
-                graph.edges.push(NodeIdx(*brood_i));
-                links.push(Link { start, len, flags });
+                data.graph.edges.push(NodeIdx(*brood_i));
+                data.links.push(Link { start, len, flags });
             }
         }
     }
 
     counter.done();
-    Ok((pages, links, graph))
+    Ok(data)
 }
 
 /// Convert sift data to brood data.
@@ -141,7 +139,7 @@ pub struct Cmd {
 }
 
 impl Cmd {
-    pub fn run(self, data: &Path) -> io::Result<()> {
+    pub fn run(self, brood_data: &Path) -> io::Result<()> {
         let normalizer = TitleNormalizer::new();
 
         println!(">> First pass");
@@ -158,18 +156,24 @@ impl Cmd {
         sift_data.seek(io::SeekFrom::Start(0))?;
 
         println!("> Reading page data");
-        let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
-        assert_eq!(pages.len(), title_lookup.len());
+        let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
+        assert_eq!(data.pages.len(), title_lookup.len());
         drop(title_lookup); // Don't hoard memory
         drop(sift_data); // No longer needed
 
         println!("> Checking consistency");
-        graph.check_consistency();
+        data.graph.check_consistency();
 
         println!(">> Export");
-        println!("Pages: {:>13}", pages.len().separate_with_underscores());
-        println!("Links: {:>13}", links.len().separate_with_underscores());
-        data::write_to_file(data, &pages, &links, &graph)?;
+        println!(
+            "Pages: {:>13}",
+            data.pages.len().separate_with_underscores()
+        );
+        println!(
+            "Links: {:>13}",
+            data.links.len().separate_with_underscores()
+        );
+        data.write_to_file(brood_data)?;
 
         Ok(())
     }
diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs
index 7a5dcb9..882203c 100644
--- a/brood/src/commands/path.rs
+++ b/brood/src/commands/path.rs
@@ -2,7 +2,7 @@ use std::{io, path::Path};
 
 use crate::{
     algo::Dijkstra,
-    data,
+    data::Data,
     util::{self, TitleNormalizer},
 };
 
@@ -18,22 +18,22 @@ impl Cmd {
         let normalizer = TitleNormalizer::new();
 
         println!(">> Import");
-        let (pages, _links, graph) = data::read_from_file(data)?;
+        let data = Data::read_from_file(data)?;
 
         println!(">> Resolve articles");
-        let start = util::resolve_title(&normalizer, &pages, &graph, &self.start);
-        let goal = util::resolve_title(&normalizer, &pages, &graph, &self.goal);
-        println!("Start: {}", pages[start.usize()].title);
-        println!("Goal: {}", pages[goal.usize()].title);
+        let start = util::resolve_title(&normalizer, &data, &self.start);
+        let goal = util::resolve_title(&normalizer, &data, &self.goal);
+        println!("Start: {}", data.pages[start.usize()].title);
+        println!("Goal: {}", data.pages[goal.usize()].title);
 
         println!(">> Find path");
         println!("> Preparing dijkstra");
-        let mut dijkstra = Dijkstra::new(&graph);
+        let mut dijkstra = Dijkstra::new(&data.graph);
         println!("> Running dijkstra");
         dijkstra.run(
             start,
             |node| node == goal,
-            |source, _edge, _target| !pages[source.usize()].redirect as u32,
+            |source, _edge, _target| !data.pages[source.usize()].redirect as u32,
         );
 
         if dijkstra.cost(goal) == u32::MAX {
@@ -48,7 +48,7 @@ impl Cmd {
         println!();
         println!("Path found (cost {cost}, length {}):", path.len());
         for page in path {
-            let info = &pages[page.usize()];
+            let info = &data.pages[page.usize()];
             if info.redirect {
                 println!("v {:?}", info.title);
             } else {
diff --git a/brood/src/data.rs b/brood/src/data.rs
index 69fc362..20c95a6 100644
--- a/brood/src/data.rs
+++ b/brood/src/data.rs
@@ -109,69 +109,88 @@ fn read_link(r: &mut impl Read) -> io::Result<Link> {
     })
 }
 
-fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
-    assert!(pages.len() < u32::MAX as usize);
-    assert!(links.len() < u32::MAX as usize);
-    assert_eq!(pages.len(), graph.nodes.len());
-    assert_eq!(links.len(), graph.edges.len());
-    write_u32(w, pages.len() as u32)?;
-    write_u32(w, links.len() as u32)?;
-
-    for page in pages {
-        write_page(w, page)?;
-    }
-
-    for link in links {
-        write_link(w, link)?;
-    }
-
-    for node in &graph.nodes {
-        write_u32(w, node.0)?;
-    }
-
-    for edge in &graph.edges {
-        write_u32(w, edge.0)?;
-    }
-
-    Ok(())
+#[derive(Default)]
+pub struct Data {
+    pub pages: Vec<Page>,
+    pub links: Vec<Link>,
+    pub graph: Graph,
 }
 
-fn read(r: &mut impl Read) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
-    let n_pages = read_u32(r)?;
-    let n_links = read_u32(r)?;
-
-    let mut pages = Vec::with_capacity(n_pages as usize);
-    let mut links = Vec::with_capacity(n_links as usize);
-    let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize);
-
-    for _ in 0..n_pages {
-        pages.push(read_page(r)?);
+impl Data {
+    pub fn new() -> Self {
+        Self::default()
     }
 
-    for _ in 0..n_links {
-        links.push(read_link(r)?);
+    pub fn with_capacity(pages: usize, links: usize) -> Self {
+        Self {
+            pages: Vec::with_capacity(pages),
+            links: Vec::with_capacity(links),
+            graph: Graph::with_capacity(pages, links),
+        }
     }
 
-    for _ in 0..n_pages {
-        graph.nodes.push(EdgeIdx(read_u32(r)?));
+    fn write(&self, w: &mut impl Write) -> io::Result<()> {
+        assert!(self.pages.len() < u32::MAX as usize);
+        assert!(self.links.len() < u32::MAX as usize);
+        assert_eq!(self.pages.len(), self.graph.nodes.len());
+        assert_eq!(self.links.len(), self.graph.edges.len());
+        write_u32(w, self.pages.len() as u32)?;
+        write_u32(w, self.links.len() as u32)?;
+
+        for page in &self.pages {
+            write_page(w, page)?;
+        }
+
+        for link in &self.links {
+            write_link(w, link)?;
+        }
+
+        for node in &self.graph.nodes {
+            write_u32(w, node.0)?;
+        }
+
+        for edge in &self.graph.edges {
+            write_u32(w, edge.0)?;
+        }
+
+        Ok(())
     }
 
-    for _ in 0..n_links {
-        graph.edges.push(NodeIdx(read_u32(r)?));
+    fn read(r: &mut impl Read) -> io::Result<Self> {
+        let n_pages = read_u32(r)?;
+        let n_links = read_u32(r)?;
+
+        let mut result = Self::with_capacity(n_pages as usize, n_links as usize);
+
+        for _ in 0..n_pages {
+            result.pages.push(read_page(r)?);
+        }
+
+        for _ in 0..n_links {
+            result.links.push(read_link(r)?);
+        }
+
+        for _ in 0..n_pages {
+            result.graph.nodes.push(EdgeIdx(read_u32(r)?));
+        }
+
+        for _ in 0..n_links {
+            result.graph.edges.push(NodeIdx(read_u32(r)?));
+        }
+
+        assert_eq!(result.pages.len(), result.graph.nodes.len());
+        assert_eq!(result.links.len(), result.graph.edges.len());
+        result.graph.check_consistency();
+        Ok(result)
     }
 
-    assert_eq!(pages.len(), graph.nodes.len());
-    assert_eq!(links.len(), graph.edges.len());
-    graph.check_consistency();
-    Ok((pages, links, graph))
-}
-
-pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
-    let mut file = BufWriter::new(File::create(path)?);
-    write(&mut file, pages, links, graph)
-}
-
-pub fn read_from_file(path: &Path) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
-    let mut file = BufReader::new(File::open(path)?);
-    read(&mut file)
+    pub fn write_to_file(&self, path: &Path) -> io::Result<()> {
+        let mut file = BufWriter::new(File::create(path)?);
+        self.write(&mut file)
+    }
+
+    pub fn read_from_file(path: &Path) -> io::Result<Self> {
+        let mut file = BufReader::new(File::open(path)?);
+        Self::read(&mut file)
+    }
 }
diff --git a/brood/src/graph.rs b/brood/src/graph.rs
index 1cc25da..ed6f559 100644
--- a/brood/src/graph.rs
+++ b/brood/src/graph.rs
@@ -1,6 +1,6 @@
 use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct NodeIdx(pub u32);
 
 impl NodeIdx {
@@ -85,7 +85,7 @@ impl SubAssign for NodeIdx {
     }
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct EdgeIdx(pub u32);
 
 impl EdgeIdx {
@@ -242,6 +242,11 @@ impl Graph {
         Edges::new(self)
     }
 
+    pub fn edges_for(&self, node: NodeIdx) -> impl Iterator<Item = (EdgeIdx, NodeIdx)> + '_ {
+        self.edge_range(node)
+            .map(|i| (EdgeIdx::new(i), self.edges[i]))
+    }
+
     pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
         self.nodes
             .get(node.usize())
@@ -255,7 +260,7 @@ impl Graph {
         start.usize()..end.usize()
     }
 
-    pub fn edges_for(&self, node: NodeIdx) -> &[NodeIdx] {
+    pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] {
         &self.edges[self.edge_range(node)]
     }
 }
@@ -283,15 +288,15 @@ impl Iterator for Edges<'_> {
         if self.ei.usize() >= self.graph.edges.len() {
             return None;
         }
-        let to = self.graph.edges[self.ei.usize()];
+        let target = self.graph.edges[self.ei.usize()];
 
         // if would not be sufficient because some nodes may not have any edges.
         while self.ei >= self.graph.edge_start(self.ni + 1) {
             self.ni += 1;
         }
-        let from = self.ni;
+        let source = self.ni;
         self.ei += 1;
 
-        Some((from, to))
+        Some((source, target))
     }
 }
diff --git a/brood/src/util.rs b/brood/src/util.rs
index f594058..d908a42 100644
--- a/brood/src/util.rs
+++ b/brood/src/util.rs
@@ -4,7 +4,7 @@ use regex::Regex;
 use thousands::Separable;
 
 use crate::{
-    data::Page,
+    data::{Data, Page},
     graph::{Graph, NodeIdx},
 };
 
@@ -150,9 +150,9 @@ impl TitleNormalizer {
     }
 }
 
-pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -> NodeIdx {
+pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
     let normalized = normalizer.normalize(title);
-    pages
+    data.pages
         .iter()
         .enumerate()
         .find(|(_, p)| normalizer.normalize(&p.title) == normalized)
@@ -160,10 +160,10 @@ pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -
         .expect("invalid title")
 }
 
-pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> NodeIdx {
+pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx {
     loop {
-        if pages[page.usize()].redirect {
-            if let Some(target) = graph.edges_for(page).first() {
+        if data.pages[page.usize()].redirect {
+            if let Some(target) = data.graph.edge_slice(page).first() {
                 page = *target;
                 continue;
             }
@@ -173,11 +173,6 @@ pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> No
     }
 }
 
-pub fn resolve_title(
-    normalizer: &TitleNormalizer,
-    pages: &[Page],
-    graph: &Graph,
-    title: &str,
-) -> NodeIdx {
-    resolve_redirects(pages, graph, locate_title(normalizer, pages, title))
+pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
+    resolve_redirects(data, locate_title(normalizer, data, title))
 }