diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index d38b1c6..cda10d0 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -7,9 +7,7 @@ use std::u32; use rustc_hash::FxHashMap; use serde::Deserialize; -use crate::data::adjacency_list::{ - AdjacencyList, Link, LinkIdx, Page, PageIdx, SENTINEL_PAGE_MARKER, -}; +use crate::data::adjacency_list::{AdjacencyList, Page}; use crate::data::info::{LinkInfo, PageInfo}; use crate::data::store; use crate::util; @@ -74,29 +72,20 @@ fn first_stage() -> io::Result<(AdjacencyList, Titles)> { for (i, line) in stdin.lines().enumerate() { let json_page = serde_json::from_str::(&line?).unwrap(); - result.pages.push(Page { - start: LinkIdx(result.links.len() as u32), - data: PageInfo { - id: json_page.id, - length: json_page.length, - redirect: json_page.redirect.is_some(), - title: json_page.title, - }, + result.push_page(PageInfo { + id: json_page.id, + length: json_page.length, + redirect: json_page.redirect.is_some(), + title: json_page.title, }); if let Some(to) = json_page.redirect { let to = titles.insert(util::normalize_link(&to)); - result.links.push(Link { - to: PageIdx(to), - data: LinkInfo::default(), - }); + result.push_link(to, LinkInfo::default()); } else { for (to, start, len, flags) in json_page.links { let to = titles.insert(util::normalize_link(&to)); - result.links.push(Link { - to: PageIdx(to), - data: LinkInfo { start, len, flags }, - }); + result.push_link(to, LinkInfo { start, len, flags }); } } @@ -110,16 +99,6 @@ fn first_stage() -> io::Result<(AdjacencyList, Titles)> { eprintln!("Titles: {}", titles.titles.len()); eprintln!("Title map entries: {}", titles.map.len()); - result.pages.push(Page { - start: LinkIdx(result.links.len() as u32), - data: PageInfo { - id: u32::MAX, - length: 0, - redirect: false, - title: SENTINEL_PAGE_MARKER.to_string(), - }, - }); - Ok((result, titles)) } @@ -151,26 +130,19 @@ fn second_stage( let pages_map = initialize_pages_map(&first_stage.pages); let mut result = AdjacencyList::default(); - for page_idx in 0..first_stage.pages.len() - 1 { - let mut page = first_stage.pages[page_idx].clone(); - let start_link_idx = page.start; - let end_link_idx = first_stage.pages[page_idx + 1].start; + for (page_idx, page) in first_stage.pages() { + result.push_page(page.data.clone()); - page.start.0 = result.links.len() as u32; - result.pages.push(page); - - for link_idx in start_link_idx.0..end_link_idx.0 { - let mut link = first_stage.links[link_idx as usize]; - let title = util::normalize_link(titles.get(link.to.0)); + for (_, link) in first_stage.links(page_idx) { + let title = util::normalize_link(titles.get(link.to)); if let Some(to) = pages_map.get(&title) { // The link points to an existing article, we should keep it - link.to.0 = *to; - result.links.push(link); + result.push_link(*to, link.data); } } if (page_idx + 1) % 100_000 == 0 { - eprintln!("{} pages processed", page_idx + 1) + eprintln!("{} pages imported", page_idx + 1) } } @@ -178,10 +150,6 @@ fn second_stage( eprintln!("Links: {}", result.links.len()); eprintln!("Page map entries: {}", pages_map.len()); - let mut sentinel = first_stage.pages.last().unwrap().clone(); - sentinel.start.0 = result.links.len() as u32; - result.pages.push(sentinel); - result } diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs index d4e0479..5f659ea 100644 --- a/brood/src/commands/list_pages.rs +++ b/brood/src/commands/list_pages.rs @@ -2,15 +2,13 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::adjacency_list::PageIdx; use crate::data::store; pub fn run(datafile: &Path) -> io::Result<()> { let mut databuf = BufReader::new(File::open(datafile)?); let data = store::read_adjacency_list(&mut databuf)?; - for (page_idx, page) in data.pages.iter().enumerate() { - let page_idx = PageIdx(page_idx as u32); + for (page_idx, page) in data.pages() { if page.data.redirect { for link_idx in data.link_range(page_idx) { let target_page = data.page(data.link(link_idx).to); diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs index c3de973..e15eb17 100644 --- a/brood/src/commands/longest_shortest_path.rs +++ b/brood/src/commands/longest_shortest_path.rs @@ -3,14 +3,15 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::adjacency_list::{AdjacencyList, PageIdx}; +use crate::data::adjacency_list::AdjacencyList; use crate::data::info::{LinkInfo, PageInfo}; use crate::data::store; use crate::util; struct DijkstraPageInfo { cost: u32, - prev: PageIdx, + /// Index of the previous page. + prev: u32, redirect: bool, } @@ -18,7 +19,7 @@ impl DijkstraPageInfo { fn from_page_info(info: PageInfo) -> Self { Self { cost: u32::MAX, - prev: PageIdx::MAX, + prev: u32::MAX, redirect: info.redirect, } } @@ -42,12 +43,12 @@ impl DijkstraLinkInfo { #[derive(Clone, Copy, PartialEq, Eq)] struct Entry { cost: u32, - idx: PageIdx, + page_idx: u32, } impl Entry { - pub fn new(cost: u32, idx: PageIdx) -> Self { - Self { cost, idx } + pub fn new(cost: u32, page_idx: u32) -> Self { + Self { cost, page_idx } } } @@ -57,7 +58,7 @@ impl Ord for Entry { other .cost .cmp(&self.cost) - .then_with(|| self.idx.cmp(&other.idx)) + .then_with(|| self.page_idx.cmp(&other.page_idx)) } } @@ -70,22 +71,18 @@ impl PartialOrd for Entry { /// Closely matches the dijkstra example in [std::collections::binary_heap]. fn full_dijkstra( data: AdjacencyList, - from_idx: PageIdx, + from: u32, ) -> AdjacencyList { println!("> Prepare state"); let mut data = data .change_page_data(DijkstraPageInfo::from_page_info) .change_link_data(DijkstraLinkInfo::from_link_info); let mut queue = BinaryHeap::new(); - data.page_mut(from_idx).data.cost = 0; - queue.push(Entry::new(0, from_idx)); + data.page_mut(from).data.cost = 0; + queue.push(Entry::new(0, from)); println!("> Run dijkstra"); - while let Some(Entry { - cost, - idx: page_idx, - }) = queue.pop() - { + while let Some(Entry { cost, page_idx }) = queue.pop() { let page = data.page(page_idx); if cost > page.data.cost { // This queue entry is outdated @@ -98,7 +95,7 @@ fn full_dijkstra( let next = Entry { cost: cost + if redirect { 0 } else { link.data.cost }, - idx: link.to, + page_idx: link.to, }; let target_page = data.page_mut(link.to); @@ -115,23 +112,22 @@ fn full_dijkstra( fn find_longest_shortest_path( data: AdjacencyList, - from: PageIdx, -) -> Option> { - let to = PageIdx( - data.pages - .iter() - .enumerate() - .filter(|(_, p)| p.data.cost != u32::MAX) - .max_by_key(|(_, p)| p.data.cost)? - .0 as u32, - ); + from: u32, +) -> Option> { + let to = data + .pages + .iter() + .enumerate() + .filter(|(_, p)| p.data.cost != u32::MAX) + .max_by_key(|(_, p)| p.data.cost)? + .0 as u32; let mut steps = vec![]; let mut at = to; loop { steps.push(at); at = data.page(at).data.prev; - if at == PageIdx::MAX { + if at == u32::MAX { break; }; } @@ -162,7 +158,7 @@ pub fn run(datafile: &Path, from: &str) -> io::Result<()> { if let Some(path) = path { println!("Path found:"); for page_idx in path { - let page = &pages[page_idx.0 as usize]; + let page = &pages[page_idx as usize]; if page.data.redirect { println!(" v {:?}", page.data.title); } else { diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 3d8d50d..82079d2 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -3,14 +3,14 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::adjacency_list::{AdjacencyList, PageIdx}; +use crate::data::adjacency_list::AdjacencyList; use crate::data::info::{LinkInfo, PageInfo}; use crate::data::store; use crate::util; struct DijkstraPageInfo { cost: u32, - prev: PageIdx, + prev: u32, redirect: bool, } @@ -18,7 +18,7 @@ impl DijkstraPageInfo { fn from_page_info(info: PageInfo) -> Self { Self { cost: u32::MAX, - prev: PageIdx::MAX, + prev: u32::MAX, redirect: info.redirect, } } @@ -42,12 +42,12 @@ impl DijkstraLinkInfo { #[derive(Clone, Copy, PartialEq, Eq)] struct Entry { cost: u32, - idx: PageIdx, + page_idx: u32, } impl Entry { - pub fn new(cost: u32, idx: PageIdx) -> Self { - Self { cost, idx } + pub fn new(cost: u32, page_idx: u32) -> Self { + Self { cost, page_idx } } } @@ -57,7 +57,7 @@ impl Ord for Entry { other .cost .cmp(&self.cost) - .then_with(|| self.idx.cmp(&other.idx)) + .then_with(|| self.page_idx.cmp(&other.page_idx)) } } @@ -68,11 +68,7 @@ impl PartialOrd for Entry { } /// Closely matches the dijkstra example in [std::collections::binary_heap]. -fn dijkstra( - data: AdjacencyList, - from: PageIdx, - to: PageIdx, -) -> Option> { +fn dijkstra(data: AdjacencyList, from: u32, to: u32) -> Option> { println!("> Prepare state"); let mut data = data .change_page_data(DijkstraPageInfo::from_page_info) @@ -82,11 +78,7 @@ fn dijkstra( queue.push(Entry::new(0, from)); println!("> Run dijkstra"); - while let Some(Entry { - cost, - idx: page_idx, - }) = queue.pop() - { + while let Some(Entry { cost, page_idx }) = queue.pop() { if page_idx == to { // We've found the shortest path to our target break; @@ -104,7 +96,7 @@ fn dijkstra( let next = Entry { cost: cost + if redirect { 0 } else { link.data.cost }, - idx: link.to, + page_idx: link.to, }; let target_page = data.page_mut(link.to); @@ -122,7 +114,7 @@ fn dijkstra( loop { steps.push(at); at = data.page(at).data.prev; - if at == PageIdx::MAX { + if at == u32::MAX { break; }; } @@ -152,7 +144,7 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { if let Some(path) = path { println!("Path found:"); for page_idx in path { - let page = &pages[page_idx.0 as usize]; + let page = &pages[page_idx as usize]; if page.data.redirect { println!(" v {:?}", page.data.title); } else { diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs index ca45843..04a1124 100644 --- a/brood/src/data/adjacency_list.rs +++ b/brood/src/data/adjacency_list.rs @@ -1,20 +1,11 @@ +use std::ops::Range; + use super::info::{LinkInfo, PageInfo}; -pub const SENTINEL_PAGE_MARKER: &str = "Q2AKO3OYzyitmCJURghJ"; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct PageIdx(pub u32); - -impl PageIdx { - pub const MAX: PageIdx = PageIdx(u32::MAX); -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct LinkIdx(pub u32); - #[derive(Debug, Clone, Copy)] pub struct Page

{ - pub start: LinkIdx, + /// Index of the first link belonging to this page. + pub start: u32, pub data: P, } @@ -29,7 +20,8 @@ impl

Page

{ #[derive(Debug, Clone, Copy)] pub struct Link { - pub to: PageIdx, + /// Index of the page this link points to. + pub to: u32, pub data: L, } @@ -57,40 +49,57 @@ impl Default for AdjacencyList { } impl AdjacencyList { - pub fn page(&self, idx: PageIdx) -> &Page

{ - &self.pages[idx.0 as usize] + pub fn push_page(&mut self, data: P) { + self.pages.push(Page { + start: self.links.len() as u32, + data, + }); } - pub fn page_mut(&mut self, idx: PageIdx) -> &mut Page

{ - &mut self.pages[idx.0 as usize] + pub fn push_link(&mut self, to: u32, data: L) { + self.links.push(Link { to, data }) } - pub fn pages_range(&self) -> impl DoubleEndedIterator { - (0..self.pages.len() as u32 - 1).map(PageIdx) + pub fn page(&self, page_idx: u32) -> &Page

{ + &self.pages[page_idx as usize] } - pub fn link_range(&self, idx: PageIdx) -> impl DoubleEndedIterator { - let start_idx = self.page(idx).start; - let end_idx = self.page(PageIdx(idx.0 + 1)).start; - (start_idx.0..end_idx.0).map(LinkIdx) + pub fn page_mut(&mut self, page_idx: u32) -> &mut Page

{ + &mut self.pages[page_idx as usize] } - pub fn link_redirect(&self, idx: PageIdx) -> Option { - let start_idx = self.page(idx).start; - let end_idx = self.page(PageIdx(idx.0 + 1)).start; - if start_idx == end_idx { + pub fn pages(&self) -> impl Iterator)> { + self.pages.iter().enumerate().map(|(i, p)| (i as u32, p)) + } + + pub fn link(&self, link_idx: u32) -> &Link { + &self.links[link_idx as usize] + } + + pub fn link_mut(&mut self, link_idx: u32) -> &mut Link { + &mut self.links[link_idx as usize] + } + + pub fn link_range(&self, page_idx: u32) -> Range { + let start_idx = self.pages[page_idx as usize].start; + let end_idx = match self.pages.get(page_idx as usize + 1) { + Some(page) => page.start, + None => self.links.len() as u32, + }; + start_idx..end_idx + } + + pub fn link_redirect(&self, page_idx: u32) -> Option { + let range = self.link_range(page_idx); + if range.is_empty() { None } else { - Some(start_idx) + Some(range.start) } } - pub fn link(&self, idx: LinkIdx) -> &Link { - &self.links[idx.0 as usize] - } - - pub fn link_mut(&mut self, idx: LinkIdx) -> &mut Link { - &mut self.links[idx.0 as usize] + pub fn links(&self, page_idx: u32) -> impl Iterator)> { + self.link_range(page_idx).map(|i| (i, self.link(i))) } pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { @@ -122,14 +131,6 @@ impl AdjacencyList { impl AdjacencyList { pub fn check_consistency(&self) { - // Check that we have a sentinel page - let sentinel = self.pages.last().expect("no sentinel page"); - assert!(sentinel.data.id == u32::MAX, "unmarked sentinel page"); - assert!( - sentinel.data.title.contains(SENTINEL_PAGE_MARKER), - "unmarked sentinel page" - ); - // Check that all types are large enough assert!(self.pages.len() < u32::MAX as usize, "too many pages"); assert!(self.links.len() < u32::MAX as usize, "too many links"); @@ -142,18 +143,17 @@ impl AdjacencyList { // Check that all links contain valid indices. Links must not link to // the sentinel page. - let range = 0..self.pages.len() as u32 - 1; + let range = 0..self.pages.len() as u32; for link in &self.links { - assert!(range.contains(&link.to.0), "invalid link"); + assert!(range.contains(&link.to), "invalid link"); } // Check that all redirect pages have at most one link - for page_idx in (0..self.pages.len() as u32 - 1).map(PageIdx) { - let page = self.page(page_idx); + for (page_idx, page) in self.pages.iter().enumerate() { if page.data.redirect { - let mut range = self.link_range(page_idx); - range.next(); // 0 or 1 links allowed - assert!(range.next().is_none(), "too many redirect links"); + let range = self.link_range(page_idx as u32); + let amount = range.end - range.start; + assert!(amount <= 1, "too many redirect links"); } } } diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs index 72ac044..afba1a3 100644 --- a/brood/src/data/store.rs +++ b/brood/src/data/store.rs @@ -1,7 +1,7 @@ use std::io::{self, Read, Write}; use super::{ - adjacency_list::{AdjacencyList, Link, LinkIdx, Page, PageIdx}, + adjacency_list::{AdjacencyList, Link, Page}, info::{LinkInfo, PageInfo}, }; @@ -50,7 +50,7 @@ fn read_str(from: &mut R) -> io::Result { } fn write_page(page: &Page, to: &mut W) -> io::Result<()> { - write_u32(page.start.0, to)?; + write_u32(page.start, to)?; write_u32(page.data.id, to)?; write_u32(page.data.length, to)?; write_u8(if page.data.redirect { 1 } else { 0 }, to)?; @@ -60,14 +60,14 @@ fn write_page(page: &Page, to: &mut W) -> io::Result<()> { } pub fn read_page(from: &mut R) -> io::Result> { - let start = LinkIdx(read_u32(from)?); + let start_link_idx = read_u32(from)?; let id = read_u32(from)?; let length = read_u32(from)?; let redirect = read_u8(from)? != 0; let title = read_str(from)?; Ok(Page { - start, + start: start_link_idx, data: PageInfo { id, length, @@ -78,7 +78,7 @@ pub fn read_page(from: &mut R) -> io::Result> { } fn write_link(link: &Link, to: &mut W) -> io::Result<()> { - write_u32(link.to.0, to)?; + write_u32(link.to, to)?; write_u32(link.data.start, to)?; write_u32(link.data.len, to)?; write_u8(link.data.flags, to)?; @@ -87,13 +87,13 @@ fn write_link(link: &Link, to: &mut W) -> io::Result<()> { } fn read_link(from: &mut R) -> io::Result> { - let to = PageIdx(read_u32(from)?); + let to_page_idx = read_u32(from)?; let start = read_u32(from)?; let len = read_u32(from)?; let flags = read_u8(from)?; Ok(Link { - to, + to: to_page_idx, data: LinkInfo { start, len, flags }, }) } diff --git a/brood/src/util.rs b/brood/src/util.rs index c43cfae..e1a64ff 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,5 +1,5 @@ use crate::data::{ - adjacency_list::{AdjacencyList, Page, PageIdx}, + adjacency_list::{AdjacencyList, Page}, info::{LinkInfo, PageInfo}, }; @@ -15,21 +15,17 @@ pub fn normalize_link(link: &str) -> String { .collect::() } -pub fn find_index_of_title(pages: &[Page], title: &str) -> PageIdx { +pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { let title = normalize_link(title); - let idx = pages + pages .iter() .enumerate() .find(|(_, p)| normalize_link(&p.data.title) == title) .map(|(i, _)| i) - .expect("invalid title") as u32; - PageIdx(idx) + .expect("invalid title") as u32 } -pub fn resolve_redirects( - data: &AdjacencyList, - mut page_idx: PageIdx, -) -> PageIdx { +pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: u32) -> u32 { loop { if data.page(page_idx).data.redirect { if let Some(link_idx) = data.link_redirect(page_idx) {