From 7a2372fedde2bccd42430c082e4c51ec8d370f66 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 25 Aug 2024 21:40:11 +0200 Subject: [PATCH] Refactor data representation and storage Mostly moving around code --- brood/src/commands/ingest.rs | 35 ++- brood/src/commands/list_pages.rs | 8 +- brood/src/commands/longest_shortest_path.rs | 61 ++-- brood/src/commands/path.rs | 52 ++-- brood/src/commands/reexport.rs | 6 +- brood/src/data.rs | 306 +------------------- brood/src/data/adjacency_list.rs | 152 ++++++++++ brood/src/data/info.rs | 24 ++ brood/src/data/store.rs | 134 +++++++++ brood/src/util.rs | 17 +- 10 files changed, 416 insertions(+), 379 deletions(-) create mode 100644 brood/src/data/adjacency_list.rs create mode 100644 brood/src/data/info.rs create mode 100644 brood/src/data/store.rs diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 4fc182d..d38b1c6 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -2,11 +2,16 @@ use std::collections::hash_map::Entry; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter}; use std::path::Path; +use std::u32; use rustc_hash::FxHashMap; use serde::Deserialize; -use crate::data::{AdjacencyList, Link, LinkInfo, Page, PageInfo}; +use crate::data::adjacency_list::{ + AdjacencyList, Link, LinkIdx, Page, PageIdx, SENTINEL_PAGE_MARKER, +}; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; use crate::util; #[derive(Deserialize)] @@ -70,7 +75,7 @@ fn first_stage() -> io::Result<(AdjacencyList, Titles)> { let json_page = serde_json::from_str::(&line?).unwrap(); result.pages.push(Page { - link_idx: result.links.len() as u32, + start: LinkIdx(result.links.len() as u32), data: PageInfo { id: json_page.id, length: json_page.length, @@ -82,14 +87,14 @@ fn first_stage() -> io::Result<(AdjacencyList, Titles)> { if let Some(to) = json_page.redirect { let to = titles.insert(util::normalize_link(&to)); result.links.push(Link { - to, + to: PageIdx(to), data: LinkInfo::default(), }); } else { for (to, start, len, flags) in json_page.links { let to = titles.insert(util::normalize_link(&to)); result.links.push(Link { - to, + to: PageIdx(to), data: LinkInfo { start, len, flags }, }); } @@ -106,12 +111,12 @@ fn first_stage() -> io::Result<(AdjacencyList, Titles)> { eprintln!("Title map entries: {}", titles.map.len()); result.pages.push(Page { - link_idx: result.links.len() as u32, + start: LinkIdx(result.links.len() as u32), data: PageInfo { - id: 0, + id: u32::MAX, length: 0, redirect: false, - title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(), + title: SENTINEL_PAGE_MARKER.to_string(), }, }); @@ -148,18 +153,18 @@ fn second_stage( for page_idx in 0..first_stage.pages.len() - 1 { let mut page = first_stage.pages[page_idx].clone(); - let start_link_idx = page.link_idx; - let end_link_idx = first_stage.pages[page_idx + 1].link_idx; + let start_link_idx = page.start; + let end_link_idx = first_stage.pages[page_idx + 1].start; - page.link_idx = result.links.len() as u32; + page.start.0 = result.links.len() as u32; result.pages.push(page); - for link_idx in start_link_idx..end_link_idx { + for link_idx in start_link_idx.0..end_link_idx.0 { let mut link = first_stage.links[link_idx as usize]; - let title = util::normalize_link(titles.get(link.to)); + let title = util::normalize_link(titles.get(link.to.0)); if let Some(to) = pages_map.get(&title) { // The link points to an existing article, we should keep it - link.to = *to; + link.to.0 = *to; result.links.push(link); } } @@ -174,7 +179,7 @@ fn second_stage( eprintln!("Page map entries: {}", pages_map.len()); let mut sentinel = first_stage.pages.last().unwrap().clone(); - sentinel.link_idx = result.links.len() as u32; + sentinel.start.0 = result.links.len() as u32; result.pages.push(sentinel); result @@ -192,7 +197,7 @@ pub fn ingest(datafile: &Path) -> io::Result<()> { eprintln!(">> Export"); let mut datafile = BufWriter::new(File::create(datafile)?); - data.write(&mut datafile)?; + store::write_adjacency_list(&data, &mut datafile)?; Ok(()) } diff --git a/brood/src/commands/list_pages.rs b/brood/src/commands/list_pages.rs index e80b6ee..d4e0479 100644 --- a/brood/src/commands/list_pages.rs +++ b/brood/src/commands/list_pages.rs @@ -2,15 +2,17 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::AdjacencyList; +use crate::data::adjacency_list::PageIdx; +use crate::data::store; pub fn run(datafile: &Path) -> io::Result<()> { let mut databuf = BufReader::new(File::open(datafile)?); - let data = AdjacencyList::read(&mut databuf)?; + let data = store::read_adjacency_list(&mut databuf)?; for (page_idx, page) in data.pages.iter().enumerate() { + let page_idx = PageIdx(page_idx as u32); if page.data.redirect { - for link_idx in data.link_range(page_idx as u32) { + for link_idx in data.link_range(page_idx) { let target_page = data.page(data.link(link_idx).to); println!("{:?} -> {:?}", page.data.title, target_page.data.title); } diff --git a/brood/src/commands/longest_shortest_path.rs b/brood/src/commands/longest_shortest_path.rs index 754c405..1899529 100644 --- a/brood/src/commands/longest_shortest_path.rs +++ b/brood/src/commands/longest_shortest_path.rs @@ -3,12 +3,14 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::{AdjacencyList, LinkInfo, PageInfo}; +use crate::data::adjacency_list::{AdjacencyList, PageIdx}; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; use crate::util; struct DijkstraPageInfo { cost: u32, - prev_page_idx: u32, + prev: PageIdx, redirect: bool, } @@ -16,7 +18,7 @@ impl DijkstraPageInfo { fn from_page_info(info: PageInfo) -> Self { Self { cost: u32::MAX, - prev_page_idx: u32::MAX, + prev: PageIdx(u32::MAX), redirect: info.redirect, } } @@ -40,12 +42,12 @@ impl DijkstraLinkInfo { #[derive(Clone, Copy, PartialEq, Eq)] struct Entry { cost: u32, - page_idx: u32, + idx: PageIdx, } impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } + pub fn new(cost: u32, idx: PageIdx) -> Self { + Self { cost, idx } } } @@ -55,7 +57,7 @@ impl Ord for Entry { other .cost .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) + .then_with(|| self.idx.cmp(&other.idx)) } } @@ -68,7 +70,7 @@ impl PartialOrd for Entry { /// Closely matches the dijkstra example in [std::collections::binary_heap]. fn full_dijkstra( data: AdjacencyList, - from_idx: u32, + from_idx: PageIdx, ) -> AdjacencyList { println!("> Prepare state"); let mut data = data @@ -79,7 +81,11 @@ fn full_dijkstra( queue.push(Entry::new(0, from_idx)); println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { + while let Some(Entry { + cost, + idx: page_idx, + }) = queue.pop() + { let page = data.page(page_idx); if cost > page.data.cost { // This queue entry is outdated @@ -92,13 +98,13 @@ fn full_dijkstra( let next = Entry { cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, + idx: link.to, }; let target_page = data.page_mut(link.to); if next.cost < target_page.data.cost { target_page.data.cost = next.cost; - target_page.data.prev_page_idx = page_idx; + target_page.data.prev = page_idx; queue.push(next); } } @@ -109,27 +115,28 @@ fn full_dijkstra( fn find_longest_shortest_path( data: AdjacencyList, - from_idx: u32, -) -> Option> { - let to_idx = data - .pages - .iter() - .enumerate() - .filter(|(_, p)| p.data.cost != u32::MAX) - .max_by_key(|(_, p)| p.data.cost)? - .0 as u32; + from: PageIdx, +) -> Option> { + let to = PageIdx( + data.pages + .iter() + .enumerate() + .filter(|(_, p)| p.data.cost != u32::MAX) + .max_by_key(|(_, p)| p.data.cost)? + .0 as u32, + ); let mut steps = vec![]; - let mut at_idx = to_idx; + let mut at = to; loop { - steps.push(at_idx); - at_idx = data.page(at_idx).data.prev_page_idx; - if at_idx == u32::MAX { + steps.push(at); + at = data.page(at).data.prev; + if at == PageIdx(u32::MAX) { break; }; } steps.reverse(); - if steps.first() == Some(&from_idx) { + if steps.first() == Some(&from) { Some(steps) } else { None @@ -139,7 +146,7 @@ fn find_longest_shortest_path( pub fn run(datafile: &Path, from: &str) -> io::Result<()> { println!(">> Import"); let mut databuf = BufReader::new(File::open(datafile)?); - let data = AdjacencyList::read(&mut databuf)?; + let data = store::read_adjacency_list(&mut databuf)?; let pages = data.pages.clone(); println!(">> Locate from and to"); @@ -155,7 +162,7 @@ pub fn run(datafile: &Path, from: &str) -> io::Result<()> { if let Some(path) = path { println!("Path found:"); for page_idx in path { - let page = &pages[page_idx as usize]; + let page = &pages[page_idx.0 as usize]; if page.data.redirect { println!(" v {:?}", page.data.title); } else { diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs index 7f34b4f..3ecb411 100644 --- a/brood/src/commands/path.rs +++ b/brood/src/commands/path.rs @@ -3,12 +3,14 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; -use crate::data::{AdjacencyList, LinkInfo, PageInfo}; +use crate::data::adjacency_list::{AdjacencyList, PageIdx}; +use crate::data::info::{LinkInfo, PageInfo}; +use crate::data::store; use crate::util; struct DijkstraPageInfo { cost: u32, - prev_page_idx: u32, + prev: PageIdx, redirect: bool, } @@ -16,7 +18,7 @@ impl DijkstraPageInfo { fn from_page_info(info: PageInfo) -> Self { Self { cost: u32::MAX, - prev_page_idx: u32::MAX, + prev: PageIdx(u32::MAX), redirect: info.redirect, } } @@ -40,12 +42,12 @@ impl DijkstraLinkInfo { #[derive(Clone, Copy, PartialEq, Eq)] struct Entry { cost: u32, - page_idx: u32, + idx: PageIdx, } impl Entry { - pub fn new(cost: u32, page_idx: u32) -> Self { - Self { cost, page_idx } + pub fn new(cost: u32, idx: PageIdx) -> Self { + Self { cost, idx } } } @@ -55,7 +57,7 @@ impl Ord for Entry { other .cost .cmp(&self.cost) - .then_with(|| self.page_idx.cmp(&other.page_idx)) + .then_with(|| self.idx.cmp(&other.idx)) } } @@ -68,20 +70,24 @@ impl PartialOrd for Entry { /// Closely matches the dijkstra example in [std::collections::binary_heap]. fn dijkstra( data: AdjacencyList, - from_idx: u32, - to_idx: u32, -) -> Option> { + from: PageIdx, + to: PageIdx, +) -> Option> { println!("> Prepare state"); let mut data = data .change_page_data(DijkstraPageInfo::from_page_info) .change_link_data(DijkstraLinkInfo::from_link_info); let mut queue = BinaryHeap::new(); - data.page_mut(from_idx).data.cost = 0; - queue.push(Entry::new(0, from_idx)); + data.page_mut(from).data.cost = 0; + queue.push(Entry::new(0, from)); println!("> Run dijkstra"); - while let Some(Entry { cost, page_idx }) = queue.pop() { - if page_idx == to_idx { + while let Some(Entry { + cost, + idx: page_idx, + }) = queue.pop() + { + if page_idx == to { // We've found the shortest path to our target break; } @@ -98,13 +104,13 @@ fn dijkstra( let next = Entry { cost: cost + if redirect { 0 } else { link.data.cost }, - page_idx: link.to, + idx: link.to, }; let target_page = data.page_mut(link.to); if next.cost < target_page.data.cost { target_page.data.cost = next.cost; - target_page.data.prev_page_idx = page_idx; + target_page.data.prev = page_idx; queue.push(next); } } @@ -112,16 +118,16 @@ fn dijkstra( println!("> Collect results"); let mut steps = vec![]; - let mut at_idx = to_idx; + let mut at = to; loop { - steps.push(at_idx); - at_idx = data.page(at_idx).data.prev_page_idx; - if at_idx == u32::MAX { + steps.push(at); + at = data.page(at).data.prev; + if at == PageIdx(u32::MAX) { break; }; } steps.reverse(); - if steps.first() == Some(&from_idx) { + if steps.first() == Some(&from) { Some(steps) } else { None @@ -131,7 +137,7 @@ fn dijkstra( pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { println!(">> Import"); let mut databuf = BufReader::new(File::open(datafile)?); - let data = AdjacencyList::read(&mut databuf)?; + let data = store::read_adjacency_list(&mut databuf)?; let pages = data.pages.clone(); println!(">> Locate from and to"); @@ -146,7 +152,7 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { if let Some(path) = path { println!("Path found:"); for page_idx in path { - let page = &pages[page_idx as usize]; + let page = &pages[page_idx.0 as usize]; if page.data.redirect { println!(" v {:?}", page.data.title); } else { diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs index 476d1f9..18d56a6 100644 --- a/brood/src/commands/reexport.rs +++ b/brood/src/commands/reexport.rs @@ -2,19 +2,19 @@ use std::fs::File; use std::io::{self, BufReader, BufWriter}; use std::path::Path; -use crate::data::AdjacencyList; +use crate::data::store; pub fn reexport(from: &Path, to: &Path) -> io::Result<()> { eprintln!(">> Import"); let mut from = BufReader::new(File::open(from)?); - let data = AdjacencyList::read(&mut from)?; + let data = store::read_adjacency_list(&mut from)?; eprintln!(">> Consistency check"); data.check_consistency(); eprintln!(">> Export"); let mut to = BufWriter::new(File::create(to)?); - data.write(&mut to)?; + store::write_adjacency_list(&data, &mut to)?; Ok(()) } diff --git a/brood/src/data.rs b/brood/src/data.rs index a420c4f..16aa0eb 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -1,303 +1,3 @@ -use std::io::{self, Read, Write}; -use std::ops::Range; - -mod ioutil { - use std::io::{self, Read, Write}; - - pub fn write_u8(n: u8, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) - } - - pub fn read_u8(from: &mut R) -> io::Result { - let mut buf = [0_u8; 1]; - from.read_exact(&mut buf)?; - Ok(u8::from_le_bytes(buf)) - } - - pub fn write_u16(n: u16, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) - } - - pub fn read_u16(from: &mut R) -> io::Result { - let mut buf = [0_u8; 2]; - from.read_exact(&mut buf)?; - Ok(u16::from_le_bytes(buf)) - } - - pub fn write_u32(n: u32, to: &mut W) -> io::Result<()> { - to.write_all(&n.to_le_bytes()) - } - - pub fn read_u32(from: &mut R) -> io::Result { - let mut buf = [0_u8; 4]; - from.read_exact(&mut buf)?; - Ok(u32::from_le_bytes(buf)) - } - - pub fn write_str(s: &str, to: &mut W) -> io::Result<()> { - assert!(s.len() <= u16::MAX as usize); - write_u16(s.len() as u16, to)?; - to.write_all(s.as_bytes())?; - Ok(()) - } - - pub fn read_str(from: &mut R) -> io::Result { - let len = read_u16(from)? as usize; - let mut buf = vec![0_u8; len]; - from.read_exact(&mut buf)?; - Ok(String::from_utf8(buf).unwrap()) - } -} - -#[derive(Debug, Clone)] -pub struct PageInfo { - pub id: u32, - pub length: u32, - pub redirect: bool, - pub title: String, -} - -#[derive(Debug, Clone, Copy)] -pub struct Page

{ - pub link_idx: u32, - pub data: P, -} - -impl Page { - pub fn write(&self, to: &mut W) -> io::Result<()> { - ioutil::write_u32(self.link_idx, to)?; - ioutil::write_u32(self.data.id, to)?; - ioutil::write_u32(self.data.length, to)?; - ioutil::write_u8(if self.data.redirect { 1 } else { 0 }, to)?; - ioutil::write_str(&self.data.title, to)?; - - Ok(()) - } - - pub fn read(from: &mut R) -> io::Result { - let link_idx = ioutil::read_u32(from)?; - let id = ioutil::read_u32(from)?; - let length = ioutil::read_u32(from)?; - let redirect = ioutil::read_u8(from)? != 0; - let title = ioutil::read_str(from)?; - - Ok(Self { - link_idx, - data: PageInfo { - id, - length, - redirect, - title, - }, - }) - } -} - -impl

Page

{ - pub fn change_data(self, f: impl Fn(P) -> P2) -> Page { - Page { - link_idx: self.link_idx, - data: f(self.data), - } - } -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct LinkInfo { - pub start: u32, - pub len: u32, - pub flags: u8, -} - -impl LinkInfo { - pub fn end(self) -> u32 { - self.start + self.len - } - - pub fn in_parens(self) -> bool { - self.flags & 0b1 != 0 - } - - pub fn in_structure(self) -> bool { - self.flags & 0b10 != 0 - } -} - -#[derive(Debug, Clone, Copy)] -pub struct Link { - pub to: u32, - pub data: L, -} - -impl Link { - pub fn write(&self, to: &mut W) -> io::Result<()> { - ioutil::write_u32(self.to, to)?; - ioutil::write_u32(self.data.start, to)?; - ioutil::write_u32(self.data.len, to)?; - ioutil::write_u8(self.data.flags, to)?; - - Ok(()) - } - - pub fn read(from: &mut R) -> io::Result { - let to = ioutil::read_u32(from)?; - let start = ioutil::read_u32(from)?; - let len = ioutil::read_u32(from)?; - let flags = ioutil::read_u8(from)?; - - Ok(Self { - to, - data: LinkInfo { start, len, flags }, - }) - } -} - -impl Link { - pub fn change_data(self, f: impl Fn(L) -> L2) -> Link { - Link { - to: self.to, - data: f(self.data), - } - } -} - -pub struct AdjacencyList { - pub pages: Vec>, - pub links: Vec>, -} - -impl Default for AdjacencyList { - fn default() -> Self { - Self { - pages: Default::default(), - links: Default::default(), - } - } -} - -impl AdjacencyList { - pub fn write(&self, to: &mut W) -> io::Result<()> { - ioutil::write_u32(self.pages.len() as u32, to)?; - ioutil::write_u32(self.links.len() as u32, to)?; - - for page in &self.pages { - page.write(to)?; - } - - for link in &self.links { - link.write(to)?; - } - - Ok(()) - } - - pub fn read(from: &mut R) -> io::Result { - let n_pages = ioutil::read_u32(from)?; - let n_links = ioutil::read_u32(from)?; - - let mut pages = vec![]; - for _ in 0..n_pages { - pages.push(Page::read(from)?); - } - - let mut links = vec![]; - for _ in 0..n_links { - links.push(Link::read(from)?); - } - - Ok(Self { pages, links }) - } - - pub fn check_consistency(&self) { - // Check that all types are large enough - assert!(self.pages.len() <= u32::MAX as usize, "pages len"); - assert!(self.links.len() <= u32::MAX as usize, "links len"); - for page in &self.pages { - assert!(page.data.title.len() <= u8::MAX as usize, "page title len"); - } - - // Check that all links contain valid indices - let range = 0..self.pages.len() as u32; - for link in &self.links { - if !range.contains(&link.to) { - panic!("Invalid link detected!"); - } - } - - // Check that all redirect pages have at most one link - for page_idx in 0..self.pages.len() as u32 - 1 { - let page = self.page(page_idx); - if page.data.redirect { - let start_idx = page.link_idx; - let end_idx = self.page(page_idx + 1).link_idx; - let n_links = end_idx - start_idx; - if n_links > 1 { - panic!( - "Redirect {:?} has too many ({n_links}) links", - page.data.title - ); - } - } - } - } -} - -impl AdjacencyList { - pub fn page(&self, idx: u32) -> &Page

{ - &self.pages[idx as usize] - } - - pub fn page_mut(&mut self, idx: u32) -> &mut Page

{ - &mut self.pages[idx as usize] - } - - pub fn link_range(&self, page_idx: u32) -> Range { - let start_idx = self.page(page_idx).link_idx; - let end_idx = self.page(page_idx + 1).link_idx; - start_idx..end_idx - } - - pub fn link_redirect(&self, page_idx: u32) -> Option { - let start_idx = self.page(page_idx).link_idx; - let end_idx = self.page(page_idx + 1).link_idx; - if start_idx == end_idx { - None - } else { - Some(start_idx) - } - } - - pub fn link(&self, idx: u32) -> &Link { - &self.links[idx as usize] - } - - pub fn link_mut(&mut self, idx: u32) -> &mut Link { - &mut self.links[idx as usize] - } - - pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { - let pages = self - .pages - .into_iter() - .map(|p| p.change_data(page_f)) - .collect::>(); - - AdjacencyList { - pages, - links: self.links, - } - } - - pub fn change_link_data(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList { - let links = self - .links - .into_iter() - .map(|l| l.change_data(link_f)) - .collect::>(); - - AdjacencyList { - pages: self.pages, - links, - } - } -} +pub mod adjacency_list; +pub mod info; +pub mod store; diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs new file mode 100644 index 0000000..016cc1b --- /dev/null +++ b/brood/src/data/adjacency_list.rs @@ -0,0 +1,152 @@ +use super::info::{LinkInfo, PageInfo}; + +pub const SENTINEL_PAGE_MARKER: &str = "Q2AKO3OYzyitmCJURghJ"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PageIdx(pub u32); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct LinkIdx(pub u32); + +#[derive(Debug, Clone, Copy)] +pub struct Page

{ + pub start: LinkIdx, + pub data: P, +} + +impl

Page

{ + pub fn change_data(self, f: impl Fn(P) -> P2) -> Page { + Page { + start: self.start, + data: f(self.data), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Link { + pub to: PageIdx, + pub data: L, +} + +impl Link { + pub fn change_data(self, f: impl Fn(L) -> L2) -> Link { + Link { + to: self.to, + data: f(self.data), + } + } +} + +pub struct AdjacencyList { + pub pages: Vec>, + pub links: Vec>, +} + +impl Default for AdjacencyList { + fn default() -> Self { + Self { + pages: Default::default(), + links: Default::default(), + } + } +} + +impl AdjacencyList { + pub fn page(&self, idx: PageIdx) -> &Page

{ + &self.pages[idx.0 as usize] + } + + pub fn page_mut(&mut self, idx: PageIdx) -> &mut Page

{ + &mut self.pages[idx.0 as usize] + } + + pub fn link_range(&self, idx: PageIdx) -> impl DoubleEndedIterator { + let start_idx = self.page(idx).start; + let end_idx = self.page(PageIdx(idx.0 + 1)).start; + (start_idx.0..end_idx.0).map(LinkIdx) + } + + pub fn link_redirect(&self, idx: PageIdx) -> Option { + let start_idx = self.page(idx).start; + let end_idx = self.page(PageIdx(idx.0 + 1)).start; + if start_idx == end_idx { + None + } else { + Some(start_idx) + } + } + + pub fn link(&self, idx: LinkIdx) -> &Link { + &self.links[idx.0 as usize] + } + + pub fn link_mut(&mut self, idx: LinkIdx) -> &mut Link { + &mut self.links[idx.0 as usize] + } + + pub fn change_page_data(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList { + let pages = self + .pages + .into_iter() + .map(|p| p.change_data(page_f)) + .collect::>(); + + AdjacencyList { + pages, + links: self.links, + } + } + + pub fn change_link_data(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList { + let links = self + .links + .into_iter() + .map(|l| l.change_data(link_f)) + .collect::>(); + + AdjacencyList { + pages: self.pages, + links, + } + } +} + +impl AdjacencyList { + pub fn check_consistency(&self) { + // Check that we have a sentinel page + let sentinel = self.pages.last().expect("no sentinel page"); + assert!(sentinel.data.id == u32::MAX, "unmarked sentinel page"); + assert!( + sentinel.data.title.contains(SENTINEL_PAGE_MARKER), + "unmarked sentinel page" + ); + + // Check that all types are large enough + assert!(self.pages.len() < u32::MAX as usize, "too many pages"); + assert!(self.links.len() < u32::MAX as usize, "too many links"); + for page in &self.pages { + assert!( + page.data.title.len() <= u8::MAX as usize, + "page title too long" + ); + } + + // Check that all links contain valid indices. Links must not link to + // the sentinel page. + let range = 0..self.pages.len() as u32 - 1; + for link in &self.links { + assert!(range.contains(&link.to.0), "invalid link"); + } + + // Check that all redirect pages have at most one link + for page_idx in (0..self.pages.len() as u32 - 1).map(PageIdx) { + let page = self.page(page_idx); + if page.data.redirect { + let mut range = self.link_range(page_idx); + range.next(); // 0 or 1 links allowed + assert!(range.next().is_none(), "too many redirect links"); + } + } + } +} diff --git a/brood/src/data/info.rs b/brood/src/data/info.rs new file mode 100644 index 0000000..dad04d4 --- /dev/null +++ b/brood/src/data/info.rs @@ -0,0 +1,24 @@ +#[derive(Debug, Clone)] +pub struct PageInfo { + pub id: u32, + pub title: String, + pub length: u32, + pub redirect: bool, +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct LinkInfo { + pub start: u32, + pub len: u32, + pub flags: u8, +} + +impl LinkInfo { + pub fn in_parens(self) -> bool { + self.flags & 0b1 != 0 + } + + pub fn in_structure(self) -> bool { + self.flags & 0b10 != 0 + } +} diff --git a/brood/src/data/store.rs b/brood/src/data/store.rs new file mode 100644 index 0000000..72ac044 --- /dev/null +++ b/brood/src/data/store.rs @@ -0,0 +1,134 @@ +use std::io::{self, Read, Write}; + +use super::{ + adjacency_list::{AdjacencyList, Link, LinkIdx, Page, PageIdx}, + info::{LinkInfo, PageInfo}, +}; + +fn write_u8(n: u8, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u8(from: &mut R) -> io::Result { + let mut buf = [0_u8; 1]; + from.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) +} + +fn write_u16(n: u16, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u16(from: &mut R) -> io::Result { + let mut buf = [0_u8; 2]; + from.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) +} + +fn write_u32(n: u32, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) +} + +fn read_u32(from: &mut R) -> io::Result { + let mut buf = [0_u8; 4]; + from.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +fn write_str(s: &str, to: &mut W) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(s.len() as u16, to)?; + to.write_all(s.as_bytes())?; + Ok(()) +} + +fn read_str(from: &mut R) -> io::Result { + let len = read_u16(from)? as usize; + let mut buf = vec![0_u8; len]; + from.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) +} + +fn write_page(page: &Page, to: &mut W) -> io::Result<()> { + write_u32(page.start.0, to)?; + write_u32(page.data.id, to)?; + write_u32(page.data.length, to)?; + write_u8(if page.data.redirect { 1 } else { 0 }, to)?; + write_str(&page.data.title, to)?; + + Ok(()) +} + +pub fn read_page(from: &mut R) -> io::Result> { + let start = LinkIdx(read_u32(from)?); + let id = read_u32(from)?; + let length = read_u32(from)?; + let redirect = read_u8(from)? != 0; + let title = read_str(from)?; + + Ok(Page { + start, + data: PageInfo { + id, + length, + redirect, + title, + }, + }) +} + +fn write_link(link: &Link, to: &mut W) -> io::Result<()> { + write_u32(link.to.0, to)?; + write_u32(link.data.start, to)?; + write_u32(link.data.len, to)?; + write_u8(link.data.flags, to)?; + + Ok(()) +} + +fn read_link(from: &mut R) -> io::Result> { + let to = PageIdx(read_u32(from)?); + let start = read_u32(from)?; + let len = read_u32(from)?; + let flags = read_u8(from)?; + + Ok(Link { + to, + data: LinkInfo { start, len, flags }, + }) +} + +pub fn write_adjacency_list( + al: &AdjacencyList, + to: &mut W, +) -> io::Result<()> { + write_u32(al.pages.len() as u32, to)?; + write_u32(al.links.len() as u32, to)?; + + for page in &al.pages { + write_page(page, to)?; + } + + for link in &al.links { + write_link(link, to)?; + } + + Ok(()) +} + +pub fn read_adjacency_list(from: &mut R) -> io::Result> { + let n_pages = read_u32(from)?; + let n_links = read_u32(from)?; + + let mut pages = vec![]; + for _ in 0..n_pages { + pages.push(read_page(from)?); + } + + let mut links = vec![]; + for _ in 0..n_links { + links.push(read_link(from)?); + } + + Ok(AdjacencyList { pages, links }) +} diff --git a/brood/src/util.rs b/brood/src/util.rs index 69edb5b..c43cfae 100644 --- a/brood/src/util.rs +++ b/brood/src/util.rs @@ -1,4 +1,7 @@ -use crate::data::{AdjacencyList, LinkInfo, Page, PageInfo}; +use crate::data::{ + adjacency_list::{AdjacencyList, Page, PageIdx}, + info::{LinkInfo, PageInfo}, +}; pub fn normalize_link(link: &str) -> String { let link = link.trim().replace(' ', "_"); @@ -12,17 +15,21 @@ pub fn normalize_link(link: &str) -> String { .collect::() } -pub fn find_index_of_title(pages: &[Page], title: &str) -> u32 { +pub fn find_index_of_title(pages: &[Page], title: &str) -> PageIdx { let title = normalize_link(title); - pages + let idx = pages .iter() .enumerate() .find(|(_, p)| normalize_link(&p.data.title) == title) .map(|(i, _)| i) - .expect("invalid title") as u32 + .expect("invalid title") as u32; + PageIdx(idx) } -pub fn resolve_redirects(data: &AdjacencyList, mut page_idx: u32) -> u32 { +pub fn resolve_redirects( + data: &AdjacencyList, + mut page_idx: PageIdx, +) -> PageIdx { loop { if data.page(page_idx).data.redirect { if let Some(link_idx) = data.link_redirect(page_idx) {