From f71092058b4c4d6ca20952eaaa22a362829e1117 Mon Sep 17 00:00:00 2001 From: Joscha Date: Mon, 3 Oct 2022 22:14:58 +0200 Subject: [PATCH] Refactor export and add page length --- brood/src/commands/ingest.rs | 14 ++- brood/src/commands/reexport.rs | 8 +- brood/src/data.rs | 175 ++++++++++++++++++++------------- 3 files changed, 123 insertions(+), 74 deletions(-) diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index dc6263e..c9508a7 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -72,13 +72,14 @@ impl FirstStage { } } - fn insert_page(&mut self, id: u32, title: String, redirect: bool) { + fn insert_page(&mut self, id: u32, length: u32, redirect: bool, title: String) { let link_idx = self.pages.len() as u32; self.pages.push(Page { link_idx, id, - title, + length, redirect, + title, }); } @@ -87,7 +88,7 @@ impl FirstStage { } fn import_json_page(&mut self, page: JsonPage) { - self.insert_page(page.id, page.title, page.redirect.is_some()); + self.insert_page(page.id, page.length, page.redirect.is_some(), page.title); for (to, start, end) in page.links { let to = self.insert_title(util::normalize_link(&to)); self.insert_link(to, start, end); @@ -95,7 +96,12 @@ impl FirstStage { } fn finalize(&mut self) { - self.insert_page(0, "dummy page at the end of all pages".to_string(), false); + self.insert_page( + 0, + 0, + false, + "dummy page at the end of all pages".to_string(), + ); } fn from_stdin() -> io::Result { diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs index 715db7e..476d1f9 100644 --- a/brood/src/commands/reexport.rs +++ b/brood/src/commands/reexport.rs @@ -6,15 +6,15 @@ use crate::data::AdjacencyList; pub fn reexport(from: &Path, to: &Path) -> io::Result<()> { eprintln!(">> Import"); - let from = BufReader::new(File::open(from)?); - let data = AdjacencyList::read(from)?; + let mut from = BufReader::new(File::open(from)?); + let data = AdjacencyList::read(&mut from)?; eprintln!(">> Consistency check"); data.check_consistency(); eprintln!(">> Export"); - let to = BufWriter::new(File::create(to)?); - data.write(to)?; + let mut to = BufWriter::new(File::create(to)?); + data.write(&mut to)?; Ok(()) } diff --git a/brood/src/data.rs b/brood/src/data.rs index 75c06c7..c994d66 100644 --- a/brood/src/data.rs +++ b/brood/src/data.rs @@ -2,12 +2,89 @@ use std::io::{self, Read, Write}; use serde::{Deserialize, Serialize}; +mod ioutil { + use std::io::{self, Read, Write}; + + pub fn write_u8(n: u8, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) + } + + pub fn read_u8(from: &mut R) -> io::Result { + let mut buf = [0_u8; 1]; + from.read_exact(&mut buf)?; + Ok(u8::from_le_bytes(buf)) + } + + pub fn write_u16(n: u16, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) + } + + pub fn read_u16(from: &mut R) -> io::Result { + let mut buf = [0_u8; 2]; + from.read_exact(&mut buf)?; + Ok(u16::from_le_bytes(buf)) + } + + pub fn write_u32(n: u32, to: &mut W) -> io::Result<()> { + to.write_all(&n.to_le_bytes()) + } + + pub fn read_u32(from: &mut R) -> io::Result { + let mut buf = [0_u8; 4]; + from.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) + } + + pub fn write_str(s: &str, to: &mut W) -> io::Result<()> { + assert!(s.len() <= u16::MAX as usize); + write_u16(s.len() as u16, to)?; + to.write_all(s.as_bytes())?; + Ok(()) + } + + pub fn read_str(from: &mut R) -> io::Result { + let len = read_u16(from)? as usize; + let mut buf = vec![0_u8; len]; + from.read_exact(&mut buf)?; + Ok(String::from_utf8(buf).unwrap()) + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Page { pub link_idx: u32, pub id: u32, - pub title: String, + pub length: u32, pub redirect: bool, + pub title: String, +} + +impl Page { + pub fn write(&self, to: &mut W) -> io::Result<()> { + ioutil::write_u32(self.link_idx, to)?; + ioutil::write_u32(self.id, to)?; + ioutil::write_u32(self.length, to)?; + ioutil::write_u8(if self.redirect { 1 } else { 0 }, to)?; + ioutil::write_str(&self.title, to)?; + + Ok(()) + } + + pub fn read(from: &mut R) -> io::Result { + let link_idx = ioutil::read_u32(from)?; + let id = ioutil::read_u32(from)?; + let length = ioutil::read_u32(from)?; + let redirect = ioutil::read_u8(from)? != 0; + let title = ioutil::read_str(from)?; + + Ok(Self { + link_idx, + id, + length, + redirect, + title, + }) + } } #[derive(Debug, Clone, Copy, Serialize, Deserialize)] @@ -17,6 +94,24 @@ pub struct Link { pub end: u32, } +impl Link { + pub fn write(&self, to: &mut W) -> io::Result<()> { + ioutil::write_u32(self.to, to)?; + ioutil::write_u32(self.start, to)?; + ioutil::write_u32(self.end, to)?; + + Ok(()) + } + + pub fn read(from: &mut R) -> io::Result { + let to = ioutil::read_u32(from)?; + let start = ioutil::read_u32(from)?; + let end = ioutil::read_u32(from)?; + + Ok(Self { to, start, end }) + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct AdjacencyList { pub pages: Vec, @@ -33,87 +128,35 @@ impl AdjacencyList { } } - pub fn write(&self, mut to: W) -> io::Result<()> { - let n_pages: u32 = self.pages.len() as u32; - to.write_all(&n_pages.to_le_bytes())?; - - let n_links: u32 = self.links.len() as u32; - to.write_all(&n_links.to_le_bytes())?; + pub fn write(&self, to: &mut W) -> io::Result<()> { + ioutil::write_u32(self.pages.len() as u32, to)?; + ioutil::write_u32(self.links.len() as u32, to)?; for page in &self.pages { - to.write_all(&page.link_idx.to_le_bytes())?; - to.write_all(&page.id.to_le_bytes())?; - to.write_all(&[if page.redirect { 1 } else { 0 }])?; - - let title_len: u16 = page.title.len() as u16; - to.write_all(&title_len.to_le_bytes())?; - to.write_all(page.title.as_bytes())?; + page.write(to)?; } for link in &self.links { - to.write_all(&link.to.to_le_bytes())?; - to.write_all(&link.start.to_le_bytes())?; - to.write_all(&link.end.to_le_bytes())?; + link.write(to)?; } Ok(()) } - pub fn read(mut from: R) -> io::Result { - let mut result = Self { - pages: vec![], - links: vec![], - }; - - let mut u8_buf = [0_u8; 1]; - let mut u16_buf = [0_u8; 2]; - let mut u32_buf = [0_u8; 4]; - - from.read_exact(&mut u32_buf)?; - let n_pages = u32::from_le_bytes(u32_buf); - - from.read_exact(&mut u32_buf)?; - let n_links = u32::from_le_bytes(u32_buf); + pub fn read(from: &mut R) -> io::Result { + let n_pages = ioutil::read_u32(from)?; + let n_links = ioutil::read_u32(from)?; + let mut pages = vec![]; for _ in 0..n_pages { - from.read_exact(&mut u32_buf)?; - let link_idx = u32::from_le_bytes(u32_buf); - - from.read_exact(&mut u32_buf)?; - let id = u32::from_le_bytes(u32_buf); - - from.read_exact(&mut u8_buf)?; - let redirect = u8_buf[0] != 0; - - from.read_exact(&mut u16_buf)?; - let title_len = u16::from_le_bytes(u16_buf); - let mut title_bytes = vec![0_u8; title_len as usize]; - from.read_exact(&mut title_bytes)?; - let title = String::from_utf8(title_bytes).unwrap(); - - let page = Page { - link_idx, - id, - title, - redirect, - }; - result.pages.push(page); + pages.push(Page::read(from)?); } + let mut links = vec![]; for _ in 0..n_links { - from.read_exact(&mut u32_buf)?; - let to = u32::from_le_bytes(u32_buf); - - from.read_exact(&mut u32_buf)?; - let start = u32::from_le_bytes(u32_buf); - - from.read_exact(&mut u32_buf)?; - let end = u32::from_le_bytes(u32_buf); - - let link = Link { to, start, end }; - result.links.push(link); + links.push(Link::read(from)?); } - Ok(result) + Ok(Self { pages, links }) } }