diff --git a/brood/src/data.rs b/brood/src/data.rs
index 130f0ef..f57c42d 100644
--- a/brood/src/data.rs
+++ b/brood/src/data.rs
@@ -1,3 +1,5 @@
+use std::io::{self, Read, Write};
+
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -20,3 +22,117 @@ pub struct AdjacencyList {
     pub pages: Vec<Page>,
     pub links: Vec<Link>,
 }
+
+/// Converts an in-memory length to the fixed-width integer stored on disk,
+/// failing with `InvalidInput` instead of silently truncating it.
+fn encode_len<T: std::convert::TryFrom<usize>>(len: usize, what: &str) -> io::Result<T> {
+    T::try_from(len).map_err(|_| {
+        io::Error::new(
+            io::ErrorKind::InvalidInput,
+            format!("{} ({}) does not fit the on-disk format", what, len),
+        )
+    })
+}
+
+impl AdjacencyList {
+    /// Serializes the adjacency list to `to` in a little-endian binary format.
+    ///
+    /// Layout: page count (`u32`), link count (`u32`), then every page as
+    /// `link_idx` (`u32`), `id` (`u32`), `redirect` (one byte, 0 or 1), title
+    /// length in bytes (`u16`) and the title bytes, and finally every link as
+    /// `to`, `start` and `end` (`u32` each).
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidInput` if the number of pages or links does not fit in
+    /// a `u32` or a title is longer than `u16::MAX` bytes, and forwards any
+    /// error reported by `to`.
+    pub fn write<W: Write>(&self, mut to: W) -> io::Result<()> {
+        let n_pages: u32 = encode_len(self.pages.len(), "page count")?;
+        let n_links: u32 = encode_len(self.links.len(), "link count")?;
+        to.write_all(&n_pages.to_le_bytes())?;
+        to.write_all(&n_links.to_le_bytes())?;
+
+        for page in &self.pages {
+            // Validate the length before emitting any byte of this record.
+            let title_len: u16 = encode_len(page.title.len(), "title length")?;
+
+            to.write_all(&page.link_idx.to_le_bytes())?;
+            to.write_all(&page.id.to_le_bytes())?;
+            to.write_all(&[u8::from(page.redirect)])?;
+            to.write_all(&title_len.to_le_bytes())?;
+            to.write_all(page.title.as_bytes())?;
+        }
+
+        for link in &self.links {
+            to.write_all(&link.to.to_le_bytes())?;
+            to.write_all(&link.start.to_le_bytes())?;
+            to.write_all(&link.end.to_le_bytes())?;
+        }
+
+        Ok(())
+    }
+
+    /// Deserializes an adjacency list from the format produced by `write`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidData` if a title is not valid UTF-8, and forwards any
+    /// error reported by `from`, such as `UnexpectedEof` on truncated input.
+    pub fn read<R: Read>(mut from: R) -> io::Result<Self> {
+        let mut u8_buf = [0_u8; 1];
+        let mut u16_buf = [0_u8; 2];
+        let mut u32_buf = [0_u8; 4];
+
+        from.read_exact(&mut u32_buf)?;
+        let n_pages = u32::from_le_bytes(u32_buf);
+
+        from.read_exact(&mut u32_buf)?;
+        let n_links = u32::from_le_bytes(u32_buf);
+
+        // Both counts come straight from the input, so the vectors grow on
+        // demand rather than being preallocated from unvalidated sizes.
+        let mut pages = Vec::new();
+        for _ in 0..n_pages {
+            from.read_exact(&mut u32_buf)?;
+            let link_idx = u32::from_le_bytes(u32_buf);
+
+            from.read_exact(&mut u32_buf)?;
+            let id = u32::from_le_bytes(u32_buf);
+
+            from.read_exact(&mut u8_buf)?;
+            let redirect = u8_buf[0] != 0;
+
+            from.read_exact(&mut u16_buf)?;
+            let title_len = usize::from(u16::from_le_bytes(u16_buf));
+            let mut title_bytes = vec![0_u8; title_len];
+            from.read_exact(&mut title_bytes)?;
+            // Corrupt input must surface as an error, not a panic.
+            let title = String::from_utf8(title_bytes)
+                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+            pages.push(Page {
+                link_idx,
+                id,
+                title,
+                redirect,
+            });
+        }
+
+        let mut links = Vec::new();
+        for _ in 0..n_links {
+            from.read_exact(&mut u32_buf)?;
+            let to = u32::from_le_bytes(u32_buf);
+
+            from.read_exact(&mut u32_buf)?;
+            let start = u32::from_le_bytes(u32_buf);
+
+            from.read_exact(&mut u32_buf)?;
+            let end = u32::from_le_bytes(u32_buf);
+
+            links.push(Link { to, start, end });
+        }
+
+        Ok(Self { pages, links })
+    }
+}
diff --git a/brood/src/ingest.rs b/brood/src/ingest.rs
index 669411b..6356431 100644
--- a/brood/src/ingest.rs
+++ b/brood/src/ingest.rs
@@ -1,5 +1,6 @@
 use std::collections::hash_map::Entry;
-use std::io::{self, BufRead, BufReader};
+use std::fs::File;
+use std::io::{self, BufRead, BufReader, BufWriter};
 use std::path::Path;
 
 use rustc_hash::FxHashMap;
@@ -212,10 +213,9 @@ pub fn ingest(datafile: &Path) -> io::Result<()> {
         }
     }
 
-    // eprintln!("EXPORT");
-    // let data = SlimAdjacencyList::from_alist(second_stage);
-    // ciborium::ser::into_writer(&data, io::stdout()).unwrap();
-    // simd_json::to_writer(io::stdout(), &data).unwrap();
+    eprintln!(">> Export");
+    let mut datafile = BufWriter::new(File::create(datafile)?);
+    data.write(&mut datafile)?;
 
     Ok(())
 }