Make adjacency list extensible

This commit is contained in:
Joscha 2022-10-21 20:39:53 +02:00
parent 78aa27c019
commit 3a75089e5a
2 changed files with 60 additions and 44 deletions

View file

@ -41,9 +41,9 @@ form a proper adjacency list.
struct FirstStage { struct FirstStage {
/// List with page info and index into [`Self::links`]. /// List with page info and index into [`Self::links`].
pages: Vec<Page>, pages: Vec<Page<()>>,
/// List with link info and index into [`Self::titles`]. /// List with link info and index into [`Self::titles`].
links: Vec<Link>, links: Vec<Link<()>>,
/// List with titles. /// List with titles.
titles: Vec<String>, titles: Vec<String>,
/// Map from normalized title to index in [`Self::titles`]. /// Map from normalized title to index in [`Self::titles`].
@ -80,11 +80,17 @@ impl FirstStage {
length, length,
redirect, redirect,
title, title,
data: (),
}); });
} }
fn insert_link(&mut self, to: u32, start: u32, end: u32) { fn insert_link(&mut self, to: u32, start: u32, end: u32) {
self.links.push(Link { to, start, end }); self.links.push(Link {
to,
start,
end,
data: (),
});
} }
fn import_json_page(&mut self, page: JsonPage) { fn import_json_page(&mut self, page: JsonPage) {
@ -125,9 +131,9 @@ impl FirstStage {
struct SecondStage { struct SecondStage {
/// List with page info and index into [`Self::links`]. /// List with page info and index into [`Self::links`].
pages: Vec<Page>, pages: Vec<Page<()>>,
/// List with link info and index into [`Self::pages`]. /// List with link info and index into [`Self::pages`].
links: Vec<Link>, links: Vec<Link<()>>,
/// Map from normalized title to index in [`Self::pages`]. /// Map from normalized title to index in [`Self::pages`].
pages_map: FxHashMap<String, u32>, pages_map: FxHashMap<String, u32>,
} }
@ -141,20 +147,20 @@ impl SecondStage {
} }
} }
fn initialize_pages_map(&mut self, pages: &[Page]) { fn initialize_pages_map(&mut self, pages: &[Page<()>]) {
for (idx, page) in pages.iter().enumerate() { for (idx, page) in pages.iter().enumerate() {
let title = util::normalize_link(&page.title); let title = util::normalize_link(&page.title);
self.pages_map.insert(title, idx as u32); self.pages_map.insert(title, idx as u32);
} }
} }
fn insert_page(&mut self, page: &Page) { fn insert_page(&mut self, page: &Page<()>) {
let mut page = page.clone(); let mut page = page.clone();
page.link_idx = self.pages.len() as u32; page.link_idx = self.pages.len() as u32;
self.pages.push(page); self.pages.push(page);
} }
fn insert_link(&mut self, mut link: Link, titles: &[String]) { fn insert_link(&mut self, mut link: Link<()>, titles: &[String]) {
let title = &titles[link.to as usize]; let title = &titles[link.to as usize];
if let Some(page_idx) = self.pages_map.get(title) { if let Some(page_idx) = self.pages_map.get(title) {
link.to = *page_idx; link.to = *page_idx;
@ -162,7 +168,7 @@ impl SecondStage {
} }
} }
fn finalize(&mut self, pages: &[Page]) { fn finalize(&mut self, pages: &[Page<()>]) {
self.insert_page(pages.last().unwrap()); self.insert_page(pages.last().unwrap());
} }
@ -194,7 +200,7 @@ impl SecondStage {
result result
} }
fn into_adjacency_list(self) -> AdjacencyList { fn into_adjacency_list(self) -> AdjacencyList<(), ()> {
AdjacencyList { AdjacencyList {
pages: self.pages, pages: self.pages,
links: self.links, links: self.links,

View file

@ -51,15 +51,16 @@ mod ioutil {
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Page { pub struct Page<P> {
pub link_idx: u32, pub link_idx: u32,
pub id: u32, pub id: u32,
pub length: u32, pub length: u32,
pub redirect: bool, pub redirect: bool,
pub title: String, pub title: String,
pub data: P,
} }
impl Page { impl Page<()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.link_idx, to)?; ioutil::write_u32(self.link_idx, to)?;
ioutil::write_u32(self.id, to)?; ioutil::write_u32(self.id, to)?;
@ -83,18 +84,20 @@ impl Page {
length, length,
redirect, redirect,
title, title,
data: (),
}) })
} }
} }
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Link { pub struct Link<L> {
pub to: u32, pub to: u32,
pub start: u32, pub start: u32,
pub end: u32, pub end: u32,
pub data: L,
} }
impl Link { impl Link<()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.to, to)?; ioutil::write_u32(self.to, to)?;
ioutil::write_u32(self.start, to)?; ioutil::write_u32(self.start, to)?;
@ -108,42 +111,22 @@ impl Link {
let start = ioutil::read_u32(from)?; let start = ioutil::read_u32(from)?;
let end = ioutil::read_u32(from)?; let end = ioutil::read_u32(from)?;
Ok(Self { to, start, end }) Ok(Self {
to,
start,
end,
data: (),
})
} }
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
pub struct AdjacencyList { pub struct AdjacencyList<P, L> {
pub pages: Vec<Page>, pub pages: Vec<Page<P>>,
pub links: Vec<Link>, pub links: Vec<Link<L>>,
}
impl AdjacencyList {
pub fn check_consistency(&self) {
// Check that all types are large enough
assert!(self.pages.len() <= u32::MAX as usize, "pages len");
assert!(self.links.len() <= u32::MAX as usize, "links len");
for page in &self.pages {
assert!(page.link_idx <= u32::MAX as u32, "page link_idx");
assert!(page.id <= u32::MAX as u32, "page id");
assert!(page.length <= u32::MAX as u32, "page length");
assert!(page.title.len() <= u8::MAX as usize, "page title len");
}
for link in &self.links {
assert!(link.to <= u32::MAX as u32, "link to");
assert!(link.start <= u32::MAX as u32, "link start");
assert!(link.end <= u32::MAX as u32, "link end");
}
// Check that all links contain valid indices
let range = 0..self.pages.len() as u32;
for link in &self.links {
if !range.contains(&link.to) {
panic!("Invalid link detected!");
}
}
} }
impl AdjacencyList<(), ()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.pages.len() as u32, to)?; ioutil::write_u32(self.pages.len() as u32, to)?;
ioutil::write_u32(self.links.len() as u32, to)?; ioutil::write_u32(self.links.len() as u32, to)?;
@ -176,3 +159,30 @@ impl AdjacencyList {
Ok(Self { pages, links }) Ok(Self { pages, links })
} }
} }
impl<P, L> AdjacencyList<P, L> {
    /// Validates the structural invariants of the adjacency list.
    ///
    /// The check is independent of the payload types `P` and `L`; only the
    /// page list, the page titles and the link targets are inspected.
    ///
    /// # Panics
    ///
    /// * `"pages len"` / `"links len"` if the number of pages or links does
    ///   not fit into a `u32`.
    /// * `"page title len"` if any page title is longer than `u8::MAX` bytes.
    /// * `"Invalid link detected!"` if any link's `to` is not a valid index
    ///   into `pages`.
    pub fn check_consistency(&self) {
        // Check that all types are large enough.
        //
        // The `u32` fields of `Page` and `Link` can never exceed `u32::MAX`,
        // so comparing them against it would always succeed; only the
        // `usize` lengths need an explicit range check.
        assert!(self.pages.len() <= u32::MAX as usize, "pages len");
        assert!(self.links.len() <= u32::MAX as usize, "links len");
        for page in &self.pages {
            // NOTE(review): presumably titles are stored with a one-byte
            // length prefix — confirm against `Page::write`.
            assert!(page.title.len() <= usize::from(u8::MAX), "page title len");
        }

        // Check that all links contain valid indices.
        // `u32 -> usize` is a widening conversion here, and the page count
        // was verified above to fit into a `u32`.
        let page_count = self.pages.len();
        assert!(
            self.links
                .iter()
                .all(|link| (link.to as usize) < page_count),
            "Invalid link detected!"
        );
    }
}