Make adjacency list extensible

This commit is contained in:
Joscha 2022-10-21 20:39:53 +02:00
parent 78aa27c019
commit 3a75089e5a
2 changed files with 60 additions and 44 deletions

View file

@ -41,9 +41,9 @@ form a proper adjacency list.
struct FirstStage {
/// List with page info and index into [`Self::links`].
pages: Vec<Page>,
pages: Vec<Page<()>>,
/// List with link info and index into [`Self::titles`].
links: Vec<Link>,
links: Vec<Link<()>>,
/// List with titles.
titles: Vec<String>,
/// Map from normalized title to index in [`Self::titles`].
@ -80,11 +80,17 @@ impl FirstStage {
length,
redirect,
title,
data: (),
});
}
/// Pushes a `Link` built from `to`, `start` and `end` onto `self.links`.
///
/// At this stage `to` is presumably an index into `self.titles` rather than
/// into `self.pages` — TODO confirm against the caller.
fn insert_link(&mut self, to: u32, start: u32, end: u32) {
self.links.push(Link { to, start, end });
self.links.push(Link {
to,
start,
end,
// No extra per-link payload is attached here.
data: (),
});
}
fn import_json_page(&mut self, page: JsonPage) {
@ -125,9 +131,9 @@ impl FirstStage {
/// Pages and links in which every link target is an index into `pages`,
/// together with the title lookup used to resolve those targets.
struct SecondStage {
/// List with page info and index into [`Self::links`].
pages: Vec<Page>,
pages: Vec<Page<()>>,
/// List with link info and index into [`Self::pages`].
links: Vec<Link>,
links: Vec<Link<()>>,
/// Map from normalized title to index in [`Self::pages`].
pages_map: FxHashMap<String, u32>,
}
@ -141,20 +147,20 @@ impl SecondStage {
}
}
fn initialize_pages_map(&mut self, pages: &[Page]) {
/// Fills `self.pages_map` so that each page's normalized title maps to the
/// page's position in `pages`.
///
/// When several pages normalize to the same title, the last one wins.
fn initialize_pages_map(&mut self, pages: &[Page<()>]) {
    let entries = pages
        .iter()
        .enumerate()
        .map(|(position, page)| (util::normalize_link(&page.title), position as u32));
    self.pages_map.extend(entries);
}
fn insert_page(&mut self, page: &Page) {
/// Appends a copy of `page` to `self.pages`, re-pointing its `link_idx` at the
/// current end of `self.links`.
///
/// The next link pushed onto `self.links` therefore becomes the first link of
/// the page inserted here.
fn insert_page(&mut self, page: &Page<()>) {
    let mut page = page.clone();
    // `link_idx` is an index into `links`, so it must be derived from the
    // number of links stored so far, not from the number of pages.
    page.link_idx = self.links.len() as u32;
    self.pages.push(page);
}
fn insert_link(&mut self, mut link: Link, titles: &[String]) {
fn insert_link(&mut self, mut link: Link<()>, titles: &[String]) {
let title = &titles[link.to as usize];
if let Some(page_idx) = self.pages_map.get(title) {
link.to = *page_idx;
@ -162,7 +168,7 @@ impl SecondStage {
}
}
fn finalize(&mut self, pages: &[Page]) {
/// Inserts the final entry of `pages` through `insert_page`.
///
/// # Panics
///
/// Panics if `pages` is empty.
fn finalize(&mut self, pages: &[Page<()>]) {
    let last_page = pages.last().unwrap();
    self.insert_page(last_page);
}
@ -194,7 +200,7 @@ impl SecondStage {
result
}
fn into_adjacency_list(self) -> AdjacencyList {
fn into_adjacency_list(self) -> AdjacencyList<(), ()> {
AdjacencyList {
pages: self.pages,
links: self.links,

View file

@ -51,15 +51,16 @@ mod ioutil {
}
/// A page record carrying an extra payload of type `P`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Page {
pub struct Page<P> {
/// Index tied to this page's links — presumably the position of its first
/// link in the accompanying link list; TODO confirm against the producers.
pub link_idx: u32,
pub id: u32,
pub length: u32,
pub redirect: bool,
pub title: String,
/// Extra per-page payload; `()` when there is nothing to attach.
pub data: P,
}
impl Page {
impl Page<()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.link_idx, to)?;
ioutil::write_u32(self.id, to)?;
@ -83,18 +84,20 @@ impl Page {
length,
redirect,
title,
data: (),
})
}
}
/// A link record carrying an extra payload of type `L`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Link {
pub struct Link<L> {
/// Target of the link, stored as an index; `AdjacencyList::check_consistency`
/// validates it against the page list.
pub to: u32,
pub start: u32,
pub end: u32,
/// Extra per-link payload; `()` when there is nothing to attach.
pub data: L,
}
impl Link {
impl Link<()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.to, to)?;
ioutil::write_u32(self.start, to)?;
@ -108,42 +111,22 @@ impl Link {
let start = ioutil::read_u32(from)?;
let end = ioutil::read_u32(from)?;
Ok(Self { to, start, end })
Ok(Self {
to,
start,
end,
data: (),
})
}
}
/// Pages and links stored as two flat lists, generic over the page payload `P`
/// and the link payload `L`.
#[derive(Debug, Serialize, Deserialize)]
pub struct AdjacencyList {
pub pages: Vec<Page>,
pub links: Vec<Link>,
pub struct AdjacencyList<P, L> {
pub pages: Vec<Page<P>>,
pub links: Vec<Link<L>>,
}
impl AdjacencyList {
/// Checks that the list's sizes and link targets are valid, panicking on the
/// first violation found.
///
/// # Panics
///
/// - if the number of pages or links does not fit in a `u32`,
/// - if any page title is longer than `u8::MAX` bytes,
/// - if any link's `to` is not a valid index into `pages`.
pub fn check_consistency(&self) {
    // Element counts must fit into `u32`. The `u32` fields of pages and links
    // need no range check of their own: their type already bounds them.
    assert!(self.pages.len() <= u32::MAX as usize, "pages len");
    assert!(self.links.len() <= u32::MAX as usize, "links len");

    for page in &self.pages {
        assert!(page.title.len() <= u8::MAX as usize, "page title len");
    }

    // Every link target must be an index into `pages`. The cast cannot
    // truncate because the page count was bounded above.
    let page_count = self.pages.len() as u32;
    if self.links.iter().any(|link| link.to >= page_count) {
        panic!("Invalid link detected!");
    }
}
impl AdjacencyList<(), ()> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.pages.len() as u32, to)?;
ioutil::write_u32(self.links.len() as u32, to)?;
@ -176,3 +159,30 @@ impl AdjacencyList {
Ok(Self { pages, links })
}
}
impl<P, L> AdjacencyList<P, L> {
    /// Checks that the list's sizes and link targets are valid, panicking on
    /// the first violation found.
    ///
    /// # Panics
    ///
    /// - if the number of pages or links does not fit in a `u32`,
    /// - if any page title is longer than `u8::MAX` bytes,
    /// - if any link's `to` is not a valid index into `pages`.
    pub fn check_consistency(&self) {
        // Element counts must fit into `u32`. The `u32` fields of pages and
        // links need no range check of their own: their type already bounds
        // them.
        assert!(self.pages.len() <= u32::MAX as usize, "pages len");
        assert!(self.links.len() <= u32::MAX as usize, "links len");

        for page in &self.pages {
            assert!(page.title.len() <= u8::MAX as usize, "page title len");
        }

        // Every link target must be an index into `pages`. The cast cannot
        // truncate because the page count was bounded above.
        let page_count = self.pages.len() as u32;
        if self.links.iter().any(|link| link.to >= page_count) {
            panic!("Invalid link detected!");
        }
    }
}