Refactor data representation and storage
Mostly moving around code
This commit is contained in:
parent
0eb745e928
commit
7a2372fedd
10 changed files with 416 additions and 379 deletions
|
|
@ -2,11 +2,16 @@ use std::collections::hash_map::Entry;
|
|||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, BufWriter};
|
||||
use std::path::Path;
|
||||
use std::u32;
|
||||
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::data::{AdjacencyList, Link, LinkInfo, Page, PageInfo};
|
||||
use crate::data::adjacency_list::{
|
||||
AdjacencyList, Link, LinkIdx, Page, PageIdx, SENTINEL_PAGE_MARKER,
|
||||
};
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
|
|
@ -70,7 +75,7 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
|||
let json_page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||
|
||||
result.pages.push(Page {
|
||||
link_idx: result.links.len() as u32,
|
||||
start: LinkIdx(result.links.len() as u32),
|
||||
data: PageInfo {
|
||||
id: json_page.id,
|
||||
length: json_page.length,
|
||||
|
|
@ -82,14 +87,14 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
|||
if let Some(to) = json_page.redirect {
|
||||
let to = titles.insert(util::normalize_link(&to));
|
||||
result.links.push(Link {
|
||||
to,
|
||||
to: PageIdx(to),
|
||||
data: LinkInfo::default(),
|
||||
});
|
||||
} else {
|
||||
for (to, start, len, flags) in json_page.links {
|
||||
let to = titles.insert(util::normalize_link(&to));
|
||||
result.links.push(Link {
|
||||
to,
|
||||
to: PageIdx(to),
|
||||
data: LinkInfo { start, len, flags },
|
||||
});
|
||||
}
|
||||
|
|
@ -106,12 +111,12 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
|||
eprintln!("Title map entries: {}", titles.map.len());
|
||||
|
||||
result.pages.push(Page {
|
||||
link_idx: result.links.len() as u32,
|
||||
start: LinkIdx(result.links.len() as u32),
|
||||
data: PageInfo {
|
||||
id: 0,
|
||||
id: u32::MAX,
|
||||
length: 0,
|
||||
redirect: false,
|
||||
title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(),
|
||||
title: SENTINEL_PAGE_MARKER.to_string(),
|
||||
},
|
||||
});
|
||||
|
||||
|
|
@ -148,18 +153,18 @@ fn second_stage(
|
|||
|
||||
for page_idx in 0..first_stage.pages.len() - 1 {
|
||||
let mut page = first_stage.pages[page_idx].clone();
|
||||
let start_link_idx = page.link_idx;
|
||||
let end_link_idx = first_stage.pages[page_idx + 1].link_idx;
|
||||
let start_link_idx = page.start;
|
||||
let end_link_idx = first_stage.pages[page_idx + 1].start;
|
||||
|
||||
page.link_idx = result.links.len() as u32;
|
||||
page.start.0 = result.links.len() as u32;
|
||||
result.pages.push(page);
|
||||
|
||||
for link_idx in start_link_idx..end_link_idx {
|
||||
for link_idx in start_link_idx.0..end_link_idx.0 {
|
||||
let mut link = first_stage.links[link_idx as usize];
|
||||
let title = util::normalize_link(titles.get(link.to));
|
||||
let title = util::normalize_link(titles.get(link.to.0));
|
||||
if let Some(to) = pages_map.get(&title) {
|
||||
// The link points to an existing article, we should keep it
|
||||
link.to = *to;
|
||||
link.to.0 = *to;
|
||||
result.links.push(link);
|
||||
}
|
||||
}
|
||||
|
|
@ -174,7 +179,7 @@ fn second_stage(
|
|||
eprintln!("Page map entries: {}", pages_map.len());
|
||||
|
||||
let mut sentinel = first_stage.pages.last().unwrap().clone();
|
||||
sentinel.link_idx = result.links.len() as u32;
|
||||
sentinel.start.0 = result.links.len() as u32;
|
||||
result.pages.push(sentinel);
|
||||
|
||||
result
|
||||
|
|
@ -192,7 +197,7 @@ pub fn ingest(datafile: &Path) -> io::Result<()> {
|
|||
|
||||
eprintln!(">> Export");
|
||||
let mut datafile = BufWriter::new(File::create(datafile)?);
|
||||
data.write(&mut datafile)?;
|
||||
store::write_adjacency_list(&data, &mut datafile)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,15 +2,17 @@ use std::fs::File;
|
|||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::AdjacencyList;
|
||||
use crate::data::adjacency_list::PageIdx;
|
||||
use crate::data::store;
|
||||
|
||||
pub fn run(datafile: &Path) -> io::Result<()> {
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = AdjacencyList::read(&mut databuf)?;
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
|
||||
for (page_idx, page) in data.pages.iter().enumerate() {
|
||||
let page_idx = PageIdx(page_idx as u32);
|
||||
if page.data.redirect {
|
||||
for link_idx in data.link_range(page_idx as u32) {
|
||||
for link_idx in data.link_range(page_idx) {
|
||||
let target_page = data.page(data.link(link_idx).to);
|
||||
println!("{:?} -> {:?}", page.data.title, target_page.data.title);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,12 +3,14 @@ use std::fs::File;
|
|||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::{AdjacencyList, LinkInfo, PageInfo};
|
||||
use crate::data::adjacency_list::{AdjacencyList, PageIdx};
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
struct DijkstraPageInfo {
|
||||
cost: u32,
|
||||
prev_page_idx: u32,
|
||||
prev: PageIdx,
|
||||
redirect: bool,
|
||||
}
|
||||
|
||||
|
|
@ -16,7 +18,7 @@ impl DijkstraPageInfo {
|
|||
fn from_page_info(info: PageInfo) -> Self {
|
||||
Self {
|
||||
cost: u32::MAX,
|
||||
prev_page_idx: u32::MAX,
|
||||
prev: PageIdx(u32::MAX),
|
||||
redirect: info.redirect,
|
||||
}
|
||||
}
|
||||
|
|
@ -40,12 +42,12 @@ impl DijkstraLinkInfo {
|
|||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct Entry {
|
||||
cost: u32,
|
||||
page_idx: u32,
|
||||
idx: PageIdx,
|
||||
}
|
||||
|
||||
impl Entry {
|
||||
pub fn new(cost: u32, page_idx: u32) -> Self {
|
||||
Self { cost, page_idx }
|
||||
pub fn new(cost: u32, idx: PageIdx) -> Self {
|
||||
Self { cost, idx }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -55,7 +57,7 @@ impl Ord for Entry {
|
|||
other
|
||||
.cost
|
||||
.cmp(&self.cost)
|
||||
.then_with(|| self.page_idx.cmp(&other.page_idx))
|
||||
.then_with(|| self.idx.cmp(&other.idx))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +70,7 @@ impl PartialOrd for Entry {
|
|||
/// Closely matches the dijkstra example in [std::collections::binary_heap].
|
||||
fn full_dijkstra(
|
||||
data: AdjacencyList<PageInfo, LinkInfo>,
|
||||
from_idx: u32,
|
||||
from_idx: PageIdx,
|
||||
) -> AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo> {
|
||||
println!("> Prepare state");
|
||||
let mut data = data
|
||||
|
|
@ -79,7 +81,11 @@ fn full_dijkstra(
|
|||
queue.push(Entry::new(0, from_idx));
|
||||
|
||||
println!("> Run dijkstra");
|
||||
while let Some(Entry { cost, page_idx }) = queue.pop() {
|
||||
while let Some(Entry {
|
||||
cost,
|
||||
idx: page_idx,
|
||||
}) = queue.pop()
|
||||
{
|
||||
let page = data.page(page_idx);
|
||||
if cost > page.data.cost {
|
||||
// This queue entry is outdated
|
||||
|
|
@ -92,13 +98,13 @@ fn full_dijkstra(
|
|||
|
||||
let next = Entry {
|
||||
cost: cost + if redirect { 0 } else { link.data.cost },
|
||||
page_idx: link.to,
|
||||
idx: link.to,
|
||||
};
|
||||
|
||||
let target_page = data.page_mut(link.to);
|
||||
if next.cost < target_page.data.cost {
|
||||
target_page.data.cost = next.cost;
|
||||
target_page.data.prev_page_idx = page_idx;
|
||||
target_page.data.prev = page_idx;
|
||||
queue.push(next);
|
||||
}
|
||||
}
|
||||
|
|
@ -109,27 +115,28 @@ fn full_dijkstra(
|
|||
|
||||
fn find_longest_shortest_path(
|
||||
data: AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo>,
|
||||
from_idx: u32,
|
||||
) -> Option<Vec<u32>> {
|
||||
let to_idx = data
|
||||
.pages
|
||||
from: PageIdx,
|
||||
) -> Option<Vec<PageIdx>> {
|
||||
let to = PageIdx(
|
||||
data.pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, p)| p.data.cost != u32::MAX)
|
||||
.max_by_key(|(_, p)| p.data.cost)?
|
||||
.0 as u32;
|
||||
.0 as u32,
|
||||
);
|
||||
|
||||
let mut steps = vec![];
|
||||
let mut at_idx = to_idx;
|
||||
let mut at = to;
|
||||
loop {
|
||||
steps.push(at_idx);
|
||||
at_idx = data.page(at_idx).data.prev_page_idx;
|
||||
if at_idx == u32::MAX {
|
||||
steps.push(at);
|
||||
at = data.page(at).data.prev;
|
||||
if at == PageIdx(u32::MAX) {
|
||||
break;
|
||||
};
|
||||
}
|
||||
steps.reverse();
|
||||
if steps.first() == Some(&from_idx) {
|
||||
if steps.first() == Some(&from) {
|
||||
Some(steps)
|
||||
} else {
|
||||
None
|
||||
|
|
@ -139,7 +146,7 @@ fn find_longest_shortest_path(
|
|||
pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
|
||||
println!(">> Import");
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = AdjacencyList::read(&mut databuf)?;
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
let pages = data.pages.clone();
|
||||
|
||||
println!(">> Locate from and to");
|
||||
|
|
@ -155,7 +162,7 @@ pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
|
|||
if let Some(path) = path {
|
||||
println!("Path found:");
|
||||
for page_idx in path {
|
||||
let page = &pages[page_idx as usize];
|
||||
let page = &pages[page_idx.0 as usize];
|
||||
if page.data.redirect {
|
||||
println!(" v {:?}", page.data.title);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -3,12 +3,14 @@ use std::fs::File;
|
|||
use std::io::{self, BufReader};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::{AdjacencyList, LinkInfo, PageInfo};
|
||||
use crate::data::adjacency_list::{AdjacencyList, PageIdx};
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
struct DijkstraPageInfo {
|
||||
cost: u32,
|
||||
prev_page_idx: u32,
|
||||
prev: PageIdx,
|
||||
redirect: bool,
|
||||
}
|
||||
|
||||
|
|
@ -16,7 +18,7 @@ impl DijkstraPageInfo {
|
|||
fn from_page_info(info: PageInfo) -> Self {
|
||||
Self {
|
||||
cost: u32::MAX,
|
||||
prev_page_idx: u32::MAX,
|
||||
prev: PageIdx(u32::MAX),
|
||||
redirect: info.redirect,
|
||||
}
|
||||
}
|
||||
|
|
@ -40,12 +42,12 @@ impl DijkstraLinkInfo {
|
|||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
struct Entry {
|
||||
cost: u32,
|
||||
page_idx: u32,
|
||||
idx: PageIdx,
|
||||
}
|
||||
|
||||
impl Entry {
|
||||
pub fn new(cost: u32, page_idx: u32) -> Self {
|
||||
Self { cost, page_idx }
|
||||
pub fn new(cost: u32, idx: PageIdx) -> Self {
|
||||
Self { cost, idx }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -55,7 +57,7 @@ impl Ord for Entry {
|
|||
other
|
||||
.cost
|
||||
.cmp(&self.cost)
|
||||
.then_with(|| self.page_idx.cmp(&other.page_idx))
|
||||
.then_with(|| self.idx.cmp(&other.idx))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -68,20 +70,24 @@ impl PartialOrd for Entry {
|
|||
/// Closely matches the dijkstra example in [std::collections::binary_heap].
|
||||
fn dijkstra(
|
||||
data: AdjacencyList<PageInfo, LinkInfo>,
|
||||
from_idx: u32,
|
||||
to_idx: u32,
|
||||
) -> Option<Vec<u32>> {
|
||||
from: PageIdx,
|
||||
to: PageIdx,
|
||||
) -> Option<Vec<PageIdx>> {
|
||||
println!("> Prepare state");
|
||||
let mut data = data
|
||||
.change_page_data(DijkstraPageInfo::from_page_info)
|
||||
.change_link_data(DijkstraLinkInfo::from_link_info);
|
||||
let mut queue = BinaryHeap::new();
|
||||
data.page_mut(from_idx).data.cost = 0;
|
||||
queue.push(Entry::new(0, from_idx));
|
||||
data.page_mut(from).data.cost = 0;
|
||||
queue.push(Entry::new(0, from));
|
||||
|
||||
println!("> Run dijkstra");
|
||||
while let Some(Entry { cost, page_idx }) = queue.pop() {
|
||||
if page_idx == to_idx {
|
||||
while let Some(Entry {
|
||||
cost,
|
||||
idx: page_idx,
|
||||
}) = queue.pop()
|
||||
{
|
||||
if page_idx == to {
|
||||
// We've found the shortest path to our target
|
||||
break;
|
||||
}
|
||||
|
|
@ -98,13 +104,13 @@ fn dijkstra(
|
|||
|
||||
let next = Entry {
|
||||
cost: cost + if redirect { 0 } else { link.data.cost },
|
||||
page_idx: link.to,
|
||||
idx: link.to,
|
||||
};
|
||||
|
||||
let target_page = data.page_mut(link.to);
|
||||
if next.cost < target_page.data.cost {
|
||||
target_page.data.cost = next.cost;
|
||||
target_page.data.prev_page_idx = page_idx;
|
||||
target_page.data.prev = page_idx;
|
||||
queue.push(next);
|
||||
}
|
||||
}
|
||||
|
|
@ -112,16 +118,16 @@ fn dijkstra(
|
|||
|
||||
println!("> Collect results");
|
||||
let mut steps = vec![];
|
||||
let mut at_idx = to_idx;
|
||||
let mut at = to;
|
||||
loop {
|
||||
steps.push(at_idx);
|
||||
at_idx = data.page(at_idx).data.prev_page_idx;
|
||||
if at_idx == u32::MAX {
|
||||
steps.push(at);
|
||||
at = data.page(at).data.prev;
|
||||
if at == PageIdx(u32::MAX) {
|
||||
break;
|
||||
};
|
||||
}
|
||||
steps.reverse();
|
||||
if steps.first() == Some(&from_idx) {
|
||||
if steps.first() == Some(&from) {
|
||||
Some(steps)
|
||||
} else {
|
||||
None
|
||||
|
|
@ -131,7 +137,7 @@ fn dijkstra(
|
|||
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
||||
println!(">> Import");
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = AdjacencyList::read(&mut databuf)?;
|
||||
let data = store::read_adjacency_list(&mut databuf)?;
|
||||
let pages = data.pages.clone();
|
||||
|
||||
println!(">> Locate from and to");
|
||||
|
|
@ -146,7 +152,7 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
|||
if let Some(path) = path {
|
||||
println!("Path found:");
|
||||
for page_idx in path {
|
||||
let page = &pages[page_idx as usize];
|
||||
let page = &pages[page_idx.0 as usize];
|
||||
if page.data.redirect {
|
||||
println!(" v {:?}", page.data.title);
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -2,19 +2,19 @@ use std::fs::File;
|
|||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::data::AdjacencyList;
|
||||
use crate::data::store;
|
||||
|
||||
pub fn reexport(from: &Path, to: &Path) -> io::Result<()> {
|
||||
eprintln!(">> Import");
|
||||
let mut from = BufReader::new(File::open(from)?);
|
||||
let data = AdjacencyList::read(&mut from)?;
|
||||
let data = store::read_adjacency_list(&mut from)?;
|
||||
|
||||
eprintln!(">> Consistency check");
|
||||
data.check_consistency();
|
||||
|
||||
eprintln!(">> Export");
|
||||
let mut to = BufWriter::new(File::create(to)?);
|
||||
data.write(&mut to)?;
|
||||
store::write_adjacency_list(&data, &mut to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,303 +1,3 @@
|
|||
use std::io::{self, Read, Write};
|
||||
use std::ops::Range;
|
||||
|
||||
mod ioutil {
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
pub fn write_u8<W: Write>(n: u8, to: &mut W) -> io::Result<()> {
|
||||
to.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
pub fn read_u8<R: Read>(from: &mut R) -> io::Result<u8> {
|
||||
let mut buf = [0_u8; 1];
|
||||
from.read_exact(&mut buf)?;
|
||||
Ok(u8::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
pub fn write_u16<W: Write>(n: u16, to: &mut W) -> io::Result<()> {
|
||||
to.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
pub fn read_u16<R: Read>(from: &mut R) -> io::Result<u16> {
|
||||
let mut buf = [0_u8; 2];
|
||||
from.read_exact(&mut buf)?;
|
||||
Ok(u16::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
pub fn write_u32<W: Write>(n: u32, to: &mut W) -> io::Result<()> {
|
||||
to.write_all(&n.to_le_bytes())
|
||||
}
|
||||
|
||||
pub fn read_u32<R: Read>(from: &mut R) -> io::Result<u32> {
|
||||
let mut buf = [0_u8; 4];
|
||||
from.read_exact(&mut buf)?;
|
||||
Ok(u32::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
pub fn write_str<W: Write>(s: &str, to: &mut W) -> io::Result<()> {
|
||||
assert!(s.len() <= u16::MAX as usize);
|
||||
write_u16(s.len() as u16, to)?;
|
||||
to.write_all(s.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_str<R: Read>(from: &mut R) -> io::Result<String> {
|
||||
let len = read_u16(from)? as usize;
|
||||
let mut buf = vec![0_u8; len];
|
||||
from.read_exact(&mut buf)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageInfo {
|
||||
pub id: u32,
|
||||
pub length: u32,
|
||||
pub redirect: bool,
|
||||
pub title: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Page<P> {
|
||||
pub link_idx: u32,
|
||||
pub data: P,
|
||||
}
|
||||
|
||||
impl Page<PageInfo> {
|
||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||
ioutil::write_u32(self.link_idx, to)?;
|
||||
ioutil::write_u32(self.data.id, to)?;
|
||||
ioutil::write_u32(self.data.length, to)?;
|
||||
ioutil::write_u8(if self.data.redirect { 1 } else { 0 }, to)?;
|
||||
ioutil::write_str(&self.data.title, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read<R: Read>(from: &mut R) -> io::Result<Self> {
|
||||
let link_idx = ioutil::read_u32(from)?;
|
||||
let id = ioutil::read_u32(from)?;
|
||||
let length = ioutil::read_u32(from)?;
|
||||
let redirect = ioutil::read_u8(from)? != 0;
|
||||
let title = ioutil::read_str(from)?;
|
||||
|
||||
Ok(Self {
|
||||
link_idx,
|
||||
data: PageInfo {
|
||||
id,
|
||||
length,
|
||||
redirect,
|
||||
title,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<P> Page<P> {
|
||||
pub fn change_data<P2>(self, f: impl Fn(P) -> P2) -> Page<P2> {
|
||||
Page {
|
||||
link_idx: self.link_idx,
|
||||
data: f(self.data),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub struct LinkInfo {
|
||||
pub start: u32,
|
||||
pub len: u32,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl LinkInfo {
|
||||
pub fn end(self) -> u32 {
|
||||
self.start + self.len
|
||||
}
|
||||
|
||||
pub fn in_parens(self) -> bool {
|
||||
self.flags & 0b1 != 0
|
||||
}
|
||||
|
||||
pub fn in_structure(self) -> bool {
|
||||
self.flags & 0b10 != 0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Link<L> {
|
||||
pub to: u32,
|
||||
pub data: L,
|
||||
}
|
||||
|
||||
impl Link<LinkInfo> {
|
||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||
ioutil::write_u32(self.to, to)?;
|
||||
ioutil::write_u32(self.data.start, to)?;
|
||||
ioutil::write_u32(self.data.len, to)?;
|
||||
ioutil::write_u8(self.data.flags, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read<R: Read>(from: &mut R) -> io::Result<Self> {
|
||||
let to = ioutil::read_u32(from)?;
|
||||
let start = ioutil::read_u32(from)?;
|
||||
let len = ioutil::read_u32(from)?;
|
||||
let flags = ioutil::read_u8(from)?;
|
||||
|
||||
Ok(Self {
|
||||
to,
|
||||
data: LinkInfo { start, len, flags },
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<L> Link<L> {
|
||||
pub fn change_data<L2>(self, f: impl Fn(L) -> L2) -> Link<L2> {
|
||||
Link {
|
||||
to: self.to,
|
||||
data: f(self.data),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AdjacencyList<P, L> {
|
||||
pub pages: Vec<Page<P>>,
|
||||
pub links: Vec<Link<L>>,
|
||||
}
|
||||
|
||||
impl<P, L> Default for AdjacencyList<P, L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pages: Default::default(),
|
||||
links: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AdjacencyList<PageInfo, LinkInfo> {
|
||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||
ioutil::write_u32(self.pages.len() as u32, to)?;
|
||||
ioutil::write_u32(self.links.len() as u32, to)?;
|
||||
|
||||
for page in &self.pages {
|
||||
page.write(to)?;
|
||||
}
|
||||
|
||||
for link in &self.links {
|
||||
link.write(to)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read<R: Read>(from: &mut R) -> io::Result<Self> {
|
||||
let n_pages = ioutil::read_u32(from)?;
|
||||
let n_links = ioutil::read_u32(from)?;
|
||||
|
||||
let mut pages = vec![];
|
||||
for _ in 0..n_pages {
|
||||
pages.push(Page::read(from)?);
|
||||
}
|
||||
|
||||
let mut links = vec![];
|
||||
for _ in 0..n_links {
|
||||
links.push(Link::read(from)?);
|
||||
}
|
||||
|
||||
Ok(Self { pages, links })
|
||||
}
|
||||
|
||||
pub fn check_consistency(&self) {
|
||||
// Check that all types are large enough
|
||||
assert!(self.pages.len() <= u32::MAX as usize, "pages len");
|
||||
assert!(self.links.len() <= u32::MAX as usize, "links len");
|
||||
for page in &self.pages {
|
||||
assert!(page.data.title.len() <= u8::MAX as usize, "page title len");
|
||||
}
|
||||
|
||||
// Check that all links contain valid indices
|
||||
let range = 0..self.pages.len() as u32;
|
||||
for link in &self.links {
|
||||
if !range.contains(&link.to) {
|
||||
panic!("Invalid link detected!");
|
||||
}
|
||||
}
|
||||
|
||||
// Check that all redirect pages have at most one link
|
||||
for page_idx in 0..self.pages.len() as u32 - 1 {
|
||||
let page = self.page(page_idx);
|
||||
if page.data.redirect {
|
||||
let start_idx = page.link_idx;
|
||||
let end_idx = self.page(page_idx + 1).link_idx;
|
||||
let n_links = end_idx - start_idx;
|
||||
if n_links > 1 {
|
||||
panic!(
|
||||
"Redirect {:?} has too many ({n_links}) links",
|
||||
page.data.title
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<P, L> AdjacencyList<P, L> {
|
||||
pub fn page(&self, idx: u32) -> &Page<P> {
|
||||
&self.pages[idx as usize]
|
||||
}
|
||||
|
||||
pub fn page_mut(&mut self, idx: u32) -> &mut Page<P> {
|
||||
&mut self.pages[idx as usize]
|
||||
}
|
||||
|
||||
pub fn link_range(&self, page_idx: u32) -> Range<u32> {
|
||||
let start_idx = self.page(page_idx).link_idx;
|
||||
let end_idx = self.page(page_idx + 1).link_idx;
|
||||
start_idx..end_idx
|
||||
}
|
||||
|
||||
pub fn link_redirect(&self, page_idx: u32) -> Option<u32> {
|
||||
let start_idx = self.page(page_idx).link_idx;
|
||||
let end_idx = self.page(page_idx + 1).link_idx;
|
||||
if start_idx == end_idx {
|
||||
None
|
||||
} else {
|
||||
Some(start_idx)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn link(&self, idx: u32) -> &Link<L> {
|
||||
&self.links[idx as usize]
|
||||
}
|
||||
|
||||
pub fn link_mut(&mut self, idx: u32) -> &mut Link<L> {
|
||||
&mut self.links[idx as usize]
|
||||
}
|
||||
|
||||
pub fn change_page_data<P2>(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList<P2, L> {
|
||||
let pages = self
|
||||
.pages
|
||||
.into_iter()
|
||||
.map(|p| p.change_data(page_f))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
AdjacencyList {
|
||||
pages,
|
||||
links: self.links,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn change_link_data<L2>(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList<P, L2> {
|
||||
let links = self
|
||||
.links
|
||||
.into_iter()
|
||||
.map(|l| l.change_data(link_f))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
AdjacencyList {
|
||||
pages: self.pages,
|
||||
links,
|
||||
}
|
||||
}
|
||||
}
|
||||
pub mod adjacency_list;
|
||||
pub mod info;
|
||||
pub mod store;
|
||||
|
|
|
|||
152
brood/src/data/adjacency_list.rs
Normal file
152
brood/src/data/adjacency_list.rs
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
use super::info::{LinkInfo, PageInfo};
|
||||
|
||||
pub const SENTINEL_PAGE_MARKER: &str = "Q2AKO3OYzyitmCJURghJ";
|
||||
|
||||
/// Strongly typed index into the page array of an adjacency list.
///
/// The wrapped `u32` is public so callers can convert to and from raw indices.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct PageIdx(pub u32);
|
||||
|
||||
/// Strongly typed index into the link array of an adjacency list.
///
/// The wrapped `u32` is public so callers can convert to and from raw indices.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinkIdx(pub u32);
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Page<P> {
|
||||
pub start: LinkIdx,
|
||||
pub data: P,
|
||||
}
|
||||
|
||||
impl<P> Page<P> {
|
||||
pub fn change_data<P2>(self, f: impl Fn(P) -> P2) -> Page<P2> {
|
||||
Page {
|
||||
start: self.start,
|
||||
data: f(self.data),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Link<L> {
|
||||
pub to: PageIdx,
|
||||
pub data: L,
|
||||
}
|
||||
|
||||
impl<L> Link<L> {
|
||||
pub fn change_data<L2>(self, f: impl Fn(L) -> L2) -> Link<L2> {
|
||||
Link {
|
||||
to: self.to,
|
||||
data: f(self.data),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AdjacencyList<P, L> {
|
||||
pub pages: Vec<Page<P>>,
|
||||
pub links: Vec<Link<L>>,
|
||||
}
|
||||
|
||||
impl<P, L> Default for AdjacencyList<P, L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pages: Default::default(),
|
||||
links: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<P, L> AdjacencyList<P, L> {
|
||||
pub fn page(&self, idx: PageIdx) -> &Page<P> {
|
||||
&self.pages[idx.0 as usize]
|
||||
}
|
||||
|
||||
pub fn page_mut(&mut self, idx: PageIdx) -> &mut Page<P> {
|
||||
&mut self.pages[idx.0 as usize]
|
||||
}
|
||||
|
||||
pub fn link_range(&self, idx: PageIdx) -> impl DoubleEndedIterator<Item = LinkIdx> {
|
||||
let start_idx = self.page(idx).start;
|
||||
let end_idx = self.page(PageIdx(idx.0 + 1)).start;
|
||||
(start_idx.0..end_idx.0).map(LinkIdx)
|
||||
}
|
||||
|
||||
pub fn link_redirect(&self, idx: PageIdx) -> Option<LinkIdx> {
|
||||
let start_idx = self.page(idx).start;
|
||||
let end_idx = self.page(PageIdx(idx.0 + 1)).start;
|
||||
if start_idx == end_idx {
|
||||
None
|
||||
} else {
|
||||
Some(start_idx)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn link(&self, idx: LinkIdx) -> &Link<L> {
|
||||
&self.links[idx.0 as usize]
|
||||
}
|
||||
|
||||
pub fn link_mut(&mut self, idx: LinkIdx) -> &mut Link<L> {
|
||||
&mut self.links[idx.0 as usize]
|
||||
}
|
||||
|
||||
pub fn change_page_data<P2>(self, page_f: impl Fn(P) -> P2 + Copy) -> AdjacencyList<P2, L> {
|
||||
let pages = self
|
||||
.pages
|
||||
.into_iter()
|
||||
.map(|p| p.change_data(page_f))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
AdjacencyList {
|
||||
pages,
|
||||
links: self.links,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn change_link_data<L2>(self, link_f: impl Fn(L) -> L2 + Copy) -> AdjacencyList<P, L2> {
|
||||
let links = self
|
||||
.links
|
||||
.into_iter()
|
||||
.map(|l| l.change_data(link_f))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
AdjacencyList {
|
||||
pages: self.pages,
|
||||
links,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AdjacencyList<PageInfo, LinkInfo> {
|
||||
pub fn check_consistency(&self) {
|
||||
// Check that we have a sentinel page
|
||||
let sentinel = self.pages.last().expect("no sentinel page");
|
||||
assert!(sentinel.data.id == u32::MAX, "unmarked sentinel page");
|
||||
assert!(
|
||||
sentinel.data.title.contains(SENTINEL_PAGE_MARKER),
|
||||
"unmarked sentinel page"
|
||||
);
|
||||
|
||||
// Check that all types are large enough
|
||||
assert!(self.pages.len() < u32::MAX as usize, "too many pages");
|
||||
assert!(self.links.len() < u32::MAX as usize, "too many links");
|
||||
for page in &self.pages {
|
||||
assert!(
|
||||
page.data.title.len() <= u8::MAX as usize,
|
||||
"page title too long"
|
||||
);
|
||||
}
|
||||
|
||||
// Check that all links contain valid indices. Links must not link to
|
||||
// the sentinel page.
|
||||
let range = 0..self.pages.len() as u32 - 1;
|
||||
for link in &self.links {
|
||||
assert!(range.contains(&link.to.0), "invalid link");
|
||||
}
|
||||
|
||||
// Check that all redirect pages have at most one link
|
||||
for page_idx in (0..self.pages.len() as u32 - 1).map(PageIdx) {
|
||||
let page = self.page(page_idx);
|
||||
if page.data.redirect {
|
||||
let mut range = self.link_range(page_idx);
|
||||
range.next(); // 0 or 1 links allowed
|
||||
assert!(range.next().is_none(), "too many redirect links");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
24
brood/src/data/info.rs
Normal file
24
brood/src/data/info.rs
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
/// Payload stored with every page of the adjacency list.
#[derive(Clone, Debug)]
pub struct PageInfo {
    /// Numeric page id; the sentinel page at the end of the list uses
    /// `u32::MAX`.
    pub id: u32,
    /// Page title.
    pub title: String,
    /// Length value taken from the input data (presumably the size of the
    /// page text; confirm against the ingest format).
    pub length: u32,
    /// Whether the page is a redirect.
    pub redirect: bool,
}
|
||||
|
||||
/// Payload stored with every link of the adjacency list.
#[derive(Clone, Copy, Debug, Default)]
pub struct LinkInfo {
    /// Start value taken from the input data (presumably the position of
    /// the link inside the page text; confirm against the ingest format).
    pub start: u32,
    /// Length value taken from the input data (presumably the length of
    /// the link inside the page text; confirm against the ingest format).
    pub len: u32,
    /// Bit set: bit 0 is reported by `in_parens`, bit 1 by `in_structure`.
    pub flags: u8,
}

impl LinkInfo {
    /// Flag bit tested by [`LinkInfo::in_parens`].
    const IN_PARENS: u8 = 1 << 0;
    /// Flag bit tested by [`LinkInfo::in_structure`].
    const IN_STRUCTURE: u8 = 1 << 1;

    /// Returns `true` when bit 0 of `flags` is set.
    pub fn in_parens(self) -> bool {
        (self.flags & Self::IN_PARENS) == Self::IN_PARENS
    }

    /// Returns `true` when bit 1 of `flags` is set.
    pub fn in_structure(self) -> bool {
        (self.flags & Self::IN_STRUCTURE) == Self::IN_STRUCTURE
    }
}
|
||||
134
brood/src/data/store.rs
Normal file
134
brood/src/data/store.rs
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
use std::io::{self, Read, Write};
|
||||
|
||||
use super::{
|
||||
adjacency_list::{AdjacencyList, Link, LinkIdx, Page, PageIdx},
|
||||
info::{LinkInfo, PageInfo},
|
||||
};
|
||||
|
||||
/// Writes `n` to `to` as a single byte.
fn write_u8<W: Write>(n: u8, to: &mut W) -> io::Result<()> {
    to.write_all(&[n])
}
|
||||
|
||||
/// Reads a single byte from `from`.
///
/// Fails with the reader's error if no byte is available.
fn read_u8<R: Read>(from: &mut R) -> io::Result<u8> {
    let mut byte = [0_u8];
    from.read_exact(&mut byte)?;
    let [value] = byte;
    Ok(value)
}
|
||||
|
||||
/// Writes `n` to `to` as two bytes, least significant byte first.
fn write_u16<W: Write>(n: u16, to: &mut W) -> io::Result<()> {
    let encoded = [n as u8, (n >> 8) as u8];
    to.write_all(&encoded)
}
|
||||
|
||||
/// Reads two bytes from `from` and decodes them as a little-endian `u16`.
///
/// Fails with the reader's error if fewer than two bytes are available.
fn read_u16<R: Read>(from: &mut R) -> io::Result<u16> {
    let mut bytes = [0_u8; 2];
    from.read_exact(&mut bytes)?;
    let [low, high] = bytes;
    Ok(u16::from(low) | (u16::from(high) << 8))
}
|
||||
|
||||
/// Writes `n` to `to` as four bytes, least significant byte first.
fn write_u32<W: Write>(n: u32, to: &mut W) -> io::Result<()> {
    let encoded = [n as u8, (n >> 8) as u8, (n >> 16) as u8, (n >> 24) as u8];
    to.write_all(&encoded)
}
|
||||
|
||||
/// Reads four bytes from `from` and decodes them as a little-endian `u32`.
///
/// Fails with the reader's error if fewer than four bytes are available.
fn read_u32<R: Read>(from: &mut R) -> io::Result<u32> {
    let mut bytes = [0_u8; 4];
    from.read_exact(&mut bytes)?;
    // Fold from the most significant (last) byte down to the first one.
    let value = bytes
        .iter()
        .rev()
        .fold(0_u32, |acc, &byte| (acc << 8) | u32::from(byte));
    Ok(value)
}
|
||||
|
||||
fn write_str<W: Write>(s: &str, to: &mut W) -> io::Result<()> {
|
||||
assert!(s.len() <= u16::MAX as usize);
|
||||
write_u16(s.len() as u16, to)?;
|
||||
to.write_all(s.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_str<R: Read>(from: &mut R) -> io::Result<String> {
|
||||
let len = read_u16(from)? as usize;
|
||||
let mut buf = vec![0_u8; len];
|
||||
from.read_exact(&mut buf)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
|
||||
fn write_page<W: Write>(page: &Page<PageInfo>, to: &mut W) -> io::Result<()> {
|
||||
write_u32(page.start.0, to)?;
|
||||
write_u32(page.data.id, to)?;
|
||||
write_u32(page.data.length, to)?;
|
||||
write_u8(if page.data.redirect { 1 } else { 0 }, to)?;
|
||||
write_str(&page.data.title, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_page<R: Read>(from: &mut R) -> io::Result<Page<PageInfo>> {
|
||||
let start = LinkIdx(read_u32(from)?);
|
||||
let id = read_u32(from)?;
|
||||
let length = read_u32(from)?;
|
||||
let redirect = read_u8(from)? != 0;
|
||||
let title = read_str(from)?;
|
||||
|
||||
Ok(Page {
|
||||
start,
|
||||
data: PageInfo {
|
||||
id,
|
||||
length,
|
||||
redirect,
|
||||
title,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn write_link<W: Write>(link: &Link<LinkInfo>, to: &mut W) -> io::Result<()> {
|
||||
write_u32(link.to.0, to)?;
|
||||
write_u32(link.data.start, to)?;
|
||||
write_u32(link.data.len, to)?;
|
||||
write_u8(link.data.flags, to)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_link<R: Read>(from: &mut R) -> io::Result<Link<LinkInfo>> {
|
||||
let to = PageIdx(read_u32(from)?);
|
||||
let start = read_u32(from)?;
|
||||
let len = read_u32(from)?;
|
||||
let flags = read_u8(from)?;
|
||||
|
||||
Ok(Link {
|
||||
to,
|
||||
data: LinkInfo { start, len, flags },
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write_adjacency_list<W: Write>(
|
||||
al: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
to: &mut W,
|
||||
) -> io::Result<()> {
|
||||
write_u32(al.pages.len() as u32, to)?;
|
||||
write_u32(al.links.len() as u32, to)?;
|
||||
|
||||
for page in &al.pages {
|
||||
write_page(page, to)?;
|
||||
}
|
||||
|
||||
for link in &al.links {
|
||||
write_link(link, to)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read_adjacency_list<R: Read>(from: &mut R) -> io::Result<AdjacencyList<PageInfo, LinkInfo>> {
|
||||
let n_pages = read_u32(from)?;
|
||||
let n_links = read_u32(from)?;
|
||||
|
||||
let mut pages = vec![];
|
||||
for _ in 0..n_pages {
|
||||
pages.push(read_page(from)?);
|
||||
}
|
||||
|
||||
let mut links = vec![];
|
||||
for _ in 0..n_links {
|
||||
links.push(read_link(from)?);
|
||||
}
|
||||
|
||||
Ok(AdjacencyList { pages, links })
|
||||
}
|
||||
|
|
@ -1,4 +1,7 @@
|
|||
use crate::data::{AdjacencyList, LinkInfo, Page, PageInfo};
|
||||
use crate::data::{
|
||||
adjacency_list::{AdjacencyList, Page, PageIdx},
|
||||
info::{LinkInfo, PageInfo},
|
||||
};
|
||||
|
||||
pub fn normalize_link(link: &str) -> String {
|
||||
let link = link.trim().replace(' ', "_");
|
||||
|
|
@ -12,17 +15,21 @@ pub fn normalize_link(link: &str) -> String {
|
|||
.collect::<String>()
|
||||
}
|
||||
|
||||
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> u32 {
|
||||
pub fn find_index_of_title(pages: &[Page<PageInfo>], title: &str) -> PageIdx {
|
||||
let title = normalize_link(title);
|
||||
pages
|
||||
let idx = pages
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find(|(_, p)| normalize_link(&p.data.title) == title)
|
||||
.map(|(i, _)| i)
|
||||
.expect("invalid title") as u32
|
||||
.expect("invalid title") as u32;
|
||||
PageIdx(idx)
|
||||
}
|
||||
|
||||
pub fn resolve_redirects(data: &AdjacencyList<PageInfo, LinkInfo>, mut page_idx: u32) -> u32 {
|
||||
pub fn resolve_redirects(
|
||||
data: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
mut page_idx: PageIdx,
|
||||
) -> PageIdx {
|
||||
loop {
|
||||
if data.page(page_idx).data.redirect {
|
||||
if let Some(link_idx) = data.link_redirect(page_idx) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue