Refactor data representation and storage

Mostly moving code around
This commit is contained in:
Joscha 2024-08-25 21:40:11 +02:00
parent 0eb745e928
commit 7a2372fedd
10 changed files with 416 additions and 379 deletions

View file

@ -2,11 +2,16 @@ use std::collections::hash_map::Entry;
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter};
use std::path::Path;
use std::u32;
use rustc_hash::FxHashMap;
use serde::Deserialize;
use crate::data::{AdjacencyList, Link, LinkInfo, Page, PageInfo};
use crate::data::adjacency_list::{
AdjacencyList, Link, LinkIdx, Page, PageIdx, SENTINEL_PAGE_MARKER,
};
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
#[derive(Deserialize)]
@ -70,7 +75,7 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
let json_page = serde_json::from_str::<JsonPage>(&line?).unwrap();
result.pages.push(Page {
link_idx: result.links.len() as u32,
start: LinkIdx(result.links.len() as u32),
data: PageInfo {
id: json_page.id,
length: json_page.length,
@ -82,14 +87,14 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
if let Some(to) = json_page.redirect {
let to = titles.insert(util::normalize_link(&to));
result.links.push(Link {
to,
to: PageIdx(to),
data: LinkInfo::default(),
});
} else {
for (to, start, len, flags) in json_page.links {
let to = titles.insert(util::normalize_link(&to));
result.links.push(Link {
to,
to: PageIdx(to),
data: LinkInfo { start, len, flags },
});
}
@ -106,12 +111,12 @@ fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
eprintln!("Title map entries: {}", titles.map.len());
result.pages.push(Page {
link_idx: result.links.len() as u32,
start: LinkIdx(result.links.len() as u32),
data: PageInfo {
id: 0,
id: u32::MAX,
length: 0,
redirect: false,
title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(),
title: SENTINEL_PAGE_MARKER.to_string(),
},
});
@ -148,18 +153,18 @@ fn second_stage(
for page_idx in 0..first_stage.pages.len() - 1 {
let mut page = first_stage.pages[page_idx].clone();
let start_link_idx = page.link_idx;
let end_link_idx = first_stage.pages[page_idx + 1].link_idx;
let start_link_idx = page.start;
let end_link_idx = first_stage.pages[page_idx + 1].start;
page.link_idx = result.links.len() as u32;
page.start.0 = result.links.len() as u32;
result.pages.push(page);
for link_idx in start_link_idx..end_link_idx {
for link_idx in start_link_idx.0..end_link_idx.0 {
let mut link = first_stage.links[link_idx as usize];
let title = util::normalize_link(titles.get(link.to));
let title = util::normalize_link(titles.get(link.to.0));
if let Some(to) = pages_map.get(&title) {
// The link points to an existing article, we should keep it
link.to = *to;
link.to.0 = *to;
result.links.push(link);
}
}
@ -174,7 +179,7 @@ fn second_stage(
eprintln!("Page map entries: {}", pages_map.len());
let mut sentinel = first_stage.pages.last().unwrap().clone();
sentinel.link_idx = result.links.len() as u32;
sentinel.start.0 = result.links.len() as u32;
result.pages.push(sentinel);
result
@ -192,7 +197,7 @@ pub fn ingest(datafile: &Path) -> io::Result<()> {
eprintln!(">> Export");
let mut datafile = BufWriter::new(File::create(datafile)?);
data.write(&mut datafile)?;
store::write_adjacency_list(&data, &mut datafile)?;
Ok(())
}

View file

@ -2,15 +2,17 @@ use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use crate::data::AdjacencyList;
use crate::data::adjacency_list::PageIdx;
use crate::data::store;
pub fn run(datafile: &Path) -> io::Result<()> {
let mut databuf = BufReader::new(File::open(datafile)?);
let data = AdjacencyList::read(&mut databuf)?;
let data = store::read_adjacency_list(&mut databuf)?;
for (page_idx, page) in data.pages.iter().enumerate() {
let page_idx = PageIdx(page_idx as u32);
if page.data.redirect {
for link_idx in data.link_range(page_idx as u32) {
for link_idx in data.link_range(page_idx) {
let target_page = data.page(data.link(link_idx).to);
println!("{:?} -> {:?}", page.data.title, target_page.data.title);
}

View file

@ -3,12 +3,14 @@ use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use crate::data::{AdjacencyList, LinkInfo, PageInfo};
use crate::data::adjacency_list::{AdjacencyList, PageIdx};
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
struct DijkstraPageInfo {
cost: u32,
prev_page_idx: u32,
prev: PageIdx,
redirect: bool,
}
@ -16,7 +18,7 @@ impl DijkstraPageInfo {
fn from_page_info(info: PageInfo) -> Self {
Self {
cost: u32::MAX,
prev_page_idx: u32::MAX,
prev: PageIdx(u32::MAX),
redirect: info.redirect,
}
}
@ -40,12 +42,12 @@ impl DijkstraLinkInfo {
#[derive(Clone, Copy, PartialEq, Eq)]
struct Entry {
cost: u32,
page_idx: u32,
idx: PageIdx,
}
impl Entry {
pub fn new(cost: u32, page_idx: u32) -> Self {
Self { cost, page_idx }
pub fn new(cost: u32, idx: PageIdx) -> Self {
Self { cost, idx }
}
}
@ -55,7 +57,7 @@ impl Ord for Entry {
other
.cost
.cmp(&self.cost)
.then_with(|| self.page_idx.cmp(&other.page_idx))
.then_with(|| self.idx.cmp(&other.idx))
}
}
@ -68,7 +70,7 @@ impl PartialOrd for Entry {
/// Closely matches the dijkstra example in [std::collections::binary_heap].
fn full_dijkstra(
data: AdjacencyList<PageInfo, LinkInfo>,
from_idx: u32,
from_idx: PageIdx,
) -> AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo> {
println!("> Prepare state");
let mut data = data
@ -79,7 +81,11 @@ fn full_dijkstra(
queue.push(Entry::new(0, from_idx));
println!("> Run dijkstra");
while let Some(Entry { cost, page_idx }) = queue.pop() {
while let Some(Entry {
cost,
idx: page_idx,
}) = queue.pop()
{
let page = data.page(page_idx);
if cost > page.data.cost {
// This queue entry is outdated
@ -92,13 +98,13 @@ fn full_dijkstra(
let next = Entry {
cost: cost + if redirect { 0 } else { link.data.cost },
page_idx: link.to,
idx: link.to,
};
let target_page = data.page_mut(link.to);
if next.cost < target_page.data.cost {
target_page.data.cost = next.cost;
target_page.data.prev_page_idx = page_idx;
target_page.data.prev = page_idx;
queue.push(next);
}
}
@ -109,27 +115,28 @@ fn full_dijkstra(
fn find_longest_shortest_path(
data: AdjacencyList<DijkstraPageInfo, DijkstraLinkInfo>,
from_idx: u32,
) -> Option<Vec<u32>> {
let to_idx = data
.pages
.iter()
.enumerate()
.filter(|(_, p)| p.data.cost != u32::MAX)
.max_by_key(|(_, p)| p.data.cost)?
.0 as u32;
from: PageIdx,
) -> Option<Vec<PageIdx>> {
let to = PageIdx(
data.pages
.iter()
.enumerate()
.filter(|(_, p)| p.data.cost != u32::MAX)
.max_by_key(|(_, p)| p.data.cost)?
.0 as u32,
);
let mut steps = vec![];
let mut at_idx = to_idx;
let mut at = to;
loop {
steps.push(at_idx);
at_idx = data.page(at_idx).data.prev_page_idx;
if at_idx == u32::MAX {
steps.push(at);
at = data.page(at).data.prev;
if at == PageIdx(u32::MAX) {
break;
};
}
steps.reverse();
if steps.first() == Some(&from_idx) {
if steps.first() == Some(&from) {
Some(steps)
} else {
None
@ -139,7 +146,7 @@ fn find_longest_shortest_path(
pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
println!(">> Import");
let mut databuf = BufReader::new(File::open(datafile)?);
let data = AdjacencyList::read(&mut databuf)?;
let data = store::read_adjacency_list(&mut databuf)?;
let pages = data.pages.clone();
println!(">> Locate from and to");
@ -155,7 +162,7 @@ pub fn run(datafile: &Path, from: &str) -> io::Result<()> {
if let Some(path) = path {
println!("Path found:");
for page_idx in path {
let page = &pages[page_idx as usize];
let page = &pages[page_idx.0 as usize];
if page.data.redirect {
println!(" v {:?}", page.data.title);
} else {

View file

@ -3,12 +3,14 @@ use std::fs::File;
use std::io::{self, BufReader};
use std::path::Path;
use crate::data::{AdjacencyList, LinkInfo, PageInfo};
use crate::data::adjacency_list::{AdjacencyList, PageIdx};
use crate::data::info::{LinkInfo, PageInfo};
use crate::data::store;
use crate::util;
struct DijkstraPageInfo {
cost: u32,
prev_page_idx: u32,
prev: PageIdx,
redirect: bool,
}
@ -16,7 +18,7 @@ impl DijkstraPageInfo {
fn from_page_info(info: PageInfo) -> Self {
Self {
cost: u32::MAX,
prev_page_idx: u32::MAX,
prev: PageIdx(u32::MAX),
redirect: info.redirect,
}
}
@ -40,12 +42,12 @@ impl DijkstraLinkInfo {
#[derive(Clone, Copy, PartialEq, Eq)]
struct Entry {
cost: u32,
page_idx: u32,
idx: PageIdx,
}
impl Entry {
pub fn new(cost: u32, page_idx: u32) -> Self {
Self { cost, page_idx }
pub fn new(cost: u32, idx: PageIdx) -> Self {
Self { cost, idx }
}
}
@ -55,7 +57,7 @@ impl Ord for Entry {
other
.cost
.cmp(&self.cost)
.then_with(|| self.page_idx.cmp(&other.page_idx))
.then_with(|| self.idx.cmp(&other.idx))
}
}
@ -68,20 +70,24 @@ impl PartialOrd for Entry {
/// Closely matches the dijkstra example in [std::collections::binary_heap].
fn dijkstra(
data: AdjacencyList<PageInfo, LinkInfo>,
from_idx: u32,
to_idx: u32,
) -> Option<Vec<u32>> {
from: PageIdx,
to: PageIdx,
) -> Option<Vec<PageIdx>> {
println!("> Prepare state");
let mut data = data
.change_page_data(DijkstraPageInfo::from_page_info)
.change_link_data(DijkstraLinkInfo::from_link_info);
let mut queue = BinaryHeap::new();
data.page_mut(from_idx).data.cost = 0;
queue.push(Entry::new(0, from_idx));
data.page_mut(from).data.cost = 0;
queue.push(Entry::new(0, from));
println!("> Run dijkstra");
while let Some(Entry { cost, page_idx }) = queue.pop() {
if page_idx == to_idx {
while let Some(Entry {
cost,
idx: page_idx,
}) = queue.pop()
{
if page_idx == to {
// We've found the shortest path to our target
break;
}
@ -98,13 +104,13 @@ fn dijkstra(
let next = Entry {
cost: cost + if redirect { 0 } else { link.data.cost },
page_idx: link.to,
idx: link.to,
};
let target_page = data.page_mut(link.to);
if next.cost < target_page.data.cost {
target_page.data.cost = next.cost;
target_page.data.prev_page_idx = page_idx;
target_page.data.prev = page_idx;
queue.push(next);
}
}
@ -112,16 +118,16 @@ fn dijkstra(
println!("> Collect results");
let mut steps = vec![];
let mut at_idx = to_idx;
let mut at = to;
loop {
steps.push(at_idx);
at_idx = data.page(at_idx).data.prev_page_idx;
if at_idx == u32::MAX {
steps.push(at);
at = data.page(at).data.prev;
if at == PageIdx(u32::MAX) {
break;
};
}
steps.reverse();
if steps.first() == Some(&from_idx) {
if steps.first() == Some(&from) {
Some(steps)
} else {
None
@ -131,7 +137,7 @@ fn dijkstra(
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
println!(">> Import");
let mut databuf = BufReader::new(File::open(datafile)?);
let data = AdjacencyList::read(&mut databuf)?;
let data = store::read_adjacency_list(&mut databuf)?;
let pages = data.pages.clone();
println!(">> Locate from and to");
@ -146,7 +152,7 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
if let Some(path) = path {
println!("Path found:");
for page_idx in path {
let page = &pages[page_idx as usize];
let page = &pages[page_idx.0 as usize];
if page.data.redirect {
println!(" v {:?}", page.data.title);
} else {

View file

@ -2,19 +2,19 @@ use std::fs::File;
use std::io::{self, BufReader, BufWriter};
use std::path::Path;
use crate::data::AdjacencyList;
use crate::data::store;
pub fn reexport(from: &Path, to: &Path) -> io::Result<()> {
eprintln!(">> Import");
let mut from = BufReader::new(File::open(from)?);
let data = AdjacencyList::read(&mut from)?;
let data = store::read_adjacency_list(&mut from)?;
eprintln!(">> Consistency check");
data.check_consistency();
eprintln!(">> Export");
let mut to = BufWriter::new(File::create(to)?);
data.write(&mut to)?;
store::write_adjacency_list(&data, &mut to)?;
Ok(())
}