Group pages, links, and graph in Data struct
This commit is contained in:
parent
0168373509
commit
aa4187fcd8
5 changed files with 124 additions and 101 deletions
|
|
@ -9,8 +9,8 @@ use serde::Deserialize;
|
||||||
use thousands::Separable;
|
use thousands::Separable;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
data::{self, Link, Page},
|
data::{Data, Link, Page},
|
||||||
graph::{Graph, NodeIdx},
|
graph::NodeIdx,
|
||||||
util::{Counter, TitleNormalizer},
|
util::{Counter, TitleNormalizer},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -87,11 +87,9 @@ fn read_page_data(
|
||||||
normalizer: &TitleNormalizer,
|
normalizer: &TitleNormalizer,
|
||||||
title_lookup: &HashMap<String, (u32, u32)>,
|
title_lookup: &HashMap<String, (u32, u32)>,
|
||||||
r: &mut BufReader<File>,
|
r: &mut BufReader<File>,
|
||||||
) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
) -> io::Result<Data> {
|
||||||
let mut counter = Counter::new();
|
let mut counter = Counter::new();
|
||||||
let mut pages = vec![];
|
let mut data = Data::new();
|
||||||
let mut links = vec![];
|
|
||||||
let mut graph = Graph::new();
|
|
||||||
|
|
||||||
for (i, line) in r.lines().enumerate() {
|
for (i, line) in r.lines().enumerate() {
|
||||||
counter.tick();
|
counter.tick();
|
||||||
|
|
@ -106,8 +104,8 @@ fn read_page_data(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
graph.add_node();
|
data.graph.add_node();
|
||||||
pages.push(Page {
|
data.pages.push(Page {
|
||||||
id: page.id,
|
id: page.id,
|
||||||
title: page.title,
|
title: page.title,
|
||||||
length: page.length,
|
length: page.length,
|
||||||
|
|
@ -123,14 +121,14 @@ fn read_page_data(
|
||||||
|
|
||||||
for (target, start, len, flags) in page_links {
|
for (target, start, len, flags) in page_links {
|
||||||
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
|
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
|
||||||
graph.edges.push(NodeIdx(*brood_i));
|
data.graph.edges.push(NodeIdx(*brood_i));
|
||||||
links.push(Link { start, len, flags });
|
data.links.push(Link { start, len, flags });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counter.done();
|
counter.done();
|
||||||
Ok((pages, links, graph))
|
Ok(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert sift data to brood data.
|
/// Convert sift data to brood data.
|
||||||
|
|
@ -141,7 +139,7 @@ pub struct Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Cmd {
|
impl Cmd {
|
||||||
pub fn run(self, data: &Path) -> io::Result<()> {
|
pub fn run(self, brood_data: &Path) -> io::Result<()> {
|
||||||
let normalizer = TitleNormalizer::new();
|
let normalizer = TitleNormalizer::new();
|
||||||
|
|
||||||
println!(">> First pass");
|
println!(">> First pass");
|
||||||
|
|
@ -158,18 +156,24 @@ impl Cmd {
|
||||||
sift_data.seek(io::SeekFrom::Start(0))?;
|
sift_data.seek(io::SeekFrom::Start(0))?;
|
||||||
|
|
||||||
println!("> Reading page data");
|
println!("> Reading page data");
|
||||||
let (pages, links, graph) = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
||||||
assert_eq!(pages.len(), title_lookup.len());
|
assert_eq!(data.pages.len(), title_lookup.len());
|
||||||
drop(title_lookup); // Don't hoard memory
|
drop(title_lookup); // Don't hoard memory
|
||||||
drop(sift_data); // No longer needed
|
drop(sift_data); // No longer needed
|
||||||
|
|
||||||
println!("> Checking consistency");
|
println!("> Checking consistency");
|
||||||
graph.check_consistency();
|
data.graph.check_consistency();
|
||||||
|
|
||||||
println!(">> Export");
|
println!(">> Export");
|
||||||
println!("Pages: {:>13}", pages.len().separate_with_underscores());
|
println!(
|
||||||
println!("Links: {:>13}", links.len().separate_with_underscores());
|
"Pages: {:>13}",
|
||||||
data::write_to_file(data, &pages, &links, &graph)?;
|
data.pages.len().separate_with_underscores()
|
||||||
|
);
|
||||||
|
println!(
|
||||||
|
"Links: {:>13}",
|
||||||
|
data.links.len().separate_with_underscores()
|
||||||
|
);
|
||||||
|
data.write_to_file(brood_data)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@ use std::{io, path::Path};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
algo::Dijkstra,
|
algo::Dijkstra,
|
||||||
data,
|
data::Data,
|
||||||
util::{self, TitleNormalizer},
|
util::{self, TitleNormalizer},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -18,22 +18,22 @@ impl Cmd {
|
||||||
let normalizer = TitleNormalizer::new();
|
let normalizer = TitleNormalizer::new();
|
||||||
|
|
||||||
println!(">> Import");
|
println!(">> Import");
|
||||||
let (pages, _links, graph) = data::read_from_file(data)?;
|
let data = Data::read_from_file(data)?;
|
||||||
|
|
||||||
println!(">> Resolve articles");
|
println!(">> Resolve articles");
|
||||||
let start = util::resolve_title(&normalizer, &pages, &graph, &self.start);
|
let start = util::resolve_title(&normalizer, &data, &self.start);
|
||||||
let goal = util::resolve_title(&normalizer, &pages, &graph, &self.goal);
|
let goal = util::resolve_title(&normalizer, &data, &self.goal);
|
||||||
println!("Start: {}", pages[start.usize()].title);
|
println!("Start: {}", data.pages[start.usize()].title);
|
||||||
println!("Goal: {}", pages[goal.usize()].title);
|
println!("Goal: {}", data.pages[goal.usize()].title);
|
||||||
|
|
||||||
println!(">> Find path");
|
println!(">> Find path");
|
||||||
println!("> Preparing dijkstra");
|
println!("> Preparing dijkstra");
|
||||||
let mut dijkstra = Dijkstra::new(&graph);
|
let mut dijkstra = Dijkstra::new(&data.graph);
|
||||||
println!("> Running dijkstra");
|
println!("> Running dijkstra");
|
||||||
dijkstra.run(
|
dijkstra.run(
|
||||||
start,
|
start,
|
||||||
|node| node == goal,
|
|node| node == goal,
|
||||||
|source, _edge, _target| !pages[source.usize()].redirect as u32,
|
|source, _edge, _target| !data.pages[source.usize()].redirect as u32,
|
||||||
);
|
);
|
||||||
|
|
||||||
if dijkstra.cost(goal) == u32::MAX {
|
if dijkstra.cost(goal) == u32::MAX {
|
||||||
|
|
@ -48,7 +48,7 @@ impl Cmd {
|
||||||
println!();
|
println!();
|
||||||
println!("Path found (cost {cost}, length {}):", path.len());
|
println!("Path found (cost {cost}, length {}):", path.len());
|
||||||
for page in path {
|
for page in path {
|
||||||
let info = &pages[page.usize()];
|
let info = &data.pages[page.usize()];
|
||||||
if info.redirect {
|
if info.redirect {
|
||||||
println!("v {:?}", info.title);
|
println!("v {:?}", info.title);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@ -109,69 +109,88 @@ fn read_link(r: &mut impl Read) -> io::Result<Link> {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write(w: &mut impl Write, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
|
#[derive(Default)]
|
||||||
assert!(pages.len() < u32::MAX as usize);
|
pub struct Data {
|
||||||
assert!(links.len() < u32::MAX as usize);
|
pub pages: Vec<Page>,
|
||||||
assert_eq!(pages.len(), graph.nodes.len());
|
pub links: Vec<Link>,
|
||||||
assert_eq!(links.len(), graph.edges.len());
|
pub graph: Graph,
|
||||||
write_u32(w, pages.len() as u32)?;
|
|
||||||
write_u32(w, links.len() as u32)?;
|
|
||||||
|
|
||||||
for page in pages {
|
|
||||||
write_page(w, page)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
for link in links {
|
|
||||||
write_link(w, link)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
for node in &graph.nodes {
|
|
||||||
write_u32(w, node.0)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
for edge in &graph.edges {
|
|
||||||
write_u32(w, edge.0)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read(r: &mut impl Read) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
impl Data {
|
||||||
let n_pages = read_u32(r)?;
|
pub fn new() -> Self {
|
||||||
let n_links = read_u32(r)?;
|
Self::default()
|
||||||
|
|
||||||
let mut pages = Vec::with_capacity(n_pages as usize);
|
|
||||||
let mut links = Vec::with_capacity(n_links as usize);
|
|
||||||
let mut graph = Graph::with_capacity(n_pages as usize, n_links as usize);
|
|
||||||
|
|
||||||
for _ in 0..n_pages {
|
|
||||||
pages.push(read_page(r)?);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _ in 0..n_links {
|
pub fn with_capacity(pages: usize, links: usize) -> Self {
|
||||||
links.push(read_link(r)?);
|
Self {
|
||||||
|
pages: Vec::with_capacity(pages),
|
||||||
|
links: Vec::with_capacity(links),
|
||||||
|
graph: Graph::with_capacity(pages, links),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _ in 0..n_pages {
|
fn write(&self, w: &mut impl Write) -> io::Result<()> {
|
||||||
graph.nodes.push(EdgeIdx(read_u32(r)?));
|
assert!(self.pages.len() < u32::MAX as usize);
|
||||||
|
assert!(self.links.len() < u32::MAX as usize);
|
||||||
|
assert_eq!(self.pages.len(), self.graph.nodes.len());
|
||||||
|
assert_eq!(self.links.len(), self.graph.edges.len());
|
||||||
|
write_u32(w, self.pages.len() as u32)?;
|
||||||
|
write_u32(w, self.links.len() as u32)?;
|
||||||
|
|
||||||
|
for page in &self.pages {
|
||||||
|
write_page(w, page)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for link in &self.links {
|
||||||
|
write_link(w, link)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for node in &self.graph.nodes {
|
||||||
|
write_u32(w, node.0)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for edge in &self.graph.edges {
|
||||||
|
write_u32(w, edge.0)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
for _ in 0..n_links {
|
fn read(r: &mut impl Read) -> io::Result<Self> {
|
||||||
graph.edges.push(NodeIdx(read_u32(r)?));
|
let n_pages = read_u32(r)?;
|
||||||
|
let n_links = read_u32(r)?;
|
||||||
|
|
||||||
|
let mut result = Self::with_capacity(n_pages as usize, n_links as usize);
|
||||||
|
|
||||||
|
for _ in 0..n_pages {
|
||||||
|
result.pages.push(read_page(r)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_links {
|
||||||
|
result.links.push(read_link(r)?);
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_pages {
|
||||||
|
result.graph.nodes.push(EdgeIdx(read_u32(r)?));
|
||||||
|
}
|
||||||
|
|
||||||
|
for _ in 0..n_links {
|
||||||
|
result.graph.edges.push(NodeIdx(read_u32(r)?));
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(result.pages.len(), result.graph.nodes.len());
|
||||||
|
assert_eq!(result.links.len(), result.graph.edges.len());
|
||||||
|
result.graph.check_consistency();
|
||||||
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(pages.len(), graph.nodes.len());
|
pub fn write_to_file(&self, path: &Path) -> io::Result<()> {
|
||||||
assert_eq!(links.len(), graph.edges.len());
|
let mut file = BufWriter::new(File::create(path)?);
|
||||||
graph.check_consistency();
|
self.write(&mut file)
|
||||||
Ok((pages, links, graph))
|
}
|
||||||
}
|
|
||||||
|
pub fn read_from_file(path: &Path) -> io::Result<Self> {
|
||||||
pub fn write_to_file(path: &Path, pages: &[Page], links: &[Link], graph: &Graph) -> io::Result<()> {
|
let mut file = BufReader::new(File::open(path)?);
|
||||||
let mut file = BufWriter::new(File::create(path)?);
|
Self::read(&mut file)
|
||||||
write(&mut file, pages, links, graph)
|
}
|
||||||
}
|
|
||||||
|
|
||||||
pub fn read_from_file(path: &Path) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
|
|
||||||
let mut file = BufReader::new(File::open(path)?);
|
|
||||||
read(&mut file)
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
|
use std::ops::{Add, AddAssign, Range, Sub, SubAssign};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct NodeIdx(pub u32);
|
pub struct NodeIdx(pub u32);
|
||||||
|
|
||||||
impl NodeIdx {
|
impl NodeIdx {
|
||||||
|
|
@ -85,7 +85,7 @@ impl SubAssign<u32> for NodeIdx {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct EdgeIdx(pub u32);
|
pub struct EdgeIdx(pub u32);
|
||||||
|
|
||||||
impl EdgeIdx {
|
impl EdgeIdx {
|
||||||
|
|
@ -242,6 +242,11 @@ impl Graph {
|
||||||
Edges::new(self)
|
Edges::new(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn edges_for(&self, node: NodeIdx) -> impl Iterator<Item = (EdgeIdx, NodeIdx)> + '_ {
|
||||||
|
self.edge_range(node)
|
||||||
|
.map(|i| (EdgeIdx::new(i), self.edges[i]))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
|
pub fn edge_start(&self, node: NodeIdx) -> EdgeIdx {
|
||||||
self.nodes
|
self.nodes
|
||||||
.get(node.usize())
|
.get(node.usize())
|
||||||
|
|
@ -255,7 +260,7 @@ impl Graph {
|
||||||
start.usize()..end.usize()
|
start.usize()..end.usize()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn edges_for(&self, node: NodeIdx) -> &[NodeIdx] {
|
pub fn edge_slice(&self, node: NodeIdx) -> &[NodeIdx] {
|
||||||
&self.edges[self.edge_range(node)]
|
&self.edges[self.edge_range(node)]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -283,15 +288,15 @@ impl Iterator for Edges<'_> {
|
||||||
if self.ei.usize() >= self.graph.edges.len() {
|
if self.ei.usize() >= self.graph.edges.len() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let to = self.graph.edges[self.ei.usize()];
|
let target = self.graph.edges[self.ei.usize()];
|
||||||
|
|
||||||
// if would not be sufficient because some nodes may not have any edges.
|
// if would not be sufficient because some nodes may not have any edges.
|
||||||
while self.ei >= self.graph.edge_start(self.ni + 1) {
|
while self.ei >= self.graph.edge_start(self.ni + 1) {
|
||||||
self.ni += 1;
|
self.ni += 1;
|
||||||
}
|
}
|
||||||
let from = self.ni;
|
let source = self.ni;
|
||||||
|
|
||||||
self.ei += 1;
|
self.ei += 1;
|
||||||
Some((from, to))
|
Some((source, target))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ use regex::Regex;
|
||||||
use thousands::Separable;
|
use thousands::Separable;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
data::Page,
|
data::{Data, Page},
|
||||||
graph::{Graph, NodeIdx},
|
graph::{Graph, NodeIdx},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -150,9 +150,9 @@ impl TitleNormalizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -> NodeIdx {
|
pub fn locate_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
|
||||||
let normalized = normalizer.normalize(title);
|
let normalized = normalizer.normalize(title);
|
||||||
pages
|
data.pages
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.find(|(_, p)| normalizer.normalize(&p.title) == normalized)
|
.find(|(_, p)| normalizer.normalize(&p.title) == normalized)
|
||||||
|
|
@ -160,10 +160,10 @@ pub fn locate_title(normalizer: &TitleNormalizer, pages: &[Page], title: &str) -
|
||||||
.expect("invalid title")
|
.expect("invalid title")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> NodeIdx {
|
pub fn resolve_redirects(data: &Data, mut page: NodeIdx) -> NodeIdx {
|
||||||
loop {
|
loop {
|
||||||
if pages[page.usize()].redirect {
|
if data.pages[page.usize()].redirect {
|
||||||
if let Some(target) = graph.edges_for(page).first() {
|
if let Some(target) = data.graph.edge_slice(page).first() {
|
||||||
page = *target;
|
page = *target;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -173,11 +173,6 @@ pub fn resolve_redirects(pages: &[Page], graph: &Graph, mut page: NodeIdx) -> No
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn resolve_title(
|
pub fn resolve_title(normalizer: &TitleNormalizer, data: &Data, title: &str) -> NodeIdx {
|
||||||
normalizer: &TitleNormalizer,
|
resolve_redirects(data, locate_title(normalizer, data, title))
|
||||||
pages: &[Page],
|
|
||||||
graph: &Graph,
|
|
||||||
title: &str,
|
|
||||||
) -> NodeIdx {
|
|
||||||
resolve_redirects(pages, graph, locate_title(normalizer, pages, title))
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue