Allow transforming graph before commands

This commit is contained in:
Joscha 2024-12-31 15:30:11 +01:00
parent ab7b7295ca
commit c573f1b0b0
8 changed files with 134 additions and 17 deletions

View file

@ -1,3 +1,4 @@
mod dijkstra; mod dijkstra;
mod edit;
pub use self::dijkstra::*; pub use self::{dijkstra::*, edit::*};

74
brood/src/algo/edit.rs Normal file
View file

@ -0,0 +1,74 @@
use std::mem;
use crate::{
data::{Data, Link},
graph::NodeIdx,
util,
};
/// Rebuilds the graph and link list of `data` in place, keeping only those
/// edges whose associated [`Link`] satisfies the predicate `f`.
///
/// Every node of the old graph is re-added in iteration order; only edges and
/// their parallel link entries are dropped. Links are consumed in lockstep
/// with the old graph's edges, one link per edge.
///
/// # Panics
///
/// Panics if `data.links` is exhausted before every edge has been visited.
pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) {
    // Move the old contents out so `data` can be refilled from scratch.
    let old_graph = mem::take(&mut data.graph);
    let mut old_links = mem::take(&mut data.links).into_iter();

    for node in old_graph.nodes() {
        data.graph.add_node();

        for &target in old_graph.edge_slice(node) {
            // The link must be pulled for every edge, kept or not, so that the
            // link iterator stays aligned with the edge list.
            let link = old_links.next().unwrap();
            if !f(&link) {
                continue;
            }
            data.links.push(link);
            data.graph.add_edge(target);
        }
    }
}
/// Drops every redirect page from `data` and rewrites the remaining edges so
/// that they point at the resolved target of their original destination.
///
/// Pages flagged as `redirect` are removed together with all of their
/// outgoing edges and links. The surviving pages are renumbered consecutively
/// from zero in their original order. An edge whose resolved destination is
/// itself a removed page is discarded along with its link.
///
/// # Panics
///
/// Panics if `data.pages` or `data.links` run out before the graph's nodes
/// and edges have all been visited.
pub fn resolve_redirects(data: &mut Data) {
    // Old node -> old node as reported by `util::resolve_redirects`
    // (presumably the end of the redirect chain; see that helper).
    let mut redirect_of = vec![NodeIdx::NONE; data.pages.len()];
    for node in data.graph.nodes() {
        redirect_of[node.usize()] = util::resolve_redirects(data, node);
    }

    // Old node -> new node index once redirect pages are removed.
    // Entries for removed pages stay `NodeIdx::NONE`.
    let mut compacted = vec![NodeIdx::NONE; data.pages.len()];
    let mut next_idx = NodeIdx(0);
    for (slot, page) in compacted.iter_mut().zip(data.pages.iter()) {
        if page.redirect {
            continue;
        }
        *slot = next_idx;
        next_idx += 1;
    }

    // Move the old contents out so `data` can be refilled from scratch.
    let old_graph = mem::take(&mut data.graph);
    let mut old_pages = mem::take(&mut data.pages).into_iter();
    let mut old_links = mem::take(&mut data.links).into_iter();

    for node in old_graph.nodes() {
        let page = old_pages.next().unwrap();
        let out_edges = old_graph.edge_slice(node);

        if compacted[node.usize()] == NodeIdx::NONE {
            // Removed page: its links still have to be consumed so the link
            // iterator stays aligned with the edges of the following nodes.
            for _ in out_edges {
                old_links.next().unwrap();
            }
            continue;
        }

        data.pages.push(page);
        data.graph.add_node();

        for edge in out_edges {
            let link = old_links.next().unwrap();
            let target = compacted[redirect_of[edge.usize()].usize()];
            if target != NodeIdx::NONE {
                data.links.push(link);
                data.graph.add_edge(target);
            }
        }
    }
}

View file

@ -121,7 +121,7 @@ fn read_page_data(
for (target, start, len, flags) in page_links { for (target, start, len, flags) in page_links {
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) { if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
data.graph.edges.push(NodeIdx(*brood_i)); data.graph.add_edge(NodeIdx(*brood_i));
data.links.push(Link { start, len, flags }); data.links.push(Link { start, len, flags });
} }
} }
@ -139,7 +139,7 @@ pub struct Cmd {
} }
impl Cmd { impl Cmd {
pub fn run(self, brood_data: &Path) -> io::Result<()> { pub fn run(&self, brood_data: &Path) -> io::Result<()> {
let normalizer = TitleNormalizer::new(); let normalizer = TitleNormalizer::new();
println!(">> First pass"); println!(">> First pass");
@ -162,7 +162,7 @@ impl Cmd {
drop(sift_data); // No longer needed drop(sift_data); // No longer needed
println!("> Checking consistency"); println!("> Checking consistency");
data.graph.check_consistency(); data.check_consistency();
println!(">> Export"); println!(">> Export");
println!( println!(

View file

@ -1,4 +1,4 @@
use std::{io, path::Path}; use std::io;
use crate::{ use crate::{
algo::Dijkstra, algo::Dijkstra,
@ -14,12 +14,9 @@ pub struct Cmd {
} }
impl Cmd { impl Cmd {
pub fn run(self, data: &Path) -> io::Result<()> { pub fn run(self, data: Data) -> io::Result<()> {
let normalizer = TitleNormalizer::new(); let normalizer = TitleNormalizer::new();
println!(">> Import");
let data = Data::read_from_file(data)?;
println!(">> Resolve articles"); println!(">> Resolve articles");
let start = util::resolve_title(&normalizer, &data, &self.start); let start = util::resolve_title(&normalizer, &data, &self.start);
let goal = util::resolve_title(&normalizer, &data, &self.goal); let goal = util::resolve_title(&normalizer, &data, &self.goal);

View file

@ -1,4 +1,4 @@
use std::{collections::HashSet, io, path::Path}; use std::{collections::HashSet, io};
use thousands::Separable; use thousands::Separable;
@ -18,12 +18,9 @@ pub struct Cmd {
} }
impl Cmd { impl Cmd {
pub fn run(self, data: &Path) -> io::Result<()> { pub fn run(self, data: Data) -> io::Result<()> {
let normalizer = TitleNormalizer::new(); let normalizer = TitleNormalizer::new();
println!(">> Import");
let data = Data::read_from_file(data)?;
println!(">> Locate article"); println!(">> Locate article");
let mut node = util::locate_title(&normalizer, &data, &self.title); let mut node = util::locate_title(&normalizer, &data, &self.title);

View file

@ -192,6 +192,22 @@ impl Data {
Self::read(&mut file) Self::read(&mut file)
} }
/// Asserts that the page and link lists line up with the graph, then runs the
/// graph's own consistency check.
///
/// # Panics
///
/// Panics if the number of pages differs from the number of graph nodes, if
/// the number of links differs from the number of graph edges, or if
/// `Graph::check_consistency` panics.
pub fn check_consistency(&self) {
    let page_count = self.pages.len();
    let node_count = self.graph.nodes.len();
    assert_eq!(page_count, node_count, "inconsistent number of pages");

    let link_count = self.links.len();
    let edge_count = self.graph.edges.len();
    assert_eq!(link_count, edge_count, "inconsistent number of links");

    self.graph.check_consistency();
}
pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> { pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
if !self.pages[node.usize()].redirect { if !self.pages[node.usize()].redirect {
return None; return None;

View file

@ -194,6 +194,10 @@ impl Graph {
self.nodes.push(EdgeIdx::new(self.edges.len())); self.nodes.push(EdgeIdx::new(self.edges.len()));
} }
/// Appends `target` to the end of the graph's edge list.
///
/// NOTE(review): `add_node` records the current edge count when a node is
/// created, so an edge added here presumably belongs to the most recently
/// added node. Confirm against how `edge_slice` derives a node's edges.
pub fn add_edge(&mut self, target: NodeIdx) {
self.edges.push(target);
}
pub fn check_consistency(&self) { pub fn check_consistency(&self) {
if self.nodes.is_empty() { if self.nodes.is_empty() {
assert!(self.edges.is_empty(), "edges must belong to existing nodes"); assert!(self.edges.is_empty(), "edges must belong to existing nodes");

View file

@ -7,6 +7,7 @@ mod util;
use std::{io, path::PathBuf}; use std::{io, path::PathBuf};
use clap::Parser; use clap::Parser;
use data::Data;
#[derive(Debug, Parser)] #[derive(Debug, Parser)]
enum Command { enum Command {
@ -20,13 +21,40 @@ struct Args {
datafile: PathBuf, datafile: PathBuf,
#[command(subcommand)] #[command(subcommand)]
command: Command, command: Command,
#[arg(long, short = 'P')]
in_parens: Option<bool>,
#[arg(long, short = 'S')]
in_structure: Option<bool>,
#[arg(long, short = 'R')]
resolve_redirects: bool,
} }
fn main() -> io::Result<()> { fn main() -> io::Result<()> {
let args = Args::parse(); let args = Args::parse();
if let Command::Ingest(cmd) = &args.command {
return cmd.run(&args.datafile);
}
println!(">> Import");
let mut data = Data::read_from_file(&args.datafile)?;
if args.in_parens.is_some() || args.in_structure.is_some() {
println!("> Filtering edges");
algo::retain_edges(&mut data, |link| {
args.in_parens.is_none_or(|b| b == link.in_parens())
&& args.in_structure.is_none_or(|b| b == link.in_structure())
});
}
if args.resolve_redirects {
println!("> Resolving redirects");
algo::resolve_redirects(&mut data);
}
match args.command { match args.command {
Command::Ingest(cmd) => cmd.run(&args.datafile), Command::Ingest(_) => unreachable!(),
Command::Show(cmd) => cmd.run(&args.datafile), Command::Show(cmd) => cmd.run(data),
Command::Path(cmd) => cmd.run(&args.datafile), Command::Path(cmd) => cmd.run(data),
} }
} }