From c573f1b0b020ef824d1be45df8e1f3881da784c5 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Tue, 31 Dec 2024 15:30:11 +0100
Subject: [PATCH] Allow transforming graph before commands

---
 brood/src/algo.rs            |  3 +-
 brood/src/algo/edit.rs       | 74 ++++++++++++++++++++++++++++++++++++
 brood/src/commands/ingest.rs |  6 +--
 brood/src/commands/path.rs   |  7 +---
 brood/src/commands/show.rs   |  7 +---
 brood/src/data.rs            | 16 ++++++++
 brood/src/graph.rs           |  4 ++
 brood/src/main.rs            | 34 +++++++++++++++--
 8 files changed, 134 insertions(+), 17 deletions(-)
 create mode 100644 brood/src/algo/edit.rs

diff --git a/brood/src/algo.rs b/brood/src/algo.rs
index ffc1aa5..ac1919f 100644
--- a/brood/src/algo.rs
+++ b/brood/src/algo.rs
@@ -1,3 +1,4 @@
 mod dijkstra;
+mod edit;
 
-pub use self::dijkstra::*;
+pub use self::{dijkstra::*, edit::*};
diff --git a/brood/src/algo/edit.rs b/brood/src/algo/edit.rs
new file mode 100644
index 0000000..2b44298
--- /dev/null
+++ b/brood/src/algo/edit.rs
@@ -0,0 +1,74 @@
+use std::mem;
+
+use crate::{
+    data::{Data, Link},
+    graph::NodeIdx,
+    util,
+};
+
+pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) {
+    let mut links = mem::take(&mut data.links).into_iter();
+    let graph = mem::take(&mut data.graph);
+
+    for node in graph.nodes() {
+        data.graph.add_node();
+
+        for edge in graph.edge_slice(node) {
+            let link = links.next().unwrap();
+            if f(&link) {
+                data.links.push(link);
+                data.graph.add_edge(*edge);
+            }
+        }
+    }
+}
+
+pub fn resolve_redirects(data: &mut Data) {
+    // Permutation from input node to input node
+    let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()];
+    for node in data.graph.nodes() {
+        perm_redirect[node.usize()] = util::resolve_redirects(data, node);
+    }
+
+    // Permutation from input node to final node
+    let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()];
+    let mut perm_retain_count = NodeIdx(0);
+    for (i, page) in data.pages.iter().enumerate() {
+        if !page.redirect {
+            perm_retain[i] = perm_retain_count;
+            perm_retain_count += 1;
+        }
+    }
+
+    let mut pages = mem::take(&mut data.pages).into_iter();
+    let mut links = mem::take(&mut data.links).into_iter();
+    let graph = mem::take(&mut data.graph);
+
+    for node in graph.nodes() {
+        let page = pages.next().unwrap();
+        let new_node = perm_retain[node.usize()];
+
+        if new_node == NodeIdx::NONE {
+            // Skip all edges
+            for _ in graph.edge_slice(node) {
+                links.next().unwrap();
+            }
+            continue;
+        }
+
+        data.pages.push(page);
+        data.graph.add_node();
+
+        for edge in graph.edge_slice(node) {
+            let link = links.next().unwrap();
+            let new_edge = perm_retain[perm_redirect[edge.usize()].usize()];
+
+            if new_edge == NodeIdx::NONE {
+                continue;
+            }
+
+            data.links.push(link);
+            data.graph.add_edge(new_edge);
+        }
+    }
+}
diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs
index 42fc13a..2036062 100644
--- a/brood/src/commands/ingest.rs
+++ b/brood/src/commands/ingest.rs
@@ -121,7 +121,7 @@ fn read_page_data(
 
         for (target, start, len, flags) in page_links {
             if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
-                data.graph.edges.push(NodeIdx(*brood_i));
+                data.graph.add_edge(NodeIdx(*brood_i));
                 data.links.push(Link { start, len, flags });
             }
         }
@@ -139,7 +139,7 @@ pub struct Cmd {
 }
 
 impl Cmd {
-    pub fn run(self, brood_data: &Path) -> io::Result<()> {
+    pub fn run(&self, brood_data: &Path) -> io::Result<()> {
         let normalizer = TitleNormalizer::new();
 
         println!(">> First pass");
@@ -162,7 +162,7 @@ impl Cmd {
         drop(sift_data); // No longer needed
 
         println!("> Checking consistency");
-        data.graph.check_consistency();
+        data.check_consistency();
 
         println!(">> Export");
         println!(
diff --git a/brood/src/commands/path.rs b/brood/src/commands/path.rs
index ebea2a6..d21ba95 100644
--- a/brood/src/commands/path.rs
+++ b/brood/src/commands/path.rs
@@ -1,4 +1,4 @@
-use std::{io, path::Path};
+use std::io;
 
 use crate::{
     algo::Dijkstra,
@@ -14,12 +14,9 @@ pub struct Cmd {
 }
 
 impl Cmd {
-    pub fn run(self, data: &Path) -> io::Result<()> {
+    pub fn run(self, data: Data) -> io::Result<()> {
         let normalizer = TitleNormalizer::new();
 
-        println!(">> Import");
-        let data = Data::read_from_file(data)?;
-
         println!(">> Resolve articles");
         let start = util::resolve_title(&normalizer, &data, &self.start);
         let goal = util::resolve_title(&normalizer, &data, &self.goal);
diff --git a/brood/src/commands/show.rs b/brood/src/commands/show.rs
index 894d11d..0c67388 100644
--- a/brood/src/commands/show.rs
+++ b/brood/src/commands/show.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashSet, io, path::Path};
+use std::{collections::HashSet, io};
 
 use thousands::Separable;
 
@@ -18,12 +18,9 @@ pub struct Cmd {
 }
 
 impl Cmd {
-    pub fn run(self, data: &Path) -> io::Result<()> {
+    pub fn run(self, data: Data) -> io::Result<()> {
         let normalizer = TitleNormalizer::new();
 
-        println!(">> Import");
-        let data = Data::read_from_file(data)?;
-
         println!(">> Locate article");
         let mut node = util::locate_title(&normalizer, &data, &self.title);
 
diff --git a/brood/src/data.rs b/brood/src/data.rs
index 091354b..c253094 100644
--- a/brood/src/data.rs
+++ b/brood/src/data.rs
@@ -192,6 +192,22 @@ impl Data {
         Self::read(&mut file)
     }
 
+    pub fn check_consistency(&self) {
+        assert_eq!(
+            self.pages.len(),
+            self.graph.nodes.len(),
+            "inconsistent number of pages"
+        );
+
+        assert_eq!(
+            self.links.len(),
+            self.graph.edges.len(),
+            "inconsistent number of links"
+        );
+
+        self.graph.check_consistency();
+    }
+
     pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
         if !self.pages[node.usize()].redirect {
             return None;
diff --git a/brood/src/graph.rs b/brood/src/graph.rs
index 95c53e1..620b81a 100644
--- a/brood/src/graph.rs
+++ b/brood/src/graph.rs
@@ -194,6 +194,10 @@ impl Graph {
         self.nodes.push(EdgeIdx::new(self.edges.len()));
     }
 
+    pub fn add_edge(&mut self, target: NodeIdx) {
+        self.edges.push(target);
+    }
+
     pub fn check_consistency(&self) {
         if self.nodes.is_empty() {
             assert!(self.edges.is_empty(), "edges must belong to existing nodes");
diff --git a/brood/src/main.rs b/brood/src/main.rs
index db547ce..9f1af1e 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -7,6 +7,7 @@ mod util;
 use std::{io, path::PathBuf};
 
 use clap::Parser;
+use data::Data;
 
 #[derive(Debug, Parser)]
 enum Command {
@@ -20,13 +21,40 @@ struct Args {
     datafile: PathBuf,
     #[command(subcommand)]
     command: Command,
+    #[arg(long, short = 'P')]
+    in_parens: Option<bool>,
+    #[arg(long, short = 'S')]
+    in_structure: Option<bool>,
+    #[arg(long, short = 'R')]
+    resolve_redirects: bool,
 }
 
 fn main() -> io::Result<()> {
     let args = Args::parse();
+
+    if let Command::Ingest(cmd) = &args.command {
+        return cmd.run(&args.datafile);
+    }
+
+    println!(">> Import");
+    let mut data = Data::read_from_file(&args.datafile)?;
+
+    if args.in_parens.is_some() || args.in_structure.is_some() {
+        println!("> Filtering edges");
+        algo::retain_edges(&mut data, |link| {
+            args.in_parens.is_none_or(|b| b == link.in_parens())
+                && args.in_structure.is_none_or(|b| b == link.in_structure())
+        });
+    }
+
+    if args.resolve_redirects {
+        println!("> Resolving redirects");
+        algo::resolve_redirects(&mut data);
+    }
+
     match args.command {
-        Command::Ingest(cmd) => cmd.run(&args.datafile),
-        Command::Show(cmd) => cmd.run(&args.datafile),
-        Command::Path(cmd) => cmd.run(&args.datafile),
+        Command::Ingest(_) => unreachable!(),
+        Command::Show(cmd) => cmd.run(data),
+        Command::Path(cmd) => cmd.run(data),
     }
 }