Allow transforming graph before commands
This commit is contained in:
parent
ab7b7295ca
commit
c573f1b0b0
8 changed files with 134 additions and 17 deletions
|
|
@ -1,3 +1,4 @@
|
||||||
mod dijkstra;
|
mod dijkstra;
|
||||||
|
mod edit;
|
||||||
|
|
||||||
pub use self::dijkstra::*;
|
pub use self::{dijkstra::*, edit::*};
|
||||||
|
|
|
||||||
74
brood/src/algo/edit.rs
Normal file
74
brood/src/algo/edit.rs
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
use std::mem;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
data::{Data, Link},
|
||||||
|
graph::NodeIdx,
|
||||||
|
util,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn retain_edges(data: &mut Data, f: impl Fn(&Link) -> bool) {
|
||||||
|
let mut links = mem::take(&mut data.links).into_iter();
|
||||||
|
let graph = mem::take(&mut data.graph);
|
||||||
|
|
||||||
|
for node in graph.nodes() {
|
||||||
|
data.graph.add_node();
|
||||||
|
|
||||||
|
for edge in graph.edge_slice(node) {
|
||||||
|
let link = links.next().unwrap();
|
||||||
|
if f(&link) {
|
||||||
|
data.links.push(link);
|
||||||
|
data.graph.add_edge(*edge);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn resolve_redirects(data: &mut Data) {
|
||||||
|
// Permutation from input node to input node
|
||||||
|
let mut perm_redirect = vec![NodeIdx::NONE; data.pages.len()];
|
||||||
|
for node in data.graph.nodes() {
|
||||||
|
perm_redirect[node.usize()] = util::resolve_redirects(data, node);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Permutation from input node to final node
|
||||||
|
let mut perm_retain = vec![NodeIdx::NONE; data.pages.len()];
|
||||||
|
let mut perm_retain_count = NodeIdx(0);
|
||||||
|
for (i, page) in data.pages.iter().enumerate() {
|
||||||
|
if !page.redirect {
|
||||||
|
perm_retain[i] = perm_retain_count;
|
||||||
|
perm_retain_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut pages = mem::take(&mut data.pages).into_iter();
|
||||||
|
let mut links = mem::take(&mut data.links).into_iter();
|
||||||
|
let graph = mem::take(&mut data.graph);
|
||||||
|
|
||||||
|
for node in graph.nodes() {
|
||||||
|
let page = pages.next().unwrap();
|
||||||
|
let new_node = perm_retain[node.usize()];
|
||||||
|
|
||||||
|
if new_node == NodeIdx::NONE {
|
||||||
|
// Skip all edges
|
||||||
|
for _ in graph.edge_slice(node) {
|
||||||
|
links.next().unwrap();
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
data.pages.push(page);
|
||||||
|
data.graph.add_node();
|
||||||
|
|
||||||
|
for edge in graph.edge_slice(node) {
|
||||||
|
let link = links.next().unwrap();
|
||||||
|
let new_edge = perm_retain[perm_redirect[edge.usize()].usize()];
|
||||||
|
|
||||||
|
if new_edge == NodeIdx::NONE {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
data.links.push(link);
|
||||||
|
data.graph.add_edge(new_edge);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -121,7 +121,7 @@ fn read_page_data(
|
||||||
|
|
||||||
for (target, start, len, flags) in page_links {
|
for (target, start, len, flags) in page_links {
|
||||||
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
|
if let Some((_, brood_i)) = title_lookup.get(&normalizer.normalize(&target)) {
|
||||||
data.graph.edges.push(NodeIdx(*brood_i));
|
data.graph.add_edge(NodeIdx(*brood_i));
|
||||||
data.links.push(Link { start, len, flags });
|
data.links.push(Link { start, len, flags });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -139,7 +139,7 @@ pub struct Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Cmd {
|
impl Cmd {
|
||||||
pub fn run(self, brood_data: &Path) -> io::Result<()> {
|
pub fn run(&self, brood_data: &Path) -> io::Result<()> {
|
||||||
let normalizer = TitleNormalizer::new();
|
let normalizer = TitleNormalizer::new();
|
||||||
|
|
||||||
println!(">> First pass");
|
println!(">> First pass");
|
||||||
|
|
@ -162,7 +162,7 @@ impl Cmd {
|
||||||
drop(sift_data); // No longer needed
|
drop(sift_data); // No longer needed
|
||||||
|
|
||||||
println!("> Checking consistency");
|
println!("> Checking consistency");
|
||||||
data.graph.check_consistency();
|
data.check_consistency();
|
||||||
|
|
||||||
println!(">> Export");
|
println!(">> Export");
|
||||||
println!(
|
println!(
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
use std::{io, path::Path};
|
use std::io;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
algo::Dijkstra,
|
algo::Dijkstra,
|
||||||
|
|
@ -14,12 +14,9 @@ pub struct Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Cmd {
|
impl Cmd {
|
||||||
pub fn run(self, data: &Path) -> io::Result<()> {
|
pub fn run(self, data: Data) -> io::Result<()> {
|
||||||
let normalizer = TitleNormalizer::new();
|
let normalizer = TitleNormalizer::new();
|
||||||
|
|
||||||
println!(">> Import");
|
|
||||||
let data = Data::read_from_file(data)?;
|
|
||||||
|
|
||||||
println!(">> Resolve articles");
|
println!(">> Resolve articles");
|
||||||
let start = util::resolve_title(&normalizer, &data, &self.start);
|
let start = util::resolve_title(&normalizer, &data, &self.start);
|
||||||
let goal = util::resolve_title(&normalizer, &data, &self.goal);
|
let goal = util::resolve_title(&normalizer, &data, &self.goal);
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
use std::{collections::HashSet, io, path::Path};
|
use std::{collections::HashSet, io};
|
||||||
|
|
||||||
use thousands::Separable;
|
use thousands::Separable;
|
||||||
|
|
||||||
|
|
@ -18,12 +18,9 @@ pub struct Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Cmd {
|
impl Cmd {
|
||||||
pub fn run(self, data: &Path) -> io::Result<()> {
|
pub fn run(self, data: Data) -> io::Result<()> {
|
||||||
let normalizer = TitleNormalizer::new();
|
let normalizer = TitleNormalizer::new();
|
||||||
|
|
||||||
println!(">> Import");
|
|
||||||
let data = Data::read_from_file(data)?;
|
|
||||||
|
|
||||||
println!(">> Locate article");
|
println!(">> Locate article");
|
||||||
let mut node = util::locate_title(&normalizer, &data, &self.title);
|
let mut node = util::locate_title(&normalizer, &data, &self.title);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -192,6 +192,22 @@ impl Data {
|
||||||
Self::read(&mut file)
|
Self::read(&mut file)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn check_consistency(&self) {
|
||||||
|
assert_eq!(
|
||||||
|
self.pages.len(),
|
||||||
|
self.graph.nodes.len(),
|
||||||
|
"inconsistent number of pages"
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
self.links.len(),
|
||||||
|
self.graph.edges.len(),
|
||||||
|
"inconsistent number of links"
|
||||||
|
);
|
||||||
|
|
||||||
|
self.graph.check_consistency();
|
||||||
|
}
|
||||||
|
|
||||||
pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
|
pub fn redirect_target(&self, node: NodeIdx) -> Option<NodeIdx> {
|
||||||
if !self.pages[node.usize()].redirect {
|
if !self.pages[node.usize()].redirect {
|
||||||
return None;
|
return None;
|
||||||
|
|
|
||||||
|
|
@ -194,6 +194,10 @@ impl Graph {
|
||||||
self.nodes.push(EdgeIdx::new(self.edges.len()));
|
self.nodes.push(EdgeIdx::new(self.edges.len()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Appends an edge pointing at `target` to the end of the edge list.
///
/// The edge is not attached to an explicit source node. NOTE(review): since
/// `add_node` records the current edge count as the new node's start offset,
/// the edge presumably belongs to the most recently added node; confirm
/// against how `edge_slice` derives a node's edges.
pub fn add_edge(&mut self, target: NodeIdx) {
    self.edges.push(target);
}
|
||||||
|
|
||||||
pub fn check_consistency(&self) {
|
pub fn check_consistency(&self) {
|
||||||
if self.nodes.is_empty() {
|
if self.nodes.is_empty() {
|
||||||
assert!(self.edges.is_empty(), "edges must belong to existing nodes");
|
assert!(self.edges.is_empty(), "edges must belong to existing nodes");
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ mod util;
|
||||||
use std::{io, path::PathBuf};
|
use std::{io, path::PathBuf};
|
||||||
|
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
use data::Data;
|
||||||
|
|
||||||
#[derive(Debug, Parser)]
|
#[derive(Debug, Parser)]
|
||||||
enum Command {
|
enum Command {
|
||||||
|
|
@ -20,13 +21,40 @@ struct Args {
|
||||||
datafile: PathBuf,
|
datafile: PathBuf,
|
||||||
#[command(subcommand)]
|
#[command(subcommand)]
|
||||||
command: Command,
|
command: Command,
|
||||||
|
#[arg(long, short = 'P')]
|
||||||
|
in_parens: Option<bool>,
|
||||||
|
#[arg(long, short = 'S')]
|
||||||
|
in_structure: Option<bool>,
|
||||||
|
#[arg(long, short = 'R')]
|
||||||
|
resolve_redirects: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> io::Result<()> {
|
fn main() -> io::Result<()> {
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
|
|
||||||
|
if let Command::Ingest(cmd) = &args.command {
|
||||||
|
return cmd.run(&args.datafile);
|
||||||
|
}
|
||||||
|
|
||||||
|
println!(">> Import");
|
||||||
|
let mut data = Data::read_from_file(&args.datafile)?;
|
||||||
|
|
||||||
|
if args.in_parens.is_some() || args.in_structure.is_some() {
|
||||||
|
println!("> Filtering edges");
|
||||||
|
algo::retain_edges(&mut data, |link| {
|
||||||
|
args.in_parens.is_none_or(|b| b == link.in_parens())
|
||||||
|
&& args.in_structure.is_none_or(|b| b == link.in_structure())
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.resolve_redirects {
|
||||||
|
println!("> Resolving redirects");
|
||||||
|
algo::resolve_redirects(&mut data);
|
||||||
|
}
|
||||||
|
|
||||||
match args.command {
|
match args.command {
|
||||||
Command::Ingest(cmd) => cmd.run(&args.datafile),
|
Command::Ingest(_) => unreachable!(),
|
||||||
Command::Show(cmd) => cmd.run(&args.datafile),
|
Command::Show(cmd) => cmd.run(data),
|
||||||
Command::Path(cmd) => cmd.run(&args.datafile),
|
Command::Path(cmd) => cmd.run(data),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue