diff --git a/brood/src/commands.rs b/brood/src/commands.rs
index cc694c1..f58324f 100644
--- a/brood/src/commands.rs
+++ b/brood/src/commands.rs
@@ -1,4 +1,5 @@
 pub mod export;
 pub mod ingest;
 pub mod path;
+pub mod redirects;
 pub mod show;
diff --git a/brood/src/commands/redirects.rs b/brood/src/commands/redirects.rs
new file mode 100644
index 0000000..aeab362
--- /dev/null
+++ b/brood/src/commands/redirects.rs
@@ -0,0 +1,122 @@
+use std::{cmp::Reverse, collections::HashSet, io};
+
+use thousands::Separable;
+
+use crate::{data::Data, graph::NodeIdx, util};
+
+/// Resolves every redirect page in `data` to the end of its redirect chain.
+///
+/// Returns one `(start, end, steps)` tuple per page flagged as a redirect, where
+/// `end` is the last node reached and `steps` is the number of hops followed.
+/// If a chain revisits a node, a notice is printed and the walk stops at the
+/// last node before the repeat.
+fn find_redirects(data: &Data) -> Vec<(NodeIdx, NodeIdx, usize)> {
+    let mut redirects = Vec::<(NodeIdx, NodeIdx, usize)>::new();
+
+    for node in data.graph.nodes() {
+        if !data.pages[node.usize()].redirect {
+            continue;
+        }
+
+        let mut seen = HashSet::new();
+
+        let mut curr = node;
+        seen.insert(node);
+
+        while let Some(next) = data.redirect_target(curr) {
+            if seen.contains(&next) {
+                println!(" Redirect loop: {}", data.pages[node.usize()].title);
+                break;
+            }
+
+            curr = next;
+            seen.insert(next);
+        }
+
+        // `seen` holds the start node plus one entry per hop.
+        redirects.push((node, curr, seen.len() - 1));
+    }
+
+    redirects
+}
+
+/// Lists the nodes visited when following redirects from `start`.
+///
+/// The result begins with `start` itself and stops before any node would be
+/// visited a second time.
+fn follow_redirect(data: &Data, start: NodeIdx) -> Vec<NodeIdx> {
+    let mut seen = HashSet::new();
+    let mut nodes = Vec::new();
+
+    let mut curr = start;
+    seen.insert(curr);
+    nodes.push(curr);
+
+    while let Some(next) = data.redirect_target(curr) {
+        if seen.contains(&next) {
+            break;
+        }
+
+        curr = next;
+        seen.insert(curr);
+        nodes.push(curr);
+    }
+
+    nodes
+}
+
+/// Show interesting redirect stats.
+#[derive(Debug, clap::Parser)]
+pub struct Cmd {
+    // When set, every chain longer than one step is printed in full.
+    #[arg(long, short)]
+    long: bool,
+}
+
+impl Cmd {
+    /// Prints redirect statistics for `data` to stdout.
+    pub fn run(self, data: Data) -> io::Result<()> {
+        println!(">> Resolving redirects");
+        let redirects = find_redirects(&data);
+
+        println!();
+        println!(
+            "There is a total of {} redirects.",
+            redirects.len().separate_with_underscores()
+        );
+
+        let mut long = redirects
+            .iter()
+            .filter(|(_, _, l)| *l > 1)
+            .collect::<Vec<_>>();
+        long.sort_by_key(|(_, _, l)| Reverse(l));
+
+        println!(
+            "{} redirects take more than one step to reach an article.",
+            long.len().separate_with_underscores()
+        );
+
+        // Take the maximum over all redirects, not just `long`, so that data
+        // containing only single-step redirects reports 1 rather than 0.
+        println!(
+            "The longest redirect chain takes {} steps.",
+            redirects.iter().map(|&(_, _, l)| l).max().unwrap_or(0),
+        );
+
+        println!("Though these redirect chains are usually swiftly fixed by bots.");
+
+        if self.long {
+            println!();
+            println!("Redirect chains with length > 1:");
+
+            for (start, _, _) in long {
+                println!();
+                for step in follow_redirect(&data, *start) {
+                    println!("{}", util::fmt_page(&data.pages[step.usize()]));
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/brood/src/main.rs b/brood/src/main.rs
index 757695b..a84ee1b 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -15,6 +15,7 @@ enum Command {
     Export(commands::export::Cmd),
     Show(commands::show::Cmd),
     Path(commands::path::Cmd),
+    Redirects(commands::redirects::Cmd),
 }
 
 #[derive(Debug, Parser)]
@@ -73,5 +74,6 @@ fn main() -> io::Result<()> {
         Command::Export(cmd) => cmd.run(data),
         Command::Show(cmd) => cmd.run(data),
         Command::Path(cmd) => cmd.run(data),
+        Command::Redirects(cmd) => cmd.run(data),
     }
 }