From 8016bbfc83a1dde9dba3a6be9e55082d7296b899 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 1 Jan 2025 00:59:03 +0100 Subject: [PATCH] Port and rename pg command --- brood/src/commands.rs | 1 + brood/src/commands/pg.rs | 273 ++++++++++++++++++++++++++ brood/src/commands/philosophy_game.rs | 269 ------------------------- brood/src/main.rs | 2 + 4 files changed, 276 insertions(+), 269 deletions(-) create mode 100644 brood/src/commands/pg.rs delete mode 100644 brood/src/commands/philosophy_game.rs diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 9885f8a..fbb29d7 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -2,5 +2,6 @@ pub mod export; pub mod ingest; pub mod longest_path; pub mod path; +pub mod pg; pub mod show; pub mod stats; diff --git a/brood/src/commands/pg.rs b/brood/src/commands/pg.rs new file mode 100644 index 0000000..a106a3a --- /dev/null +++ b/brood/src/commands/pg.rs @@ -0,0 +1,273 @@ +use std::{ + collections::{BTreeSet, HashMap, HashSet}, + io::{self, BufWriter}, +}; + +use crate::{ + data::Data, + graph::NodeIdx, + util::{self, TitleNormalizer}, +}; + +struct PageMap(Vec); + +impl PageMap { + fn new(len: usize) -> Self { + Self(vec![NodeIdx::NONE; len]) + } + + fn get(&self, node: NodeIdx) -> NodeIdx { + self.0[node.usize()] + } + + fn set(&mut self, node: NodeIdx, to: NodeIdx) { + self.0[node.usize()] = to; + } +} + +fn first_viable_link(data: &Data, node: NodeIdx) -> Option { + for edge in data.graph.edge_slice(node) { + let link = &data.links[edge.usize()]; + if !link.in_parens() && !link.in_structure() { + return Some(*edge); + } + } + None +} + +fn find_forward_edges(data: &Data) -> PageMap { + let mut result = PageMap::new(data.pages.len()); + for node in data.graph.nodes() { + if let Some(first_link) = first_viable_link(data, node) { + result.set(node, first_link); + } + } + result +} + +fn find_clusters(data: &Data, forward: &PageMap) -> PageMap { + let mut cluster = PageMap::new(data.pages.len()); + for 
node in data.graph.nodes() { + let mut current = node; + let mut visited = HashSet::new(); + let canonical = loop { + // We've already determined the canonical element for this page. + if cluster.get(current) != NodeIdx::NONE { + break cluster.get(current); + } + + // We've hit a loop + if visited.contains(¤t) { + let mut loop_members = BTreeSet::new(); + while !loop_members.contains(¤t) { + loop_members.insert(current); + current = forward.get(current); + } + break loop_members.pop_first().unwrap(); + } + + visited.insert(current); + + let next = forward.get(current); + if next == NodeIdx::NONE { + // We've hit a dead-end + break current; + } + + current = next; + }; + + for i in visited { + cluster.set(i, canonical); + } + } + + cluster +} + +enum Cluster { + DeadEnd(NodeIdx), + Loop(Vec), +} + +fn resolve_clusters(forward: &PageMap, cluster: &PageMap) -> HashMap { + let mut result = HashMap::new(); + for canonical in cluster.0.iter().copied().collect::>() { + if forward.get(canonical) == NodeIdx::NONE { + result.insert(canonical, Cluster::DeadEnd(canonical)); + continue; + } + + let mut members = vec![]; + let mut current = canonical; + loop { + members.push(current); + current = forward.get(current); + if current == canonical { + break; + } + } + result.insert(canonical, Cluster::Loop(members)); + } + + result +} + +fn print_forward_edges_as_json(data: &Data, forward: &PageMap) -> io::Result<()> { + let map = forward + .0 + .iter() + .enumerate() + .map(|(node, first_link)| { + let page_title = &data.pages[node].title; + let first_link_title = if *first_link == NodeIdx::NONE { + None + } else { + Some(&data.pages[first_link.usize()].title) + }; + (page_title, first_link_title) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +fn print_trace(normalizer: &TitleNormalizer, data: &Data, forward: &PageMap, start: &str) { + let start_idx = util::resolve_title(normalizer, data, start); + 
+ let mut current = start_idx; + let mut visited = HashSet::new(); + loop { + let page = &data.pages[current.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + + visited.insert(current); + + let next = forward.get(current); + + if next == NodeIdx::NONE { + println!("> dead-end reached"); + return; + } + + if visited.contains(&next) { + let page = &data.pages[next.usize()]; + let title = &page.title; + println!("> loop detected ({title})"); + return; + } + + current = next; + } +} + +fn print_canonical_pages_as_json(data: &Data, cluster: &PageMap) -> io::Result<()> { + let map = cluster + .0 + .iter() + .enumerate() + .map(|(page, canonical)| { + ( + &data.pages[page].title, + &data.pages[canonical.usize()].title, + ) + }) + .collect::>(); + + let writer = BufWriter::new(io::stdout()); + serde_json::to_writer_pretty(writer, &map)?; + Ok(()) +} + +#[derive(Debug, PartialEq, Eq, clap::Parser)] +enum Command { + First, + Trace { start: String }, + Canonical, + Cluster, +} + +/// Show interesting stats. +#[derive(Debug, clap::Parser)] +pub struct Cmd { + #[command(subcommand)] + command: Command, +} + +impl Cmd { + pub fn run(self, data: Data) -> io::Result<()> { + let normalizer = TitleNormalizer::new(); + + eprintln!(">> Forward"); + let forward = find_forward_edges(&data); + + match self.command { + Command::First => { + eprintln!(">> First links"); + print_forward_edges_as_json(&data, &forward)?; + return Ok(()); + } + Command::Trace { start } => { + eprintln!(">> Tracing"); + print_trace(&normalizer, &data, &forward, &start); + return Ok(()); + } + _ => {} + } + + // Determine cluster for each page, represented via canonical page. The + // canonical page of a cluster is either a dead-end or the loop member with + // the smallest index. 
+ eprintln!(">> Find clusters"); + let cluster = find_clusters(&data, &forward); + + if self.command == Command::Canonical { + print_canonical_pages_as_json(&data, &cluster)?; + return Ok(()); + } + + // Measure cluster size + eprintln!(">> Measure clusters"); + let mut cluster_size = HashMap::::new(); + for (i, canonical) in cluster.0.iter().enumerate() { + assert!(*canonical != NodeIdx::NONE, "{}", data.pages[i].title); + *cluster_size.entry(*canonical).or_default() += 1; + } + let mut cluster_by_size = cluster_size.into_iter().collect::>(); + cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); + cluster_by_size.reverse(); + + // Print clusters + assert!(self.command == Command::Cluster); + let resolved = resolve_clusters(&forward, &cluster); + for (canonical, size) in cluster_by_size { + match resolved.get(&canonical).unwrap() { + Cluster::DeadEnd(page) => { + let title = &data.pages[page.usize()].title; + println!("Cluster (dead-end, {size}): {title}"); + } + Cluster::Loop(pages) => { + println!("Cluster ({}-loop, {size}):", pages.len()); + for page in pages { + let page = &data.pages[page.usize()]; + let title = &page.title; + if page.redirect { + println!(" v {title}"); + } else { + println!(" - {title}"); + } + } + } + } + } + + Ok(()) + } +} diff --git a/brood/src/commands/philosophy_game.rs b/brood/src/commands/philosophy_game.rs deleted file mode 100644 index b276bd5..0000000 --- a/brood/src/commands/philosophy_game.rs +++ /dev/null @@ -1,269 +0,0 @@ -use std::{ - collections::{BTreeSet, HashMap, HashSet}, - fs::File, - io::{self, BufReader, BufWriter}, - path::Path, -}; - -use crate::{ - data::{ - adjacency_list::AdjacencyList, - info::{LinkInfo, PageInfo}, - store, - }, - util, PhilosophyGameCmd, -}; - -struct PageMap(Vec); - -impl PageMap { - fn new(len: usize) -> Self { - Self(vec![u32::MAX; len]) - } - - fn get(&self, page_idx: u32) -> u32 { - self.0[page_idx as usize] - } - - fn set(&mut self, page_idx: u32, to: u32) { - self.0[page_idx as usize] = to; 
fn first_viable_link(data: &AdjacencyList, page_idx: u32) -> Option<u32> {
print_forward_edges_as_json( - data: &AdjacencyList, - forward: &PageMap, -) -> io::Result<()> { - let map = forward - .0 - .iter() - .enumerate() - .map(|(page, first_link)| { - let page_title = &data.page(page as u32).data.title; - let first_link_title = if *first_link == u32::MAX { - None - } else { - Some(&data.page(*first_link).data.title) - }; - (page_title, first_link_title) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -fn print_trace(data: &AdjacencyList, forward: &PageMap, start: &str) { - let start_idx = util::resolve_redirects(data, util::find_index_of_title(&data.pages, start)); - - let mut current = start_idx; - let mut visited = HashSet::new(); - loop { - let page = data.page(current); - let title = &page.data.title; - if page.data.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - - visited.insert(current); - - let next = forward.get(current); - - if next == u32::MAX { - println!("> dead-end reached"); - return; - } - - if visited.contains(&next) { - let page = data.page(next); - let title = &page.data.title; - println!("> loop detected ({title})"); - return; - } - - current = next; - } -} - -fn print_canonical_pages_as_json( - data: &AdjacencyList, - cluster: &PageMap, -) -> io::Result<()> { - let map = cluster - .0 - .iter() - .enumerate() - .map(|(page, canonical)| { - ( - &data.page(page as u32).data.title, - &data.page(*canonical).data.title, - ) - }) - .collect::>(); - - let writer = BufWriter::new(io::stdout()); - serde_json::to_writer_pretty(writer, &map)?; - Ok(()) -} - -pub fn run(datafile: &Path, subcmd: PhilosophyGameCmd) -> io::Result<()> { - eprintln!(">> Import"); - let mut databuf = BufReader::new(File::open(datafile)?); - let data = store::read_adjacency_list(&mut databuf)?; - - eprintln!(">> Forward"); - let forward = find_forward_edges(&data); - - match subcmd { - PhilosophyGameCmd::First => { - eprintln!(">> First 
links"); - print_forward_edges_as_json(&data, &forward)?; - return Ok(()); - } - PhilosophyGameCmd::Trace { start } => { - eprintln!(">> Tracing"); - print_trace(&data, &forward, &start); - return Ok(()); - } - _ => {} - } - - // Determine cluster for each page, represented via canonical page. The - // canonical page of a cluster is either a dead-end or the loop member with - // the smallest index. - eprintln!(">> Find clusters"); - let cluster = find_clusters(&data, &forward); - - if subcmd == PhilosophyGameCmd::Canonical { - print_canonical_pages_as_json(&data, &cluster)?; - return Ok(()); - } - - // Measure cluster size - eprintln!(">> Measure clusters"); - let mut cluster_size = HashMap::::new(); - for (i, canonical) in cluster.0.iter().enumerate() { - assert!(*canonical != u32::MAX, "{}", data.page(i as u32).data.title); - *cluster_size.entry(*canonical).or_default() += 1; - } - let mut cluster_by_size = cluster_size.into_iter().collect::>(); - cluster_by_size.sort_by_key(|(c, s)| (*s, *c)); - cluster_by_size.reverse(); - - // Print clusters - assert!(subcmd == PhilosophyGameCmd::Cluster); - let resolved = resolve_clusters(&forward, &cluster); - for (canonical, size) in cluster_by_size { - match resolved.get(&canonical).unwrap() { - Cluster::DeadEnd(page) => { - let title = &data.page(*page).data.title; - println!("Cluster (dead-end, {size}): {title}"); - } - Cluster::Loop(pages) => { - println!("Cluster ({}-loop, {size}):", pages.len()); - for page in pages { - let page = data.page(*page); - let title = &page.data.title; - if page.data.redirect { - println!(" v {title}"); - } else { - println!(" - {title}"); - } - } - } - } - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 66b14df..270aee8 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -17,6 +17,7 @@ enum Command { Stats(commands::stats::Cmd), Path(commands::path::Cmd), LongestPath(commands::longest_path::Cmd), + Pg(commands::pg::Cmd), } #[derive(Debug, Parser)] @@ -77,5 
+78,6 @@ fn main() -> io::Result<()> { Command::Stats(cmd) => cmd.run(data), Command::Path(cmd) => cmd.run(data), Command::LongestPath(cmd) => cmd.run(data), + Command::Pg(cmd) => cmd.run(data), } }