diff --git a/brood/src/commands/reexport.rs b/brood/src/commands/reexport.rs
index 1125fb0..4c17d82 100644
--- a/brood/src/commands/reexport.rs
+++ b/brood/src/commands/reexport.rs
@@ -1,15 +1,59 @@
-use std::fs::File;
+use std::collections::{HashMap, HashSet};
+use std::fs::{self, File};
 use std::io::{self, BufReader, BufWriter};
-use std::path::Path;
+use std::path::{Path, PathBuf};
+
+use serde::Deserialize;
 
 use crate::data::adjacency_list::AdjacencyList;
+use crate::data::info::{LinkInfo, PageInfo};
 use crate::data::store;
+use crate::util;
+
+#[derive(Deserialize)]
+struct FilterFile {
+    title: String,
+    language: String,
+}
+
+fn filter_pages(
+    data: &AdjacencyList<PageInfo, LinkInfo>,
+    keep: HashSet<String>,
+) -> AdjacencyList<PageInfo, LinkInfo> {
+    // Map from old to new indices. Only contains entries for pages to keep.
+    let mut index_map = HashMap::new();
+    for (page_idx, page) in data.pages() {
+        if keep.contains(&util::normalize_link(&page.data.title)) {
+            index_map.insert(page_idx, index_map.len() as u32);
+        }
+    }
+
+    // Create new adjacency list in a single pass
+    let mut result = AdjacencyList::default();
+    for (page_idx, page) in data.pages() {
+        let Some(new_idx) = index_map.get(&page_idx) else {
+            continue;
+        };
+
+        let actual_new_idx = result.push_page(page.data.clone());
+        assert!(*new_idx == actual_new_idx);
+
+        for (_, link) in data.links(page_idx) {
+            if let Some(to) = index_map.get(&link.to) {
+                result.push_link(*to, link.data);
+            }
+        }
+    }
+
+    result
+}
 
 pub fn reexport(
     from: &Path,
     to: &Path,
     in_parens: Option<String>,
     in_structure: Option<String>,
+    filter: Option<PathBuf>,
 ) -> io::Result<()> {
     eprintln!(">> Import");
     let mut from = BufReader::new(File::open(from)?);
@@ -18,7 +62,7 @@ pub fn reexport(
     eprintln!(">> Consistency check");
     data.check_consistency();
 
-    if in_parens.is_some() || in_structure.is_some() {
+    if in_parens.is_some() || in_structure.is_some() || filter.is_some() {
         eprintln!(">> Filtering");
 
         let mut data2 = AdjacencyList::default();
@@ -38,6 +82,18 @@ pub fn reexport(
         }
 
         data = data2;
+
+        if let Some(filter) = filter {
+            let filter = fs::read_to_string(filter)?;
+            let filter = serde_json::from_str::<Vec<FilterFile>>(&filter).unwrap();
+            let keep = filter
+                .into_iter()
+                .filter(|f| f.language == "en")
+                .map(|f| f.title)
+                .map(|t| util::normalize_link(&t))
+                .collect::<HashSet<_>>();
+            data = filter_pages(&data, keep);
+        }
     }
 
     eprintln!(">> Export");
diff --git a/brood/src/data/adjacency_list.rs b/brood/src/data/adjacency_list.rs
index 04a1124..4c07b3d 100644
--- a/brood/src/data/adjacency_list.rs
+++ b/brood/src/data/adjacency_list.rs
@@ -49,15 +49,17 @@ impl<P, L> Default for AdjacencyList<P, L> {
 }
 
 impl<P, L> AdjacencyList<P, L> {
-    pub fn push_page(&mut self, data: P) {
+    pub fn push_page(&mut self, data: P) -> u32 {
         self.pages.push(Page {
             start: self.links.len() as u32,
             data,
         });
+        self.pages.len() as u32 - 1
     }
 
-    pub fn push_link(&mut self, to: u32, data: L) {
-        self.links.push(Link { to, data })
+    pub fn push_link(&mut self, to: u32, data: L) -> u32 {
+        self.links.push(Link { to, data });
+        self.links.len() as u32 - 1
     }
 
     pub fn page(&self, page_idx: u32) -> &Page<P> {
diff --git a/brood/src/main.rs b/brood/src/main.rs
index e4b4074..d31076c 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -26,6 +26,8 @@ enum Command {
         in_parens: Option<String>,
         #[arg(long, short = 'S')]
         in_structure: Option<String>,
+        #[arg(long, short = 'F')]
+        filter: Option<PathBuf>,
     },
     /// Find a path from one article to another.
     Path {
@@ -61,7 +63,8 @@ fn main() -> io::Result<()> {
             to,
             in_parens,
             in_structure,
-        } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure),
+            filter,
+        } => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure, filter),
         Command::Path { from, to, flip } => {
             if flip {
                 commands::path::path(&args.datafile, &to, &from)