Compare commits
1 commit
master
...
old-filter
| Author | SHA1 | Date | |
|---|---|---|---|
| d85b61d419 |
3 changed files with 68 additions and 7 deletions
|
|
@ -1,15 +1,59 @@
|
|||
use std::fs::File;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::data::adjacency_list::AdjacencyList;
|
||||
use crate::data::info::{LinkInfo, PageInfo};
|
||||
use crate::data::store;
|
||||
use crate::util;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct FilterFile {
|
||||
title: String,
|
||||
language: String,
|
||||
}
|
||||
|
||||
fn filter_pages(
|
||||
data: &AdjacencyList<PageInfo, LinkInfo>,
|
||||
keep: HashSet<String>,
|
||||
) -> AdjacencyList<PageInfo, LinkInfo> {
|
||||
// Map from old to new indices. Only contains entries for pages to keep.
|
||||
let mut index_map = HashMap::new();
|
||||
for (page_idx, page) in data.pages() {
|
||||
if keep.contains(&util::normalize_link(&page.data.title)) {
|
||||
index_map.insert(page_idx, index_map.len() as u32);
|
||||
}
|
||||
}
|
||||
|
||||
// Create new adjacency list in a single pass
|
||||
let mut result = AdjacencyList::default();
|
||||
for (page_idx, page) in data.pages() {
|
||||
let Some(new_idx) = index_map.get(&page_idx) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
let actual_new_idx = result.push_page(page.data.clone());
|
||||
assert!(*new_idx == actual_new_idx);
|
||||
|
||||
for (_, link) in data.links(page_idx) {
|
||||
if let Some(to) = index_map.get(&link.to) {
|
||||
result.push_link(*to, link.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub fn reexport(
|
||||
from: &Path,
|
||||
to: &Path,
|
||||
in_parens: Option<bool>,
|
||||
in_structure: Option<bool>,
|
||||
filter: Option<PathBuf>,
|
||||
) -> io::Result<()> {
|
||||
eprintln!(">> Import");
|
||||
let mut from = BufReader::new(File::open(from)?);
|
||||
|
|
@ -18,7 +62,7 @@ pub fn reexport(
|
|||
eprintln!(">> Consistency check");
|
||||
data.check_consistency();
|
||||
|
||||
if in_parens.is_some() || in_structure.is_some() {
|
||||
if in_parens.is_some() || in_structure.is_some() || filter.is_some() {
|
||||
eprintln!(">> Filtering");
|
||||
|
||||
let mut data2 = AdjacencyList::default();
|
||||
|
|
@ -38,6 +82,18 @@ pub fn reexport(
|
|||
}
|
||||
|
||||
data = data2;
|
||||
|
||||
if let Some(filter) = filter {
|
||||
let filter = fs::read_to_string(filter)?;
|
||||
let filter = serde_json::from_str::<Vec<FilterFile>>(&filter).unwrap();
|
||||
let keep = filter
|
||||
.into_iter()
|
||||
.filter(|f| f.language == "en")
|
||||
.map(|f| f.title)
|
||||
.map(|t| util::normalize_link(&t))
|
||||
.collect::<HashSet<_>>();
|
||||
data = filter_pages(&data, keep);
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!(">> Export");
|
||||
|
|
|
|||
|
|
@ -49,15 +49,17 @@ impl<P, L> Default for AdjacencyList<P, L> {
|
|||
}
|
||||
|
||||
impl<P, L> AdjacencyList<P, L> {
|
||||
pub fn push_page(&mut self, data: P) {
|
||||
pub fn push_page(&mut self, data: P) -> u32 {
|
||||
self.pages.push(Page {
|
||||
start: self.links.len() as u32,
|
||||
data,
|
||||
});
|
||||
self.pages.len() as u32 - 1
|
||||
}
|
||||
|
||||
pub fn push_link(&mut self, to: u32, data: L) {
|
||||
self.links.push(Link { to, data })
|
||||
pub fn push_link(&mut self, to: u32, data: L) -> u32 {
|
||||
self.links.push(Link { to, data });
|
||||
self.links.len() as u32 - 1
|
||||
}
|
||||
|
||||
pub fn page(&self, page_idx: u32) -> &Page<P> {
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ enum Command {
|
|||
in_parens: Option<bool>,
|
||||
#[arg(long, short = 'S')]
|
||||
in_structure: Option<bool>,
|
||||
#[arg(long, short = 'F')]
|
||||
filter: Option<PathBuf>,
|
||||
},
|
||||
/// Find a path from one article to another.
|
||||
Path {
|
||||
|
|
@ -61,7 +63,8 @@ fn main() -> io::Result<()> {
|
|||
to,
|
||||
in_parens,
|
||||
in_structure,
|
||||
} => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure),
|
||||
filter,
|
||||
} => commands::reexport::reexport(&args.datafile, &to, in_parens, in_structure, filter),
|
||||
Command::Path { from, to, flip } => {
|
||||
if flip {
|
||||
commands::path::path(&args.datafile, &to, &from)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue