Add progress bars to ingest command

This commit is contained in:
Joscha 2025-01-01 15:59:39 +01:00
parent 8016bbfc83
commit d9fd29c1c3
3 changed files with 195 additions and 21 deletions

155
brood/Cargo.lock generated
View file

@ -65,12 +65,25 @@ name = "brood"
version = "0.0.0" version = "0.0.0"
dependencies = [ dependencies = [
"clap", "clap",
"indicatif",
"regex", "regex",
"serde", "serde",
"serde_json", "serde_json",
"thousands", "thousands",
] ]
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.23" version = "4.5.23"
@ -117,12 +130,44 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "console"
version = "0.15.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
dependencies = [
"encode_unicode",
"libc",
"once_cell",
"unicode-width",
"windows-sys",
]
[[package]]
name = "encode_unicode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.5.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "indicatif"
version = "0.17.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
dependencies = [
"console",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
[[package]] [[package]]
name = "is_terminal_polyfill" name = "is_terminal_polyfill"
version = "1.70.1" version = "1.70.1"
@ -135,12 +180,52 @@ version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "js-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.4" version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "portable-atomic"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.92" version = "1.0.92"
@ -255,12 +340,82 @@ version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "unicode-width"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.2" version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "wasm-bindgen"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.59.0" version = "0.59.0"

View file

@ -5,6 +5,7 @@ edition = "2021"
[dependencies] [dependencies]
clap = { version = "4.5.23", features = ["derive", "deprecated"] } clap = { version = "4.5.23", features = ["derive", "deprecated"] }
indicatif = "0.17.9"
regex = "1.11.1" regex = "1.11.1"
serde = { version = "1.0.217", features = ["derive"] } serde = { version = "1.0.217", features = ["derive"] }
serde_json = "1.0.134" serde_json = "1.0.134"

View file

@ -5,15 +5,30 @@ use std::{
path::{Path, PathBuf}, path::{Path, PathBuf},
}; };
use indicatif::{ProgressBar, ProgressStyle};
use serde::Deserialize; use serde::Deserialize;
use thousands::Separable; use thousands::Separable;
use crate::{ use crate::{
data::{Data, Link, Page}, data::{Data, Link, Page},
graph::NodeIdx, graph::NodeIdx,
util::{Counter, TitleNormalizer}, util::TitleNormalizer,
}; };
/// Glyph set passed to `ProgressStyle::progress_chars` by the progress bar
/// styles below: a full block, a run of progressively narrower partial
/// blocks, and finally a space.
const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ ";
/// Measures the underlying file and leaves the reader positioned at offset 0.
///
/// The length is obtained by seeking to the end of the stream, so the
/// reader's position before the call does not matter.
///
/// # Errors
///
/// Propagates the I/O error if either seek fails.
fn seek_to_start(f: &mut BufReader<File>) -> io::Result<u64> {
    // Seeking to the end yields the end offset, i.e. the total byte length.
    let total_len = f.seek(io::SeekFrom::End(0))?;
    // Go back to the beginning; the offset returned by this seek is replaced
    // by the length measured above.
    f.seek(io::SeekFrom::Start(0)).map(|_| total_len)
}
/// Builds the style used for progress bars that track reading a file.
///
/// The template combines indicatif's `wide_bar`, `bytes` and `total_bytes`
/// keys, and the bar is drawn with the `PROGRESS_CHARS` glyphs.
///
/// # Panics
///
/// Panics if the template fails to parse. The template is a fixed literal, so
/// that would indicate a programming error rather than a runtime condition.
fn file_progress_style() -> ProgressStyle {
    let template = "{wide_bar} {bytes}/{total_bytes}";
    let base_style = ProgressStyle::with_template(template).unwrap();
    base_style.progress_chars(PROGRESS_CHARS)
}
#[derive(Deserialize)] #[derive(Deserialize)]
struct JsonPage { struct JsonPage {
id: u32, id: u32,
@ -23,17 +38,17 @@ struct JsonPage {
redirect: Option<String>, redirect: Option<String>,
} }
fn read_titles(r: &mut BufReader<File>) -> io::Result<Vec<String>> { fn read_titles(f: &mut BufReader<File>) -> io::Result<Vec<String>> {
let mut counter = Counter::new(); let size = seek_to_start(f)?;
let bar = ProgressBar::new(size).with_style(file_progress_style());
let mut titles = vec![]; let mut titles = vec![];
for line in r.lines() { for line in bar.wrap_read(f).lines() {
counter.tick();
let page = serde_json::from_str::<JsonPage>(&line?).unwrap(); let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
titles.push(page.title); titles.push(page.title);
} }
counter.done();
Ok(titles) Ok(titles)
} }
@ -49,12 +64,12 @@ fn compute_title_lookup(
normalizer: &TitleNormalizer, normalizer: &TitleNormalizer,
titles: &[String], titles: &[String],
) -> HashMap<String, (u32, u32)> { ) -> HashMap<String, (u32, u32)> {
let mut counter = Counter::new();
let mut title_lookup = HashMap::<String, (u32, u32)>::new(); let mut title_lookup = HashMap::<String, (u32, u32)>::new();
for (sift_i, title) in titles.iter().enumerate() { let bar = ProgressBar::new(titles.len() as u64)
counter.tick(); .with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS));
for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() {
// The index where this article will appear in the final list, assuming // The index where this article will appear in the final list, assuming
// it is not a duplicate. For ownership reasons, we compute this here // it is not a duplicate. For ownership reasons, we compute this here
// instead of inside the Entry::Vacant branch of the following match. // instead of inside the Entry::Vacant branch of the following match.
@ -68,31 +83,33 @@ fn compute_title_lookup(
let prev_sift_i = entry.get().0; let prev_sift_i = entry.get().0;
let prev = &titles[prev_sift_i as usize]; let prev = &titles[prev_sift_i as usize];
if prev == title { if prev == title {
println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}"); bar.println(format!(
" {title:?} ({prev_sift_i}) occurs again at {sift_i}"
));
} else { } else {
println!( bar.println(format!(
" {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}", " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}",
normalizer.normalize(title) normalizer.normalize(title)
); ));
} }
} }
} }
} }
counter.done();
title_lookup title_lookup
} }
fn read_page_data( fn read_page_data(
normalizer: &TitleNormalizer, normalizer: &TitleNormalizer,
title_lookup: &HashMap<String, (u32, u32)>, title_lookup: &HashMap<String, (u32, u32)>,
r: &mut BufReader<File>, f: &mut BufReader<File>,
) -> io::Result<Data> { ) -> io::Result<Data> {
let mut counter = Counter::new(); let size = seek_to_start(f)?;
let bar = ProgressBar::new(size).with_style(file_progress_style());
let mut data = Data::new(); let mut data = Data::new();
for (i, line) in r.lines().enumerate() { for (i, line) in bar.wrap_read(f).lines().enumerate() {
counter.tick();
let page = serde_json::from_str::<JsonPage>(&line?).unwrap(); let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
let normalized = normalizer.normalize(&page.title); let normalized = normalizer.normalize(&page.title);
@ -100,7 +117,10 @@ fn read_page_data(
if i as u32 != sift_i { if i as u32 != sift_i {
// Articles may occur multiple times, and this is not the instance // Articles may occur multiple times, and this is not the instance
// of the article we should keep. // of the article we should keep.
println!(" Skipping {:?} ({i}) in favor of {sift_i}", page.title); bar.println(format!(
" Skipping {:?} ({i}) in favor of {sift_i}",
page.title
));
continue; continue;
} }
@ -127,7 +147,6 @@ fn read_page_data(
} }
} }
counter.done();
Ok(data) Ok(data)
} }
@ -153,7 +172,6 @@ impl Cmd {
drop(titles); // Don't hoard memory drop(titles); // Don't hoard memory
println!(">> Second pass"); println!(">> Second pass");
sift_data.seek(io::SeekFrom::Start(0))?;
println!("> Reading page data"); println!("> Reading page data");
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?; let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;