From d9fd29c1c3b9e93b7ce755926633366893972778 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 1 Jan 2025 15:59:39 +0100 Subject: [PATCH] Add progress bars to ingest command --- brood/Cargo.lock | 155 +++++++++++++++++++++++++++++++++++ brood/Cargo.toml | 1 + brood/src/commands/ingest.rs | 60 +++++++++----- 3 files changed, 195 insertions(+), 21 deletions(-) diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 414bb49..180ca5c 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -65,12 +65,25 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", + "indicatif", "regex", "serde", "serde_json", "thousands", ] +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "clap" version = "4.5.23" @@ -117,12 +130,44 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "indicatif" +version = "0.17.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -135,12 +180,52 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + [[package]] name = "proc-macro2" version = "1.0.92" @@ -255,12 +340,82 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wasm-bindgen" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "windows-sys" version = "0.59.0" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 0dd4156..99890b6 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] clap = { version = "4.5.23", features = ["derive", "deprecated"] } +indicatif = "0.17.9" regex = "1.11.1" serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.134" diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs index 2036062..74f5663 100644 --- a/brood/src/commands/ingest.rs +++ b/brood/src/commands/ingest.rs @@ -5,15 +5,30 @@ use std::{ path::{Path, PathBuf}, }; +use indicatif::{ProgressBar, ProgressStyle}; use serde::Deserialize; use thousands::Separable; use crate::{ data::{Data, Link, Page}, graph::NodeIdx, - util::{Counter, TitleNormalizer}, + util::TitleNormalizer, }; +const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ "; + +fn seek_to_start(f: &mut BufReader) -> io::Result { + let size = f.seek(io::SeekFrom::End(0))?; + f.seek(io::SeekFrom::Start(0))?; + Ok(size) +} + +fn file_progress_style() -> ProgressStyle { + ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}") + .unwrap() + .progress_chars(PROGRESS_CHARS) +} + #[derive(Deserialize)] struct JsonPage { id: u32, @@ -23,17 +38,17 @@ struct JsonPage { redirect: Option, } -fn read_titles(r: &mut BufReader) -> io::Result> { - let mut counter = Counter::new(); +fn read_titles(f: &mut BufReader) -> io::Result> { + let size = seek_to_start(f)?; + let bar = ProgressBar::new(size).with_style(file_progress_style()); + let mut titles = vec![]; - for line in r.lines() { - counter.tick(); + for line in bar.wrap_read(f).lines() { let page = serde_json::from_str::(&line?).unwrap(); titles.push(page.title); } - counter.done(); Ok(titles) } @@ -49,12 +64,12 @@ fn compute_title_lookup( normalizer: &TitleNormalizer, titles: &[String], ) -> HashMap { - let mut counter = Counter::new(); let mut title_lookup = HashMap::::new(); - for (sift_i, title) in titles.iter().enumerate() { - counter.tick(); + let bar = ProgressBar::new(titles.len() as u64) + .with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS)); + for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() { // The index where this article will appear in the final list, assuming // it is not a duplicate. For ownership reasons, we compute this here // instead of inside the Entry::Vacant branch of the following match. @@ -68,31 +83,33 @@ fn compute_title_lookup( let prev_sift_i = entry.get().0; let prev = &titles[prev_sift_i as usize]; if prev == title { - println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}"); + bar.println(format!( + " {title:?} ({prev_sift_i}) occurs again at {sift_i}" + )); } else { - println!( - " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}", + bar.println(format!( + " {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}", normalizer.normalize(title) - ); + )); } } } } - counter.done(); title_lookup } fn read_page_data( normalizer: &TitleNormalizer, title_lookup: &HashMap, - r: &mut BufReader, + f: &mut BufReader, ) -> io::Result { - let mut counter = Counter::new(); + let size = seek_to_start(f)?; + let bar = ProgressBar::new(size).with_style(file_progress_style()); + let mut data = Data::new(); - for (i, line) in r.lines().enumerate() { - counter.tick(); + for (i, line) in bar.wrap_read(f).lines().enumerate() { let page = serde_json::from_str::(&line?).unwrap(); let normalized = normalizer.normalize(&page.title); @@ -100,7 +117,10 @@ fn read_page_data( if i as u32 != sift_i { // Articles may occur multiple times, and this is not the instance // of the article we should keep. - println!(" Skipping {:?} ({i}) in favor of {sift_i}", page.title); + bar.println(format!( + " Skipping {:?} ({i}) in favor of {sift_i}", + page.title + )); continue; } @@ -127,7 +147,6 @@ fn read_page_data( } } - counter.done(); Ok(data) } @@ -153,7 +172,6 @@ impl Cmd { drop(titles); // Don't hoard memory println!(">> Second pass"); - sift_data.seek(io::SeekFrom::Start(0))?; println!("> Reading page data"); let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;