Add progress bars to ingest command
This commit is contained in:
parent
8016bbfc83
commit
d9fd29c1c3
3 changed files with 195 additions and 21 deletions
155
brood/Cargo.lock
generated
155
brood/Cargo.lock
generated
|
|
@ -65,12 +65,25 @@ name = "brood"
|
||||||
version = "0.0.0"
|
version = "0.0.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
|
"indicatif",
|
||||||
"regex",
|
"regex",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"thousands",
|
"thousands",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bumpalo"
|
||||||
|
version = "3.16.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfg-if"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "4.5.23"
|
version = "4.5.23"
|
||||||
|
|
@ -117,12 +130,44 @@ version = "1.0.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "console"
|
||||||
|
version = "0.15.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
|
||||||
|
dependencies = [
|
||||||
|
"encode_unicode",
|
||||||
|
"libc",
|
||||||
|
"once_cell",
|
||||||
|
"unicode-width",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encode_unicode"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "indicatif"
|
||||||
|
version = "0.17.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
|
||||||
|
dependencies = [
|
||||||
|
"console",
|
||||||
|
"number_prefix",
|
||||||
|
"portable-atomic",
|
||||||
|
"unicode-width",
|
||||||
|
"web-time",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "is_terminal_polyfill"
|
name = "is_terminal_polyfill"
|
||||||
version = "1.70.1"
|
version = "1.70.1"
|
||||||
|
|
@ -135,12 +180,52 @@ version = "1.0.14"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "js-sys"
|
||||||
|
version = "0.3.76"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
|
||||||
|
dependencies = [
|
||||||
|
"once_cell",
|
||||||
|
"wasm-bindgen",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libc"
|
||||||
|
version = "0.2.169"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "log"
|
||||||
|
version = "0.4.22"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.7.4"
|
version = "2.7.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "number_prefix"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "once_cell"
|
||||||
|
version = "1.20.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "portable-atomic"
|
||||||
|
version = "1.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.92"
|
version = "1.0.92"
|
||||||
|
|
@ -255,12 +340,82 @@ version = "1.0.14"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-width"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8parse"
|
name = "utf8parse"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen"
|
||||||
|
version = "0.2.99"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"once_cell",
|
||||||
|
"wasm-bindgen-macro",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-backend"
|
||||||
|
version = "0.2.99"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
|
||||||
|
dependencies = [
|
||||||
|
"bumpalo",
|
||||||
|
"log",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"wasm-bindgen-shared",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-macro"
|
||||||
|
version = "0.2.99"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
|
||||||
|
dependencies = [
|
||||||
|
"quote",
|
||||||
|
"wasm-bindgen-macro-support",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-macro-support"
|
||||||
|
version = "0.2.99"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
"wasm-bindgen-backend",
|
||||||
|
"wasm-bindgen-shared",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasm-bindgen-shared"
|
||||||
|
version = "0.2.99"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "web-time"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||||
|
dependencies = [
|
||||||
|
"js-sys",
|
||||||
|
"wasm-bindgen",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.59.0"
|
version = "0.59.0"
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
|
clap = { version = "4.5.23", features = ["derive", "deprecated"] }
|
||||||
|
indicatif = "0.17.9"
|
||||||
regex = "1.11.1"
|
regex = "1.11.1"
|
||||||
serde = { version = "1.0.217", features = ["derive"] }
|
serde = { version = "1.0.217", features = ["derive"] }
|
||||||
serde_json = "1.0.134"
|
serde_json = "1.0.134"
|
||||||
|
|
|
||||||
|
|
@ -5,15 +5,30 @@ use std::{
|
||||||
path::{Path, PathBuf},
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use thousands::Separable;
|
use thousands::Separable;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
data::{Data, Link, Page},
|
data::{Data, Link, Page},
|
||||||
graph::NodeIdx,
|
graph::NodeIdx,
|
||||||
util::{Counter, TitleNormalizer},
|
util::TitleNormalizer,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const PROGRESS_CHARS: &str = "█▉▊▋▌▍▎▏ ";
|
||||||
|
|
||||||
|
fn seek_to_start(f: &mut BufReader<File>) -> io::Result<u64> {
|
||||||
|
let size = f.seek(io::SeekFrom::End(0))?;
|
||||||
|
f.seek(io::SeekFrom::Start(0))?;
|
||||||
|
Ok(size)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn file_progress_style() -> ProgressStyle {
|
||||||
|
ProgressStyle::with_template("{wide_bar} {bytes}/{total_bytes}")
|
||||||
|
.unwrap()
|
||||||
|
.progress_chars(PROGRESS_CHARS)
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
struct JsonPage {
|
struct JsonPage {
|
||||||
id: u32,
|
id: u32,
|
||||||
|
|
@ -23,17 +38,17 @@ struct JsonPage {
|
||||||
redirect: Option<String>,
|
redirect: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_titles(r: &mut BufReader<File>) -> io::Result<Vec<String>> {
|
fn read_titles(f: &mut BufReader<File>) -> io::Result<Vec<String>> {
|
||||||
let mut counter = Counter::new();
|
let size = seek_to_start(f)?;
|
||||||
|
let bar = ProgressBar::new(size).with_style(file_progress_style());
|
||||||
|
|
||||||
let mut titles = vec![];
|
let mut titles = vec![];
|
||||||
|
|
||||||
for line in r.lines() {
|
for line in bar.wrap_read(f).lines() {
|
||||||
counter.tick();
|
|
||||||
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||||
titles.push(page.title);
|
titles.push(page.title);
|
||||||
}
|
}
|
||||||
|
|
||||||
counter.done();
|
|
||||||
Ok(titles)
|
Ok(titles)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -49,12 +64,12 @@ fn compute_title_lookup(
|
||||||
normalizer: &TitleNormalizer,
|
normalizer: &TitleNormalizer,
|
||||||
titles: &[String],
|
titles: &[String],
|
||||||
) -> HashMap<String, (u32, u32)> {
|
) -> HashMap<String, (u32, u32)> {
|
||||||
let mut counter = Counter::new();
|
|
||||||
let mut title_lookup = HashMap::<String, (u32, u32)>::new();
|
let mut title_lookup = HashMap::<String, (u32, u32)>::new();
|
||||||
|
|
||||||
for (sift_i, title) in titles.iter().enumerate() {
|
let bar = ProgressBar::new(titles.len() as u64)
|
||||||
counter.tick();
|
.with_style(ProgressStyle::default_bar().progress_chars(PROGRESS_CHARS));
|
||||||
|
|
||||||
|
for (sift_i, title) in bar.wrap_iter(titles.iter()).enumerate() {
|
||||||
// The index where this article will appear in the final list, assuming
|
// The index where this article will appear in the final list, assuming
|
||||||
// it is not a duplicate. For ownership reasons, we compute this here
|
// it is not a duplicate. For ownership reasons, we compute this here
|
||||||
// instead of inside the Entry::Vacant branch of the following match.
|
// instead of inside the Entry::Vacant branch of the following match.
|
||||||
|
|
@ -68,31 +83,33 @@ fn compute_title_lookup(
|
||||||
let prev_sift_i = entry.get().0;
|
let prev_sift_i = entry.get().0;
|
||||||
let prev = &titles[prev_sift_i as usize];
|
let prev = &titles[prev_sift_i as usize];
|
||||||
if prev == title {
|
if prev == title {
|
||||||
println!(" {title:?} ({prev_sift_i}) occurs again at {sift_i}");
|
bar.println(format!(
|
||||||
|
" {title:?} ({prev_sift_i}) occurs again at {sift_i}"
|
||||||
|
));
|
||||||
} else {
|
} else {
|
||||||
println!(
|
bar.println(format!(
|
||||||
" {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) both normalize to {:?}",
|
" {prev:?} ({prev_sift_i}) and {title:?} ({sift_i}) normalize to {:?}",
|
||||||
normalizer.normalize(title)
|
normalizer.normalize(title)
|
||||||
);
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counter.done();
|
|
||||||
title_lookup
|
title_lookup
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_page_data(
|
fn read_page_data(
|
||||||
normalizer: &TitleNormalizer,
|
normalizer: &TitleNormalizer,
|
||||||
title_lookup: &HashMap<String, (u32, u32)>,
|
title_lookup: &HashMap<String, (u32, u32)>,
|
||||||
r: &mut BufReader<File>,
|
f: &mut BufReader<File>,
|
||||||
) -> io::Result<Data> {
|
) -> io::Result<Data> {
|
||||||
let mut counter = Counter::new();
|
let size = seek_to_start(f)?;
|
||||||
|
let bar = ProgressBar::new(size).with_style(file_progress_style());
|
||||||
|
|
||||||
let mut data = Data::new();
|
let mut data = Data::new();
|
||||||
|
|
||||||
for (i, line) in r.lines().enumerate() {
|
for (i, line) in bar.wrap_read(f).lines().enumerate() {
|
||||||
counter.tick();
|
|
||||||
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
let page = serde_json::from_str::<JsonPage>(&line?).unwrap();
|
||||||
let normalized = normalizer.normalize(&page.title);
|
let normalized = normalizer.normalize(&page.title);
|
||||||
|
|
||||||
|
|
@ -100,7 +117,10 @@ fn read_page_data(
|
||||||
if i as u32 != sift_i {
|
if i as u32 != sift_i {
|
||||||
// Articles may occur multiple times, and this is not the instance
|
// Articles may occur multiple times, and this is not the instance
|
||||||
// of the article we should keep.
|
// of the article we should keep.
|
||||||
println!(" Skipping {:?} ({i}) in favor of {sift_i}", page.title);
|
bar.println(format!(
|
||||||
|
" Skipping {:?} ({i}) in favor of {sift_i}",
|
||||||
|
page.title
|
||||||
|
));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -127,7 +147,6 @@ fn read_page_data(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counter.done();
|
|
||||||
Ok(data)
|
Ok(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -153,7 +172,6 @@ impl Cmd {
|
||||||
drop(titles); // Don't hoard memory
|
drop(titles); // Don't hoard memory
|
||||||
|
|
||||||
println!(">> Second pass");
|
println!(">> Second pass");
|
||||||
sift_data.seek(io::SeekFrom::Start(0))?;
|
|
||||||
|
|
||||||
println!("> Reading page data");
|
println!("> Reading page data");
|
||||||
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
let data = read_page_data(&normalizer, &title_lookup, &mut sift_data)?;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue