From c195fbb8d42185a82a17d50d58a1e688c0c74e23 Mon Sep 17 00:00:00 2001 From: Joscha Date: Fri, 30 Sep 2022 18:06:42 +0200 Subject: [PATCH] Load sift data from stdin --- brood/Cargo.lock | 246 ++++++++++++++++++++++++++++++++++++++++++++ brood/Cargo.toml | 3 + brood/src/ingest.rs | 34 ++++++ brood/src/main.rs | 18 +++- 4 files changed, 299 insertions(+), 2 deletions(-) create mode 100644 brood/src/ingest.rs diff --git a/brood/Cargo.lock b/brood/Cargo.lock index b079d72..4eaac25 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -2,6 +2,252 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "brood" version = "0.1.0" +dependencies = [ + "clap", + "serde", + "serde_json", +] + +[[package]] +name = "clap" +version = "4.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5e0d0fdd7dc774d433c78c9de5695ca04724beaec6a7d4d13ac6014608f3eaf" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "once_cell", + "strsim", + "termcolor", +] + +[[package]] +name = "clap_derive" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca689d7434ce44517a12a89456b2be4d1ea1cafcd8f581978c03d45f5a5c12a7" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "itoa" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" + +[[package]] +name = "libc" +version = "0.2.134" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "329c933548736bc49fd575ee68c89e8be4d260064184389a5b77517cddd99ffb" + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "os_str_bytes" +version = "6.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e2ef8dbfc347b10c094890f778ee2e36ca9bb4262e86dc99cd217e35f3470b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "serde" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "unicode-ident" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 2e32b42..395ce7f 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -4,3 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] +clap = { version = "4.0.5", features = ["derive"] } +serde = { version = "1.0.145", features = ["derive"] } +serde_json = "1.0.85" diff --git a/brood/src/ingest.rs b/brood/src/ingest.rs new file mode 100644 index 0000000..f96f84a --- /dev/null +++ b/brood/src/ingest.rs @@ -0,0 +1,34 @@ +use std::io::{self, BufRead, BufReader}; + +use serde::Deserialize; + +#[derive(Deserialize)] +struct JsonPage { + ns: u16, + id: u32, + title: String, + redirect: Option, + #[serde(default)] + links: Vec<(String, u32, u32)>, +} +pub fn ingest() -> io::Result<()> { + let stdin = BufReader::new(io::stdin()); + + let mut n_pages = 0; + let mut n_links = 0; + + for line in stdin.lines() { + let json_page = serde_json::from_str::(&line?)?; + + n_pages += 1; + n_links += json_page.links.len(); + + if n_pages % 100_000 == 0 { + eprintln!("{n_pages}"); + } + } + + eprintln!("{n_pages} - {n_links}"); + + Ok(()) +} diff --git a/brood/src/main.rs b/brood/src/main.rs index e7a11a9..b64b41d 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -1,3 +1,17 @@ -fn main() { - println!("Hello, world!"); +mod ingest; + +use std::io; + +use clap::Parser; + +#[derive(Debug, Parser)] +enum Command { + /// Read sift data on stdin and output brood data on stdout. + Ingest, +} + +fn main() -> io::Result<()> { + match Command::parse() { + Command::Ingest => ingest::ingest(), + } }