Load sift data from stdin

This commit is contained in:
Joscha 2022-09-30 18:06:42 +02:00
parent 2e2045a74d
commit c195fbb8d4
4 changed files with 299 additions and 2 deletions

34
brood/src/ingest.rs Normal file
View file

@ -0,0 +1,34 @@
use std::io::{self, BufRead, BufReader};
use serde::Deserialize;
/// One page record as it appears on a single line of the JSON input.
///
/// Deserialized with serde; field names must match the JSON object keys.
/// Of these fields, the visible `ingest` function only reads `links`.
#[derive(Deserialize)]
struct JsonPage {
/// Presumably the page's namespace number — TODO confirm against the producer.
ns: u16,
/// Presumably the page's numeric id — TODO confirm against the producer.
id: u32,
/// Page title.
title: String,
/// Optional redirect value; `None` when the key is absent or `null`.
/// NOTE(review): looks like the redirect target's title — confirm.
redirect: Option<String>,
/// Links found on the page; an absent key deserializes to an empty vector
/// because of `#[serde(default)]`.
/// NOTE(review): the meaning of the `(String, u32, u32)` tuple elements is
/// not visible here (possibly target title plus a start/end position) — confirm.
#[serde(default)]
links: Vec<(String, u32, u32)>,
}
/// Reads page records from stdin, one JSON object per line, and reports how
/// many pages and links were seen.
///
/// The running page count is printed to stderr every 100 000 pages, and a
/// final `"{pages} - {links}"` summary is printed to stderr once stdin is
/// exhausted.
///
/// # Errors
///
/// Returns an [`io::Error`] if reading from stdin fails or if a line cannot be
/// deserialized into a `JsonPage`. Parse errors keep the error kind produced by
/// `serde_json` and are prefixed with the 1-based number of the offending
/// input line.
pub fn ingest() -> io::Result<()> {
    let mut stdin = BufReader::new(io::stdin());

    // Explicitly typed: an untyped `0` would fall back to `i32`, which is too
    // narrow a counter for arbitrarily large inputs.
    let mut n_pages: usize = 0;
    let mut n_links: usize = 0;

    // A single buffer is reused for every line instead of allocating a fresh
    // `String` per line. The trailing newline left in the buffer by
    // `read_line` is harmless: the JSON parser accepts trailing whitespace.
    let mut line = String::new();
    while stdin.read_line(&mut line)? != 0 {
        let json_page = serde_json::from_str::<JsonPage>(&line).map_err(|err| {
            // Every line is parsed on its own, so the position serde_json
            // reports is always "line 1"; add the real input line number
            // while keeping the original error kind.
            let err = io::Error::from(err);
            io::Error::new(err.kind(), format!("line {}: {err}", n_pages + 1))
        })?;
        line.clear();

        n_pages += 1;
        n_links += json_page.links.len();

        // Periodic progress report.
        if n_pages % 100_000 == 0 {
            eprintln!("{n_pages}");
        }
    }

    eprintln!("{n_pages} - {n_links}");
    Ok(())
}

View file

@ -1,3 +1,17 @@
fn main() {
println!("Hello, world!");
mod ingest;
use std::io;
use clap::Parser;
// Subcommands accepted on the command line, parsed through clap's derived
// `Parser` implementation.
//
// Plain `//` comments are used for these notes on purpose: clap's derive
// turns `///` doc comments into user-visible help text.
#[derive(Debug, Parser)]
enum Command {
// NOTE(review): the help text below promises brood data on stdout, while the
// `ingest` function visible in this commit only prints counts to stderr —
// confirm whether the output side is still to be implemented.
/// Read sift data on stdin and output brood data on stdout.
Ingest,
}
/// Program entry point: parses the command line and runs the chosen subcommand.
///
/// Whatever `io::Result` the subcommand produces becomes the result of `main`.
fn main() -> io::Result<()> {
    let command = Command::parse();
    match command {
        Command::Ingest => ingest::ingest(),
    }
}