From 5e8589f73eb902c6edc492a7cb815609d717a3c3 Mon Sep 17 00:00:00 2001
From: Joscha
Date: Fri, 30 Sep 2022 18:53:56 +0200
Subject: [PATCH] Load input into adjacency-list-like structure

---
 brood/src/data.rs   |  18 ++++++
 brood/src/ingest.rs | 139 ++++++++++++++++++++++++++++++++++++++++----
 brood/src/main.rs   |   1 +
 3 files changed, 146 insertions(+), 12 deletions(-)
 create mode 100644 brood/src/data.rs

diff --git a/brood/src/data.rs b/brood/src/data.rs
new file mode 100644
index 0000000..8a57d7c
--- /dev/null
+++ b/brood/src/data.rs
@@ -0,0 +1,18 @@
+pub struct Page {
+    pub link_idx: u32,
+    pub ns: u16,
+    pub id: u32,
+    pub title: String,
+    pub redirect: bool,
+}
+
+pub struct Link {
+    pub to: u32,
+    pub start: u32,
+    pub end: u32,
+}
+
+pub struct AdjacencyList {
+    pub pages: Vec<Page>,
+    pub links: Vec<Link>,
+}
diff --git a/brood/src/ingest.rs b/brood/src/ingest.rs
index 80699bc..86a8b71 100644
--- a/brood/src/ingest.rs
+++ b/brood/src/ingest.rs
@@ -1,7 +1,11 @@
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
 use std::io::{self, BufRead, BufReader};
 
 use serde::Deserialize;
 
+use crate::data::{Link, Page};
+
 #[derive(Deserialize)]
 struct JsonPage {
     ns: u16,
@@ -11,25 +15,136 @@ struct JsonPage {
     #[serde(default)]
     links: Vec<(String, u32, u32)>,
 }
 
-pub fn ingest() -> io::Result<()> {
-    let stdin = BufReader::new(io::stdin());
-    let mut n_pages = 0;
-    let mut n_links = 0;
+/*
+Importing is a tad complicated because of multiple criteria:
 
-    for line in stdin.lines() {
-        // let json_page = serde_json::from_str::<JsonPage>(&line?)?;
-        let json_page = simd_json::serde::from_str::<JsonPage>(&mut line?).unwrap();
+1. The data must be read in a single pass on stdin
+2. The process should not consume a lot of memory
+   (can't store the decoded json data directly)
+3. The process should result in a nice and compact adjacency list format
 
-        n_pages += 1;
-        n_links += json_page.links.len();
+Because of this, the import is a bit more complex and has two passes.
 
-        if n_pages % 100_000 == 0 {
-            eprintln!("{n_pages}");
+The first pass imports the data into an adjacency-list-like format:
+- `pages`: List with page info and index in `links`
+- `pages_map`: Map from title to index in `pages` (used during the second pass)
+- `links`: List with link info and index in `titles`
+- `titles`: List with titles
+- `titles_map`: Map from title to index in `titles` (used during decoding)
+
+The second pass then takes 1 and 3 and changes the indices in 3 to point to the
+entries in 1 using 2 and 4. After this, 2, 4 and 5 can be discarded and 1 and 3
+form a proper adjacency list.
+*/
+
+struct FirstStage {
+    /// List with page info and index into [`Self::links`].
+    ///
+    /// The first entry with id 0 represents a nonexistent link.
+    pages: Vec<Page>,
+    /// Map from title to index in [`Self::pages`] (used during the second pass).
+    pages_map: HashMap<String, u32>,
+    /// List with link info and index into [`Self::titles`].
+    links: Vec<Link>,
+    /// List with titles.
+    titles: Vec<String>,
+    /// Map from title to index in [`Self::titles`] (used during decoding).
+    titles_map: HashMap<String, u32>,
+}
+
+impl FirstStage {
+    fn new() -> Self {
+        let mut result = Self {
+            pages: vec![],
+            pages_map: HashMap::new(),
+            links: vec![],
+            titles: vec![],
+            titles_map: HashMap::new(),
+        };
+        result.push_page(0, 0, "this link does not exist".to_string(), false);
+        result
+    }
+
+    fn insert_title(&mut self, title: String) -> u32 {
+        match self.titles_map.entry(title.clone()) {
+            Entry::Occupied(occupied) => *occupied.get(),
+            Entry::Vacant(vacant) => {
+                let idx = self.titles.len() as u32;
+                self.titles.push(title);
+                vacant.insert(idx);
+                idx
+            }
         }
     }
 
-    eprintln!("{n_pages} - {n_links}");
+    fn push_page(&mut self, ns: u16, id: u32, title: String, redirect: bool) {
+        self.pages.push(Page {
+            link_idx: self.links.len() as u32,
+            ns,
+            id,
+            title,
+            redirect,
+        });
+    }
 
+    fn insert_page(&mut self, ns: u16, id: u32, title: String, redirect: bool) {
+        // We know we haven't seen the page before
+        let idx = self.pages.len() as u32;
+        self.push_page(ns, id, title.clone(), redirect);
+        self.pages_map.insert(title, idx);
+    }
+
+    fn insert_link(&mut self, to: u32, start: u32, end: u32) {
+        self.links.push(Link { to, start, end });
+    }
+
+    fn import_json_page(&mut self, page: JsonPage) {
+        self.insert_page(page.ns, page.id, page.title, page.redirect.is_some());
+        if let Some(to) = page.redirect {
+            let to = self.insert_title(to);
+            self.insert_link(to, 0, 0);
+        } else {
+            for (to, start, end) in page.links {
+                let to = self.insert_title(to);
+                self.insert_link(to, start, end);
+            }
+        }
+    }
+
+    fn finalize(&mut self) {
+        self.insert_page(
+            0,
+            0,
+            "dummy page at the end of all pages".to_string(),
+            false,
+        );
+    }
+}
+
+fn first_stage() -> io::Result<FirstStage> {
+    let mut first_stage = FirstStage::new();
+    let mut n = 0;
+
+    let stdin = BufReader::new(io::stdin());
+    for line in stdin.lines() {
+        // let json_page = serde_json::from_str::<JsonPage>(&line?)?;
+        let json_page = simd_json::serde::from_str::<JsonPage>(&mut line?).unwrap();
+        first_stage.import_json_page(json_page);
+
+        n += 1;
+        if n % 10_000 == 0 {
+            eprintln!("{n} imported")
+        }
+    }
+
+    first_stage.finalize();
+    Ok(first_stage)
+}
+
+pub fn ingest() -> io::Result<()> {
+    let first_stage = first_stage()?;
+    eprintln!("{} pages", first_stage.pages.len() - 2);
+    eprintln!("{} links", first_stage.links.len());
     Ok(())
 }
diff --git a/brood/src/main.rs b/brood/src/main.rs
index b64b41d..310db54 100644
--- a/brood/src/main.rs
+++ b/brood/src/main.rs
@@ -1,4 +1,5 @@
 mod ingest;
+mod data;
 
 use std::io;
 