Load input into adjacency-list-like structure
This commit is contained in:
parent
b1f2af9577
commit
5e8589f73e
3 changed files with 146 additions and 12 deletions
18
brood/src/data.rs
Normal file
18
brood/src/data.rs
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
/// A page record in the adjacency-list representation.
pub struct Page {
    /// Index of this page's first link in the accompanying `links` vec
    /// (set to `links.len()` at the time the page is pushed).
    pub link_idx: u32,
    /// Namespace number — presumably a MediaWiki namespace id; TODO confirm.
    pub ns: u16,
    /// Page id as given in the input data.
    pub id: u32,
    /// Page title.
    pub title: String,
    /// Whether this page is a redirect (derived from the presence of a
    /// redirect target in the input).
    pub redirect: bool,
}
|
||||
|
||||
/// A single outgoing link in the adjacency-list representation.
pub struct Link {
    /// Index of the link target: into a titles list during the first import
    /// pass, rewritten to point into `pages` by the second pass.
    pub to: u32,
    /// Start position of the link — presumably an offset into the page
    /// text; TODO confirm against the input format.
    pub start: u32,
    /// End position of the link (same unit as `start`).
    pub end: u32,
}
|
||||
|
||||
/// Compact adjacency list over all pages.
///
/// Each page's outgoing links are a contiguous run of `links` beginning at
/// that page's `link_idx`.
pub struct AdjacencyList {
    /// All pages, in import order.
    pub pages: Vec<Page>,
    /// All links, grouped by source page.
    pub links: Vec<Link>,
}
|
||||
|
|
@ -1,7 +1,11 @@
|
|||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, BufRead, BufReader};
|
||||
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::data::{Link, Page};
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct JsonPage {
|
||||
ns: u16,
|
||||
|
|
@ -11,25 +15,136 @@ struct JsonPage {
|
|||
#[serde(default)]
|
||||
links: Vec<(String, u32, u32)>,
|
||||
}
|
||||
pub fn ingest() -> io::Result<()> {
|
||||
let stdin = BufReader::new(io::stdin());
|
||||
|
||||
let mut n_pages = 0;
|
||||
let mut n_links = 0;
|
||||
/*
|
||||
Importing is a tad complicated because of multiple criteria:
|
||||
|
||||
for line in stdin.lines() {
|
||||
// let json_page = serde_json::from_str::<JsonPage>(&line?)?;
|
||||
let json_page = simd_json::serde::from_str::<JsonPage>(&mut line?).unwrap();
|
||||
1. The data must be read in a single pass on stdin
|
||||
2. The process should not consume a lot of memory
|
||||
(can't store the decoded json data directly)
|
||||
3. The process should result in a nice and compact adjacency list format
|
||||
|
||||
n_pages += 1;
|
||||
n_links += json_page.links.len();
|
||||
Because of this, the import is a bit more complex and has two passes.
|
||||
|
||||
if n_pages % 100_000 == 0 {
|
||||
eprintln!("{n_pages}");
|
||||
The first pass imports the data into an adjacency-list-like format:
|
||||
- `pages`: List with page info and index in `links`
|
||||
- `pages_map`: Map from title to index in `pages` (used during the second pass)
|
||||
- `links`: List with link info and index in `titles`
|
||||
- `titles`: List with titles
|
||||
- `titles_map`: Map from title to index in `titles` (used during decoding)
|
||||
|
||||
The second pass then takes 1 and 3 and changes the indices in 3 to point to the
|
||||
entries in 1 using 2 and 4. After this, 2, 4 and 5 can be discarded and 1 and 3
|
||||
form a proper adjacency list.
|
||||
*/
|
||||
|
||||
/// Intermediate state built by the first import pass.
///
/// At this stage `links` reference entries in `titles`; a second pass
/// rewrites them to point at entries in `pages`, after which `pages_map`,
/// `titles` and `titles_map` can be discarded.
struct FirstStage {
    /// List with page info and index into [`Self::links`].
    ///
    /// The first entry with id 0 represents a nonexistent link.
    pages: Vec<Page>,
    /// Map from title to index in [`Self::pages`] (used during the second pass).
    pages_map: HashMap<String, u32>,
    /// List with link info and index into [`Self::titles`].
    links: Vec<Link>,
    /// List with titles.
    titles: Vec<String>,
    /// Map from title to index in [`Self::titles`] (used during decoding).
    titles_map: HashMap<String, u32>,
}
|
||||
|
||||
impl FirstStage {
|
||||
fn new() -> Self {
|
||||
let mut result = Self {
|
||||
pages: vec![],
|
||||
pages_map: HashMap::new(),
|
||||
links: vec![],
|
||||
titles: vec![],
|
||||
titles_map: HashMap::new(),
|
||||
};
|
||||
result.push_page(0, 0, "this link does not exist".to_string(), false);
|
||||
result
|
||||
}
|
||||
|
||||
fn insert_title(&mut self, title: String) -> u32 {
|
||||
match self.titles_map.entry(title.clone()) {
|
||||
Entry::Occupied(occupied) => *occupied.get(),
|
||||
Entry::Vacant(vacant) => {
|
||||
let idx = self.titles.len() as u32;
|
||||
self.titles.push(title);
|
||||
vacant.insert(idx);
|
||||
idx
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("{n_pages} - {n_links}");
|
||||
fn push_page(&mut self, ns: u16, id: u32, title: String, redirect: bool) {
|
||||
self.pages.push(Page {
|
||||
link_idx: self.links.len() as u32,
|
||||
ns,
|
||||
id,
|
||||
title,
|
||||
redirect,
|
||||
});
|
||||
}
|
||||
|
||||
fn insert_page(&mut self, ns: u16, id: u32, title: String, redirect: bool) {
|
||||
// We know we haven't seen the page before
|
||||
let idx = self.pages.len() as u32;
|
||||
self.push_page(ns, id, title.clone(), redirect);
|
||||
self.pages_map.insert(title, idx);
|
||||
}
|
||||
|
||||
fn insert_link(&mut self, to: u32, start: u32, end: u32) {
|
||||
self.links.push(Link { to, start, end });
|
||||
}
|
||||
|
||||
fn import_json_page(&mut self, page: JsonPage) {
|
||||
self.insert_page(page.ns, page.id, page.title, page.redirect.is_some());
|
||||
if let Some(to) = page.redirect {
|
||||
let to = self.insert_title(to);
|
||||
self.insert_link(to, 0, 0);
|
||||
} else {
|
||||
for (to, start, end) in page.links {
|
||||
let to = self.insert_title(to);
|
||||
self.insert_link(to, start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize(&mut self) {
|
||||
self.insert_page(
|
||||
0,
|
||||
0,
|
||||
"dummy page at the end of all pages".to_string(),
|
||||
false,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn first_stage() -> io::Result<FirstStage> {
|
||||
let mut first_stage = FirstStage::new();
|
||||
let mut n = 0;
|
||||
|
||||
let stdin = BufReader::new(io::stdin());
|
||||
for line in stdin.lines() {
|
||||
// let json_page = serde_json::from_str::<JsonPage>(&line?)?;
|
||||
let json_page = simd_json::serde::from_str::<JsonPage>(&mut line?).unwrap();
|
||||
first_stage.import_json_page(json_page);
|
||||
|
||||
n += 1;
|
||||
if n % 10_000 == 0 {
|
||||
eprintln!("{n} imported")
|
||||
}
|
||||
}
|
||||
|
||||
first_stage.finalize();
|
||||
Ok(first_stage)
|
||||
}
|
||||
|
||||
pub fn ingest() -> io::Result<()> {
|
||||
let first_stage = first_stage()?;
|
||||
eprintln!("{} pages", first_stage.pages.len() - 2);
|
||||
eprintln!("{} links", first_stage.links.len());
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
mod ingest;
|
||||
mod data;
|
||||
|
||||
use std::io;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue