Make data representation more flexible

This commit is contained in:
Joscha 2022-10-22 15:52:07 +02:00
parent 49b27715f0
commit 67f405a21e
3 changed files with 87 additions and 72 deletions

View file

@ -6,7 +6,7 @@ use std::path::Path;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::Deserialize; use serde::Deserialize;
use crate::data::{AdjacencyList, Link, Page}; use crate::data::{AdjacencyList, Link, LinkInfo, Page, PageInfo};
use crate::util; use crate::util;
#[derive(Deserialize)] #[derive(Deserialize)]
@ -61,7 +61,7 @@ impl Titles {
} }
} }
fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> { fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
let mut titles = Titles::default(); let mut titles = Titles::default();
let mut result = AdjacencyList::default(); let mut result = AdjacencyList::default();
@ -71,20 +71,19 @@ fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> {
result.pages.push(Page { result.pages.push(Page {
link_idx: result.links.len() as u32, link_idx: result.links.len() as u32,
data: PageInfo {
id: json_page.id, id: json_page.id,
length: json_page.length, length: json_page.length,
redirect: json_page.redirect.is_some(), redirect: json_page.redirect.is_some(),
title: json_page.title, title: json_page.title,
data: (), },
}); });
for (to, start, end) in json_page.links { for (to, start, end) in json_page.links {
let to = titles.insert(util::normalize_link(&to)); let to = titles.insert(util::normalize_link(&to));
result.links.push(Link { result.links.push(Link {
to, to,
start, data: LinkInfo { start, end },
end,
data: (),
}); });
} }
@ -100,23 +99,29 @@ fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> {
result.pages.push(Page { result.pages.push(Page {
link_idx: result.links.len() as u32, link_idx: result.links.len() as u32,
data: PageInfo {
id: 0, id: 0,
length: 0, length: 0,
redirect: false, redirect: false,
title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(), title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(),
data: (), },
}); });
Ok((result, titles)) Ok((result, titles))
} }
/// Create map from normalized title to index in pages. /// Create map from normalized title to index in pages.
fn initialize_pages_map(pages: &[Page<()>]) -> FxHashMap<String, u32> { fn initialize_pages_map(pages: &[Page<PageInfo>]) -> FxHashMap<String, u32> {
let mut result = FxHashMap::default(); let mut result = FxHashMap::default();
for (i, p) in pages.iter().enumerate() { for (i, p) in pages.iter().enumerate() {
match result.entry(util::normalize_link(&p.title)) { match result.entry(util::normalize_link(&p.data.title)) {
Entry::Occupied(entry) => { Entry::Occupied(entry) => {
eprintln!("{:?} already exists at index {}", p.title, entry.get()); eprintln!(
"{:?} already exists at index {} as {:?}",
p.data.title,
entry.get(),
util::normalize_link(&p.data.title)
);
} }
Entry::Vacant(entry) => { Entry::Vacant(entry) => {
entry.insert(i as u32); entry.insert(i as u32);
@ -126,7 +131,10 @@ fn initialize_pages_map(pages: &[Page<()>]) -> FxHashMap<String, u32> {
result result
} }
fn second_stage(first_stage: &AdjacencyList<(), ()>, titles: &Titles) -> AdjacencyList<(), ()> { fn second_stage(
first_stage: &AdjacencyList<PageInfo, LinkInfo>,
titles: &Titles,
) -> AdjacencyList<PageInfo, LinkInfo> {
let pages_map = initialize_pages_map(&first_stage.pages); let pages_map = initialize_pages_map(&first_stage.pages);
let mut result = AdjacencyList::default(); let mut result = AdjacencyList::default();

View file

@ -8,7 +8,7 @@ use crate::util;
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> { pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
eprintln!(">> Import"); eprintln!(">> Import");
let mut databuf = BufReader::new(File::open(datafile)?); let mut databuf = BufReader::new(File::open(datafile)?);
let mut data = AdjacencyList::read(&mut databuf)?.change_page_data(f32::INFINITY); let data = AdjacencyList::read(&mut databuf)?;
eprintln!(">> Locate from and to"); eprintln!(">> Locate from and to");
let from = util::normalize_link(from); let from = util::normalize_link(from);
@ -17,15 +17,15 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
.pages .pages
.iter() .iter()
.enumerate() .enumerate()
.filter(|(_, p)| !p.redirect) .filter(|(_, p)| !p.data.redirect)
.find(|(_, p)| util::normalize_link(&p.title) == from) .find(|(_, p)| util::normalize_link(&p.data.title) == from)
.unwrap_or_else(|| panic!("no article called {from}")); .unwrap_or_else(|| panic!("no article called {from}"));
let (to_i, to_p) = data let (to_i, to_p) = data
.pages .pages
.iter() .iter()
.enumerate() .enumerate()
.filter(|(_, p)| !p.redirect) .filter(|(_, p)| !p.data.redirect)
.find(|(_, p)| util::normalize_link(&p.title) == to) .find(|(_, p)| util::normalize_link(&p.data.title) == to)
.unwrap_or_else(|| panic!("no article called {to}")); .unwrap_or_else(|| panic!("no article called {to}"));
dbg!(from_i, from_p, to_i, to_p); dbg!(from_i, from_p, to_i, to_p);

View file

@ -1,7 +1,5 @@
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use serde::{Deserialize, Serialize};
mod ioutil { mod ioutil {
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
@ -50,23 +48,27 @@ mod ioutil {
} }
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone)]
pub struct Page<P> { pub struct PageInfo {
pub link_idx: u32,
pub id: u32, pub id: u32,
pub length: u32, pub length: u32,
pub redirect: bool, pub redirect: bool,
pub title: String, pub title: String,
}
#[derive(Debug, Clone, Copy)]
pub struct Page<P> {
pub link_idx: u32,
pub data: P, pub data: P,
} }
impl Page<()> { impl Page<PageInfo> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.link_idx, to)?; ioutil::write_u32(self.link_idx, to)?;
ioutil::write_u32(self.id, to)?; ioutil::write_u32(self.data.id, to)?;
ioutil::write_u32(self.length, to)?; ioutil::write_u32(self.data.length, to)?;
ioutil::write_u8(if self.redirect { 1 } else { 0 }, to)?; ioutil::write_u8(if self.data.redirect { 1 } else { 0 }, to)?;
ioutil::write_str(&self.title, to)?; ioutil::write_str(&self.data.title, to)?;
Ok(()) Ok(())
} }
@ -80,41 +82,42 @@ impl Page<()> {
Ok(Self { Ok(Self {
link_idx, link_idx,
data: PageInfo {
id, id,
length, length,
redirect, redirect,
title, title,
data: (), },
}) })
} }
} }
impl<P> Page<P> { impl<P> Page<P> {
pub fn change_data<P2>(self, data: P2) -> Page<P2> { pub fn change_data<P2>(self, f: &impl Fn(P) -> P2) -> Page<P2> {
Page { Page {
link_idx: self.link_idx, link_idx: self.link_idx,
id: self.id, data: f(self.data),
length: self.length,
redirect: self.redirect,
title: self.title,
data,
} }
} }
} }
#[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Debug, Clone, Copy)]
pub struct Link<L> { pub struct LinkInfo {
pub to: u32,
pub start: u32, pub start: u32,
pub end: u32, pub end: u32,
}
#[derive(Debug, Clone, Copy)]
pub struct Link<L> {
pub to: u32,
pub data: L, pub data: L,
} }
impl Link<()> { impl Link<LinkInfo> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.to, to)?; ioutil::write_u32(self.to, to)?;
ioutil::write_u32(self.start, to)?; ioutil::write_u32(self.data.start, to)?;
ioutil::write_u32(self.end, to)?; ioutil::write_u32(self.data.end, to)?;
Ok(()) Ok(())
} }
@ -126,31 +129,35 @@ impl Link<()> {
Ok(Self { Ok(Self {
to, to,
start, data: LinkInfo { start, end },
end,
data: (),
}) })
} }
} }
impl<P> Link<P> { impl<L> Link<L> {
pub fn change_data<P2>(self, data: P2) -> Link<P2> { pub fn change_data<L2>(self, f: &impl Fn(L) -> L2) -> Link<L2> {
Link { Link {
to: self.to, to: self.to,
start: self.start, data: f(self.data),
end: self.end,
data,
} }
} }
} }
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct AdjacencyList<P, L> { pub struct AdjacencyList<P, L> {
pub pages: Vec<Page<P>>, pub pages: Vec<Page<P>>,
pub links: Vec<Link<L>>, pub links: Vec<Link<L>>,
} }
impl AdjacencyList<(), ()> { impl<P, L> Default for AdjacencyList<P, L> {
fn default() -> Self {
Self {
pages: Default::default(),
links: Default::default(),
}
}
}
impl AdjacencyList<PageInfo, LinkInfo> {
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> { pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
ioutil::write_u32(self.pages.len() as u32, to)?; ioutil::write_u32(self.pages.len() as u32, to)?;
ioutil::write_u32(self.links.len() as u32, to)?; ioutil::write_u32(self.links.len() as u32, to)?;
@ -182,23 +189,21 @@ impl AdjacencyList<(), ()> {
Ok(Self { pages, links }) Ok(Self { pages, links })
} }
}
impl<P, L> AdjacencyList<P, L> {
pub fn check_consistency(&self) { pub fn check_consistency(&self) {
// Check that all types are large enough // Check that all types are large enough
assert!(self.pages.len() <= u32::MAX as usize, "pages len"); assert!(self.pages.len() <= u32::MAX as usize, "pages len");
assert!(self.links.len() <= u32::MAX as usize, "links len"); assert!(self.links.len() <= u32::MAX as usize, "links len");
for page in &self.pages { for page in &self.pages {
assert!(page.link_idx <= u32::MAX as u32, "page link_idx"); assert!(page.link_idx <= u32::MAX as u32, "page link_idx");
assert!(page.id <= u32::MAX as u32, "page id"); assert!(page.data.id <= u32::MAX as u32, "page id");
assert!(page.length <= u32::MAX as u32, "page length"); assert!(page.data.length <= u32::MAX as u32, "page length");
assert!(page.title.len() <= u8::MAX as usize, "page title len"); assert!(page.data.title.len() <= u8::MAX as usize, "page title len");
} }
for link in &self.links { for link in &self.links {
assert!(link.to <= u32::MAX as u32, "link to"); assert!(link.to <= u32::MAX as u32, "link to");
assert!(link.start <= u32::MAX as u32, "link start"); assert!(link.data.start <= u32::MAX as u32, "link start");
assert!(link.end <= u32::MAX as u32, "link end"); assert!(link.data.end <= u32::MAX as u32, "link end");
} }
// Check that all links contain valid indices // Check that all links contain valid indices
@ -209,12 +214,14 @@ impl<P, L> AdjacencyList<P, L> {
} }
} }
} }
}
pub fn change_page_data<P2: Clone>(self, data: P2) -> AdjacencyList<P2, L> { impl<P, L> AdjacencyList<P, L> {
pub fn change_page_data<P2: Clone>(self, page_f: &impl Fn(P) -> P2) -> AdjacencyList<P2, L> {
let pages = self let pages = self
.pages .pages
.into_iter() .into_iter()
.map(|p| p.change_data(data.clone())) .map(|p| p.change_data(page_f))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
AdjacencyList { AdjacencyList {
@ -223,11 +230,11 @@ impl<P, L> AdjacencyList<P, L> {
} }
} }
pub fn change_link_data<L2: Clone>(self, data: L2) -> AdjacencyList<P, L2> { pub fn change_link_data<L2: Clone>(self, link_f: &impl Fn(L) -> L2) -> AdjacencyList<P, L2> {
let links = self let links = self
.links .links
.into_iter() .into_iter()
.map(|l| l.change_data(data.clone())) .map(|l| l.change_data(link_f))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
AdjacencyList { AdjacencyList {