Make data representation more flexible
This commit is contained in:
parent
49b27715f0
commit
67f405a21e
3 changed files with 87 additions and 72 deletions
|
|
@ -6,7 +6,7 @@ use std::path::Path;
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
|
||||||
use crate::data::{AdjacencyList, Link, Page};
|
use crate::data::{AdjacencyList, Link, LinkInfo, Page, PageInfo};
|
||||||
use crate::util;
|
use crate::util;
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
|
|
@ -61,7 +61,7 @@ impl Titles {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> {
|
fn first_stage() -> io::Result<(AdjacencyList<PageInfo, LinkInfo>, Titles)> {
|
||||||
let mut titles = Titles::default();
|
let mut titles = Titles::default();
|
||||||
let mut result = AdjacencyList::default();
|
let mut result = AdjacencyList::default();
|
||||||
|
|
||||||
|
|
@ -71,20 +71,19 @@ fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> {
|
||||||
|
|
||||||
result.pages.push(Page {
|
result.pages.push(Page {
|
||||||
link_idx: result.links.len() as u32,
|
link_idx: result.links.len() as u32,
|
||||||
|
data: PageInfo {
|
||||||
id: json_page.id,
|
id: json_page.id,
|
||||||
length: json_page.length,
|
length: json_page.length,
|
||||||
redirect: json_page.redirect.is_some(),
|
redirect: json_page.redirect.is_some(),
|
||||||
title: json_page.title,
|
title: json_page.title,
|
||||||
data: (),
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
for (to, start, end) in json_page.links {
|
for (to, start, end) in json_page.links {
|
||||||
let to = titles.insert(util::normalize_link(&to));
|
let to = titles.insert(util::normalize_link(&to));
|
||||||
result.links.push(Link {
|
result.links.push(Link {
|
||||||
to,
|
to,
|
||||||
start,
|
data: LinkInfo { start, end },
|
||||||
end,
|
|
||||||
data: (),
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -100,23 +99,29 @@ fn first_stage() -> io::Result<(AdjacencyList<(), ()>, Titles)> {
|
||||||
|
|
||||||
result.pages.push(Page {
|
result.pages.push(Page {
|
||||||
link_idx: result.links.len() as u32,
|
link_idx: result.links.len() as u32,
|
||||||
|
data: PageInfo {
|
||||||
id: 0,
|
id: 0,
|
||||||
length: 0,
|
length: 0,
|
||||||
redirect: false,
|
redirect: false,
|
||||||
title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(),
|
title: "Sentinel page at the end of all pages, Q2AKO3OYzyitmCJURghJ".to_string(),
|
||||||
data: (),
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
Ok((result, titles))
|
Ok((result, titles))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create map from normalized title to index in pages.
|
/// Create map from normalized title to index in pages.
|
||||||
fn initialize_pages_map(pages: &[Page<()>]) -> FxHashMap<String, u32> {
|
fn initialize_pages_map(pages: &[Page<PageInfo>]) -> FxHashMap<String, u32> {
|
||||||
let mut result = FxHashMap::default();
|
let mut result = FxHashMap::default();
|
||||||
for (i, p) in pages.iter().enumerate() {
|
for (i, p) in pages.iter().enumerate() {
|
||||||
match result.entry(util::normalize_link(&p.title)) {
|
match result.entry(util::normalize_link(&p.data.title)) {
|
||||||
Entry::Occupied(entry) => {
|
Entry::Occupied(entry) => {
|
||||||
eprintln!("{:?} already exists at index {}", p.title, entry.get());
|
eprintln!(
|
||||||
|
"{:?} already exists at index {} as {:?}",
|
||||||
|
p.data.title,
|
||||||
|
entry.get(),
|
||||||
|
util::normalize_link(&p.data.title)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
entry.insert(i as u32);
|
entry.insert(i as u32);
|
||||||
|
|
@ -126,7 +131,10 @@ fn initialize_pages_map(pages: &[Page<()>]) -> FxHashMap<String, u32> {
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
fn second_stage(first_stage: &AdjacencyList<(), ()>, titles: &Titles) -> AdjacencyList<(), ()> {
|
fn second_stage(
|
||||||
|
first_stage: &AdjacencyList<PageInfo, LinkInfo>,
|
||||||
|
titles: &Titles,
|
||||||
|
) -> AdjacencyList<PageInfo, LinkInfo> {
|
||||||
let pages_map = initialize_pages_map(&first_stage.pages);
|
let pages_map = initialize_pages_map(&first_stage.pages);
|
||||||
let mut result = AdjacencyList::default();
|
let mut result = AdjacencyList::default();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ use crate::util;
|
||||||
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
||||||
eprintln!(">> Import");
|
eprintln!(">> Import");
|
||||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||||
let mut data = AdjacencyList::read(&mut databuf)?.change_page_data(f32::INFINITY);
|
let data = AdjacencyList::read(&mut databuf)?;
|
||||||
|
|
||||||
eprintln!(">> Locate from and to");
|
eprintln!(">> Locate from and to");
|
||||||
let from = util::normalize_link(from);
|
let from = util::normalize_link(from);
|
||||||
|
|
@ -17,15 +17,15 @@ pub fn path(datafile: &Path, from: &str, to: &str) -> io::Result<()> {
|
||||||
.pages
|
.pages
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.filter(|(_, p)| !p.redirect)
|
.filter(|(_, p)| !p.data.redirect)
|
||||||
.find(|(_, p)| util::normalize_link(&p.title) == from)
|
.find(|(_, p)| util::normalize_link(&p.data.title) == from)
|
||||||
.unwrap_or_else(|| panic!("no article called {from}"));
|
.unwrap_or_else(|| panic!("no article called {from}"));
|
||||||
let (to_i, to_p) = data
|
let (to_i, to_p) = data
|
||||||
.pages
|
.pages
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.filter(|(_, p)| !p.redirect)
|
.filter(|(_, p)| !p.data.redirect)
|
||||||
.find(|(_, p)| util::normalize_link(&p.title) == to)
|
.find(|(_, p)| util::normalize_link(&p.data.title) == to)
|
||||||
.unwrap_or_else(|| panic!("no article called {to}"));
|
.unwrap_or_else(|| panic!("no article called {to}"));
|
||||||
dbg!(from_i, from_p, to_i, to_p);
|
dbg!(from_i, from_p, to_i, to_p);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,5 @@
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
|
|
||||||
mod ioutil {
|
mod ioutil {
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
|
|
||||||
|
|
@ -50,23 +48,27 @@ mod ioutil {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Page<P> {
|
pub struct PageInfo {
|
||||||
pub link_idx: u32,
|
|
||||||
pub id: u32,
|
pub id: u32,
|
||||||
pub length: u32,
|
pub length: u32,
|
||||||
pub redirect: bool,
|
pub redirect: bool,
|
||||||
pub title: String,
|
pub title: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub struct Page<P> {
|
||||||
|
pub link_idx: u32,
|
||||||
pub data: P,
|
pub data: P,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Page<()> {
|
impl Page<PageInfo> {
|
||||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||||
ioutil::write_u32(self.link_idx, to)?;
|
ioutil::write_u32(self.link_idx, to)?;
|
||||||
ioutil::write_u32(self.id, to)?;
|
ioutil::write_u32(self.data.id, to)?;
|
||||||
ioutil::write_u32(self.length, to)?;
|
ioutil::write_u32(self.data.length, to)?;
|
||||||
ioutil::write_u8(if self.redirect { 1 } else { 0 }, to)?;
|
ioutil::write_u8(if self.data.redirect { 1 } else { 0 }, to)?;
|
||||||
ioutil::write_str(&self.title, to)?;
|
ioutil::write_str(&self.data.title, to)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -80,41 +82,42 @@ impl Page<()> {
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
link_idx,
|
link_idx,
|
||||||
|
data: PageInfo {
|
||||||
id,
|
id,
|
||||||
length,
|
length,
|
||||||
redirect,
|
redirect,
|
||||||
title,
|
title,
|
||||||
data: (),
|
},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<P> Page<P> {
|
impl<P> Page<P> {
|
||||||
pub fn change_data<P2>(self, data: P2) -> Page<P2> {
|
pub fn change_data<P2>(self, f: &impl Fn(P) -> P2) -> Page<P2> {
|
||||||
Page {
|
Page {
|
||||||
link_idx: self.link_idx,
|
link_idx: self.link_idx,
|
||||||
id: self.id,
|
data: f(self.data),
|
||||||
length: self.length,
|
|
||||||
redirect: self.redirect,
|
|
||||||
title: self.title,
|
|
||||||
data,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct Link<L> {
|
pub struct LinkInfo {
|
||||||
pub to: u32,
|
|
||||||
pub start: u32,
|
pub start: u32,
|
||||||
pub end: u32,
|
pub end: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub struct Link<L> {
|
||||||
|
pub to: u32,
|
||||||
pub data: L,
|
pub data: L,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Link<()> {
|
impl Link<LinkInfo> {
|
||||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||||
ioutil::write_u32(self.to, to)?;
|
ioutil::write_u32(self.to, to)?;
|
||||||
ioutil::write_u32(self.start, to)?;
|
ioutil::write_u32(self.data.start, to)?;
|
||||||
ioutil::write_u32(self.end, to)?;
|
ioutil::write_u32(self.data.end, to)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -126,31 +129,35 @@ impl Link<()> {
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
to,
|
to,
|
||||||
start,
|
data: LinkInfo { start, end },
|
||||||
end,
|
|
||||||
data: (),
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<P> Link<P> {
|
impl<L> Link<L> {
|
||||||
pub fn change_data<P2>(self, data: P2) -> Link<P2> {
|
pub fn change_data<L2>(self, f: &impl Fn(L) -> L2) -> Link<L2> {
|
||||||
Link {
|
Link {
|
||||||
to: self.to,
|
to: self.to,
|
||||||
start: self.start,
|
data: f(self.data),
|
||||||
end: self.end,
|
|
||||||
data,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
|
||||||
pub struct AdjacencyList<P, L> {
|
pub struct AdjacencyList<P, L> {
|
||||||
pub pages: Vec<Page<P>>,
|
pub pages: Vec<Page<P>>,
|
||||||
pub links: Vec<Link<L>>,
|
pub links: Vec<Link<L>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AdjacencyList<(), ()> {
|
impl<P, L> Default for AdjacencyList<P, L> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
pages: Default::default(),
|
||||||
|
links: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AdjacencyList<PageInfo, LinkInfo> {
|
||||||
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
pub fn write<W: Write>(&self, to: &mut W) -> io::Result<()> {
|
||||||
ioutil::write_u32(self.pages.len() as u32, to)?;
|
ioutil::write_u32(self.pages.len() as u32, to)?;
|
||||||
ioutil::write_u32(self.links.len() as u32, to)?;
|
ioutil::write_u32(self.links.len() as u32, to)?;
|
||||||
|
|
@ -182,23 +189,21 @@ impl AdjacencyList<(), ()> {
|
||||||
|
|
||||||
Ok(Self { pages, links })
|
Ok(Self { pages, links })
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<P, L> AdjacencyList<P, L> {
|
|
||||||
pub fn check_consistency(&self) {
|
pub fn check_consistency(&self) {
|
||||||
// Check that all types are large enough
|
// Check that all types are large enough
|
||||||
assert!(self.pages.len() <= u32::MAX as usize, "pages len");
|
assert!(self.pages.len() <= u32::MAX as usize, "pages len");
|
||||||
assert!(self.links.len() <= u32::MAX as usize, "links len");
|
assert!(self.links.len() <= u32::MAX as usize, "links len");
|
||||||
for page in &self.pages {
|
for page in &self.pages {
|
||||||
assert!(page.link_idx <= u32::MAX as u32, "page link_idx");
|
assert!(page.link_idx <= u32::MAX as u32, "page link_idx");
|
||||||
assert!(page.id <= u32::MAX as u32, "page id");
|
assert!(page.data.id <= u32::MAX as u32, "page id");
|
||||||
assert!(page.length <= u32::MAX as u32, "page length");
|
assert!(page.data.length <= u32::MAX as u32, "page length");
|
||||||
assert!(page.title.len() <= u8::MAX as usize, "page title len");
|
assert!(page.data.title.len() <= u8::MAX as usize, "page title len");
|
||||||
}
|
}
|
||||||
for link in &self.links {
|
for link in &self.links {
|
||||||
assert!(link.to <= u32::MAX as u32, "link to");
|
assert!(link.to <= u32::MAX as u32, "link to");
|
||||||
assert!(link.start <= u32::MAX as u32, "link start");
|
assert!(link.data.start <= u32::MAX as u32, "link start");
|
||||||
assert!(link.end <= u32::MAX as u32, "link end");
|
assert!(link.data.end <= u32::MAX as u32, "link end");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that all links contain valid indices
|
// Check that all links contain valid indices
|
||||||
|
|
@ -209,12 +214,14 @@ impl<P, L> AdjacencyList<P, L> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn change_page_data<P2: Clone>(self, data: P2) -> AdjacencyList<P2, L> {
|
impl<P, L> AdjacencyList<P, L> {
|
||||||
|
pub fn change_page_data<P2: Clone>(self, page_f: &impl Fn(P) -> P2) -> AdjacencyList<P2, L> {
|
||||||
let pages = self
|
let pages = self
|
||||||
.pages
|
.pages
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|p| p.change_data(data.clone()))
|
.map(|p| p.change_data(page_f))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
AdjacencyList {
|
AdjacencyList {
|
||||||
|
|
@ -223,11 +230,11 @@ impl<P, L> AdjacencyList<P, L> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn change_link_data<L2: Clone>(self, data: L2) -> AdjacencyList<P, L2> {
|
pub fn change_link_data<L2: Clone>(self, link_f: &impl Fn(L) -> L2) -> AdjacencyList<P, L2> {
|
||||||
let links = self
|
let links = self
|
||||||
.links
|
.links
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|l| l.change_data(data.clone()))
|
.map(|l| l.change_data(link_f))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
AdjacencyList {
|
AdjacencyList {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue