Remove leon wiki graph command
This commit is contained in:
parent
c612bd35ad
commit
0eb745e928
5 changed files with 0 additions and 129 deletions
28
brood/Cargo.lock
generated
28
brood/Cargo.lock
generated
|
|
@ -56,7 +56,6 @@ name = "brood"
|
|||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"csv",
|
||||
"rustc-hash",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
|
@ -108,27 +107,6 @@ version = "1.0.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
|
||||
dependencies = [
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
|
|
@ -147,12 +125,6 @@ version = "1.0.11"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.86"
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ edition = "2021"
|
|||
|
||||
[dependencies]
|
||||
clap = { version = "4.5.7", features = ["derive", "deprecated"] }
|
||||
csv = "1.3.0"
|
||||
rustc-hash = "2.0.0"
|
||||
serde = { version = "1.0.203", features = ["derive"] }
|
||||
serde_json = "1.0.118"
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
pub mod ingest;
|
||||
pub mod leon_wiki_graph;
|
||||
pub mod list_pages;
|
||||
pub mod longest_shortest_path;
|
||||
pub mod path;
|
||||
|
|
|
|||
|
|
@ -1,94 +0,0 @@
|
|||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::data::AdjacencyList;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct Article {
|
||||
title: String,
|
||||
url: String,
|
||||
language: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct NodeRow {
|
||||
id: u32,
|
||||
label: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct EdgeRow {
|
||||
source: u32,
|
||||
target: u32,
|
||||
}
|
||||
|
||||
pub fn run(datafile: &Path, articlesfile: &Path, language: &str) -> io::Result<()> {
|
||||
let mut databuf = BufReader::new(File::open(datafile)?);
|
||||
let data = AdjacencyList::read(&mut databuf)?;
|
||||
|
||||
let articlesbuf = BufReader::new(File::open(articlesfile)?);
|
||||
let articles: Vec<Article> =
|
||||
serde_json::from_reader(articlesbuf).expect("failed to parse articles file");
|
||||
|
||||
let titles = articles
|
||||
.into_iter()
|
||||
.filter(|a| a.language == language)
|
||||
.map(|a| a.title)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let page_ids = data
|
||||
.pages
|
||||
.split_last()
|
||||
.unwrap()
|
||||
.1
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, p)| titles.contains(&p.data.title))
|
||||
.map(|(i, _)| i as u32)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut node_rows = vec![];
|
||||
for i in &page_ids {
|
||||
let page = data.page(*i);
|
||||
let row = NodeRow {
|
||||
id: *i,
|
||||
label: page.data.title.clone(),
|
||||
};
|
||||
node_rows.push(row);
|
||||
}
|
||||
|
||||
let mut edge_rows = vec![];
|
||||
for i in &page_ids {
|
||||
let links = data
|
||||
.link_range(*i)
|
||||
.map(|li| data.link(li).to)
|
||||
.filter(|to| page_ids.contains(to))
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
for to in links {
|
||||
let row = EdgeRow {
|
||||
source: *i,
|
||||
target: to,
|
||||
};
|
||||
edge_rows.push(row);
|
||||
}
|
||||
}
|
||||
|
||||
let node_writer = BufWriter::new(File::create("nodes.csv")?);
|
||||
let mut node_writer = csv::Writer::from_writer(node_writer);
|
||||
for node in node_rows {
|
||||
node_writer.serialize(node).unwrap();
|
||||
}
|
||||
|
||||
let edge_writer = BufWriter::new(File::create("edges.csv")?);
|
||||
let mut edge_writer = csv::Writer::from_writer(edge_writer);
|
||||
for edge in edge_rows {
|
||||
edge_writer.serialize(edge).unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -25,8 +25,6 @@ enum Command {
|
|||
LongestShortestPath { from: String },
|
||||
/// Print all page titles.
|
||||
ListPages,
|
||||
/// Construct wikipedia article graph for Leon.
|
||||
LeonWikiGraph { articles: PathBuf, language: String },
|
||||
}
|
||||
|
||||
#[derive(Debug, Parser)]
|
||||
|
|
@ -52,8 +50,5 @@ fn main() -> io::Result<()> {
|
|||
commands::longest_shortest_path::run(&args.datafile, &from)
|
||||
}
|
||||
Command::ListPages => commands::list_pages::run(&args.datafile),
|
||||
Command::LeonWikiGraph { articles, language } => {
|
||||
commands::leon_wiki_graph::run(&args.datafile, &articles, &language)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue