From 0eb745e928f24d351bcc4e37179f5931a0349411 Mon Sep 17 00:00:00 2001 From: Joscha Date: Sun, 25 Aug 2024 20:32:06 +0200 Subject: [PATCH] Remove leon wiki graph command --- brood/Cargo.lock | 28 -------- brood/Cargo.toml | 1 - brood/src/commands.rs | 1 - brood/src/commands/leon_wiki_graph.rs | 94 --------------------------- brood/src/main.rs | 5 -- 5 files changed, 129 deletions(-) delete mode 100644 brood/src/commands/leon_wiki_graph.rs diff --git a/brood/Cargo.lock b/brood/Cargo.lock index 130b704..0162043 100644 --- a/brood/Cargo.lock +++ b/brood/Cargo.lock @@ -56,7 +56,6 @@ name = "brood" version = "0.0.0" dependencies = [ "clap", - "csv", "rustc-hash", "serde", "serde_json", @@ -108,27 +107,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" -[[package]] -name = "csv" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - [[package]] name = "heck" version = "0.5.0" @@ -147,12 +125,6 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" -[[package]] -name = "memchr" -version = "2.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" - [[package]] name = "proc-macro2" version = "1.0.86" diff --git a/brood/Cargo.toml b/brood/Cargo.toml index 188b948..940f920 100644 --- a/brood/Cargo.toml +++ b/brood/Cargo.toml @@ -5,7 +5,6 @@ edition = "2021" [dependencies] clap = { version = "4.5.7", features = ["derive", "deprecated"] } -csv = "1.3.0" rustc-hash = "2.0.0" serde = { version = "1.0.203", features = ["derive"] } serde_json = "1.0.118" diff --git a/brood/src/commands.rs b/brood/src/commands.rs index 6e64354..02d47b9 100644 --- a/brood/src/commands.rs +++ b/brood/src/commands.rs @@ -1,5 +1,4 @@ pub mod ingest; -pub mod leon_wiki_graph; pub mod list_pages; pub mod longest_shortest_path; pub mod path; diff --git a/brood/src/commands/leon_wiki_graph.rs b/brood/src/commands/leon_wiki_graph.rs deleted file mode 100644 index d8b1a0f..0000000 --- a/brood/src/commands/leon_wiki_graph.rs +++ /dev/null @@ -1,94 +0,0 @@ -use std::collections::HashSet; -use std::fs::File; -use std::io::{self, BufReader, BufWriter}; -use std::path::Path; - -use serde::{Deserialize, Serialize}; - -use crate::data::AdjacencyList; - -#[derive(Deserialize)] -struct Article { - title: String, - url: String, - language: String, -} - -#[derive(Serialize)] -struct NodeRow { - id: u32, - label: String, -} - -#[derive(Serialize)] -struct EdgeRow { - source: u32, - target: u32, -} - -pub fn run(datafile: &Path, articlesfile: &Path, language: &str) -> io::Result<()> { - let mut databuf = BufReader::new(File::open(datafile)?); - let data = AdjacencyList::read(&mut databuf)?; - - let articlesbuf = BufReader::new(File::open(articlesfile)?); - let articles: Vec
= - serde_json::from_reader(articlesbuf).expect("failed to parse articles file"); - - let titles = articles - .into_iter() - .filter(|a| a.language == language) - .map(|a| a.title) - .collect::>(); - - let page_ids = data - .pages - .split_last() - .unwrap() - .1 - .iter() - .enumerate() - .filter(|(_, p)| titles.contains(&p.data.title)) - .map(|(i, _)| i as u32) - .collect::>(); - - let mut node_rows = vec![]; - for i in &page_ids { - let page = data.page(*i); - let row = NodeRow { - id: *i, - label: page.data.title.clone(), - }; - node_rows.push(row); - } - - let mut edge_rows = vec![]; - for i in &page_ids { - let links = data - .link_range(*i) - .map(|li| data.link(li).to) - .filter(|to| page_ids.contains(to)) - .collect::>(); - - for to in links { - let row = EdgeRow { - source: *i, - target: to, - }; - edge_rows.push(row); - } - } - - let node_writer = BufWriter::new(File::create("nodes.csv")?); - let mut node_writer = csv::Writer::from_writer(node_writer); - for node in node_rows { - node_writer.serialize(node).unwrap(); - } - - let edge_writer = BufWriter::new(File::create("edges.csv")?); - let mut edge_writer = csv::Writer::from_writer(edge_writer); - for edge in edge_rows { - edge_writer.serialize(edge).unwrap(); - } - - Ok(()) -} diff --git a/brood/src/main.rs b/brood/src/main.rs index 91e2e1b..d97d617 100644 --- a/brood/src/main.rs +++ b/brood/src/main.rs @@ -25,8 +25,6 @@ enum Command { LongestShortestPath { from: String }, /// Print all page titles. ListPages, - /// Construct wikipedia article graph for Leon. - LeonWikiGraph { articles: PathBuf, language: String }, } #[derive(Debug, Parser)] @@ -52,8 +50,5 @@ fn main() -> io::Result<()> { commands::longest_shortest_path::run(&args.datafile, &from) } Command::ListPages => commands::list_pages::run(&args.datafile), - Command::LeonWikiGraph { articles, language } => { - commands::leon_wiki_graph::run(&args.datafile, &articles, &language) - } } }