Compute article rankings by link degrees

This commit is contained in:
Joscha 2024-12-31 18:26:12 +01:00
parent 5b8feb6368
commit cdf9a7d7ae
2 changed files with 87 additions and 0 deletions

View file

@ -1,3 +1,4 @@
mod degrees;
mod redirects; mod redirects;
use std::io; use std::io;
@ -8,6 +9,7 @@ use crate::data::Data;
#[derive(Debug, clap::Parser)] #[derive(Debug, clap::Parser)]
enum Command { enum Command {
Degrees(degrees::Cmd),
Redirects(redirects::Cmd), Redirects(redirects::Cmd),
} }
@ -22,6 +24,7 @@ impl Cmd {
pub fn run(self, data: Data) -> io::Result<()> { pub fn run(self, data: Data) -> io::Result<()> {
if let Some(cmd) = self.command { if let Some(cmd) = self.command {
return match cmd { return match cmd {
Command::Degrees(cmd) => cmd.run(data),
Command::Redirects(cmd) => cmd.run(data), Command::Redirects(cmd) => cmd.run(data),
}; };
} }

View file

@ -0,0 +1,84 @@
use std::{cmp::Reverse, io};
use crate::{
algo,
data::{Data, Page},
util,
};
// Command-line arguments for the `degrees` subcommand.
// NOTE: the `///` line below is more than documentation: clap's derive uses a
// struct's doc comment as the command's help text, so rewording it changes
// what the program prints for `--help`.
/// Show stats on article in- and out-degrees.
#[derive(Debug, clap::Parser)]
pub struct Cmd {
// Maximum number of pages printed in each ranking (consumed by
// `print_links` via `take(self.top)`); defaults to 5.
// Kept as a plain `//` comment on purpose: a `///` comment on this field
// would be picked up by clap as the flag's help text.
#[arg(long, short, default_value_t = 5)]
top: usize,
}
impl Cmd {
    /// Prints rankings of pages by their link degrees.
    ///
    /// Counts the outgoing edges of every graph node, inverts the graph with
    /// [`algo::invert`] and counts again to obtain the incoming edges, then
    /// prints four rankings, each limited to `self.top` entries:
    ///
    /// 1. most outlinks,
    /// 2. most inlinks,
    /// 3. highest outlink/inlink ratio,
    /// 4. highest inlink/outlink ratio.
    ///
    /// The two ratio rankings only consider pages whose in- and out-degree
    /// are both non-zero.
    ///
    /// `data` is taken by value because the graph is inverted in place.
    /// This function currently always returns `Ok(())`.
    pub fn run(self, mut data: Data) -> io::Result<()> {
        println!(">> Outdegree");
        println!("> Counting links");
        let outdegree = Self::count_links(&data);

        println!(">> Indegree");
        println!("> Inverting edges");
        // After inversion the out-edges of a node are presumably its original
        // in-edges, so the same counting routine yields the indegree.
        algo::invert(&mut data);
        println!("> Counting links");
        let indegree = Self::count_links(&data);

        // One `(page, outdegree, indegree)` row per page, in page order.
        let mut by_degrees = data
            .pages
            .iter()
            .zip(outdegree)
            .zip(indegree)
            .map(|((page, od), id)| (page, od, id))
            .collect::<Vec<_>>();

        println!();
        println!("Most outlinks");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by_key(|&(_, od, _)| Reverse(od));
        self.print_links(&by_degrees);

        println!();
        println!("Most inlinks");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by_key(|&(_, _, id)| Reverse(id));
        self.print_links(&by_degrees);

        // Ratios are only meaningful when both degrees are non-zero; this
        // also guarantees that the comparison below never divides by zero.
        by_degrees.retain(|&(_, od, id)| od > 0 && id > 0);

        println!();
        println!("Most outlinks per non-zero inlink");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
        by_degrees.sort_by(|&(_, od1, id1), &(_, od2, id2)| {
            // Compare od2/id2 with od1/id1 exactly by cross-multiplying
            // (valid because both indegrees are > 0). Widening to u128 keeps
            // the products from overflowing and avoids the rounding that a
            // floating-point division would introduce.
            // Reverse order so max values are at the beginning.
            let scaled_r2 = (od2 as u128) * (id1 as u128);
            let scaled_r1 = (od1 as u128) * (id2 as u128);
            scaled_r2.cmp(&scaled_r1)
        });
        self.print_links(&by_degrees);

        println!();
        println!("Most inlinks per non-zero outlink");
        println!("¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯");
        // Descending by out/in reversed is descending by in/out.
        by_degrees.reverse();
        self.print_links(&by_degrees);

        Ok(())
    }

    /// Returns, for every page index, the number of edges in that node's
    /// edge range in the current graph.
    ///
    /// The result has one slot per page. Slots that are not visited while
    /// iterating `data.graph.nodes()` (if there are any) keep the
    /// placeholder value `usize::MAX`.
    fn count_links(data: &Data) -> Vec<usize> {
        let mut degrees = vec![usize::MAX; data.pages.len()];
        for node in data.graph.nodes() {
            degrees[node.usize()] = data.graph.edge_range(node).len();
        }
        degrees
    }

    /// Prints the first `self.top` rows of `by_degrees`, one page per line,
    /// together with its out- and indegree.
    fn print_links(&self, by_degrees: &[(&Page, usize, usize)]) {
        for (page, od, id) in by_degrees.iter().take(self.top) {
            println!("{} ({od} out, {id} in)", util::fmt_page(page));
        }
    }
}