From e04215802e2ffeeb12a6c85d304b66fa5ccc569a Mon Sep 17 00:00:00 2001
From: Joscha
Date: Tue, 31 Dec 2024 02:37:15 +0100
Subject: [PATCH] Speed up ingest using rustc_hash

An enwiki ingest went from ca. 6:50 minutes down to ca. 7:00 minutes.
Oh wait... This was not a rigorous test, but rustc_hash doesn't seem to
have a significant positive impact. Maybe I'm just holding it wrong, but
right now I'd rather remove it again and have simpler code/deps.
---
 brood/src/commands/ingest.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/brood/src/commands/ingest.rs b/brood/src/commands/ingest.rs
index 7e0f223..fd26d39 100644
--- a/brood/src/commands/ingest.rs
+++ b/brood/src/commands/ingest.rs
@@ -1,10 +1,11 @@
 use std::{
-    collections::{hash_map::Entry, HashMap},
+    collections::hash_map::Entry,
     fs::File,
     io::{self, BufRead, BufReader, Seek},
     path::{Path, PathBuf},
 };
 
+use rustc_hash::FxHashMap;
 use serde::Deserialize;
 use thousands::Separable;
 
@@ -48,9 +49,9 @@ fn read_titles(r: &mut BufReader) -> io::Result> {
 fn compute_title_lookup(
     normalizer: &TitleNormalizer,
     titles: &[String],
-) -> HashMap {
+) -> FxHashMap {
     let mut counter = Counter::new();
-    let mut title_lookup = HashMap::::new();
+    let mut title_lookup = FxHashMap::::default();
 
     for (sift_i, title) in titles.iter().enumerate() {
         counter.tick();
@@ -85,7 +86,7 @@ fn compute_title_lookup(
 
 fn read_page_data(
     normalizer: &TitleNormalizer,
-    title_lookup: &HashMap,
+    title_lookup: &FxHashMap,
     r: &mut BufReader,
 ) -> io::Result<(Vec, Vec, Graph)> {
     let mut counter = Counter::new();