Speed up ingest using rustc_hash

An enwiki ingest went from ca. 6:50 minutes down to ca. 7:00 minutes. Oh
wait...

This was not a rigorous test, but rustc_hash doesn't seem to have a
significant positive impact. Maybe I'm just holding it wrong, but right
now I'd rather remove it again and have simpler code/deps.
This commit is contained in:
Joscha 2024-12-31 02:37:15 +01:00
parent eb631250d7
commit e04215802e

View file

@ -1,10 +1,11 @@
use std::{ use std::{
collections::{hash_map::Entry, HashMap}, collections::hash_map::Entry,
fs::File, fs::File,
io::{self, BufRead, BufReader, Seek}, io::{self, BufRead, BufReader, Seek},
path::{Path, PathBuf}, path::{Path, PathBuf},
}; };
use rustc_hash::FxHashMap;
use serde::Deserialize; use serde::Deserialize;
use thousands::Separable; use thousands::Separable;
@ -48,9 +49,9 @@ fn read_titles(r: &mut BufReader<File>) -> io::Result<Vec<String>> {
fn compute_title_lookup( fn compute_title_lookup(
normalizer: &TitleNormalizer, normalizer: &TitleNormalizer,
titles: &[String], titles: &[String],
) -> HashMap<String, (u32, u32)> { ) -> FxHashMap<String, (u32, u32)> {
let mut counter = Counter::new(); let mut counter = Counter::new();
let mut title_lookup = HashMap::<String, (u32, u32)>::new(); let mut title_lookup = FxHashMap::<String, (u32, u32)>::default();
for (sift_i, title) in titles.iter().enumerate() { for (sift_i, title) in titles.iter().enumerate() {
counter.tick(); counter.tick();
@ -85,7 +86,7 @@ fn compute_title_lookup(
fn read_page_data( fn read_page_data(
normalizer: &TitleNormalizer, normalizer: &TitleNormalizer,
title_lookup: &HashMap<String, (u32, u32)>, title_lookup: &FxHashMap<String, (u32, u32)>,
r: &mut BufReader<File>, r: &mut BufReader<File>,
) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> { ) -> io::Result<(Vec<Page>, Vec<Link>, Graph)> {
let mut counter = Counter::new(); let mut counter = Counter::new();