From 423dd100c1360decffc5107ea4757d751ac0f4db Mon Sep 17 00:00:00 2001
From: Joscha
Date: Sun, 23 Feb 2025 17:19:59 +0100
Subject: [PATCH] Add unicode-based grapheme width estimation method

Add WidthEstimationMethod, which selects how WidthDb estimates the width
of a grapheme for which no measured width is available:

- Legacy (default): the previous character-wise sum of char widths.
- Unicode: the string width reported by the unicode-width crate, with
  newlines counted as zero columns.

Terminal gains set_width_estimation_method() and
width_estimation_method(). WidthDb::active is renamed to measure, next
to the new estimate field.

---
 CHANGELOG.md    |  3 ++
 src/terminal.rs | 20 ++++++++++--
 src/widthdb.rs  | 85 ++++++++++++++++++++++++++++++-------------------
 3 files changed, 73 insertions(+), 35 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d84e1fe..6292746 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,9 @@ Procedure when bumping the version number:
 
 ## Unreleased
 
+### Added
+- Unicode-based grapheme width estimation method
+
 ## v0.3.1 - 2025-02-21
 
 ### Fixed
diff --git a/src/terminal.rs b/src/terminal.rs
index 439ce4e..c26b0fc 100644
--- a/src/terminal.rs
+++ b/src/terminal.rs
@@ -16,7 +16,7 @@ use crossterm::terminal::{
 use crossterm::{ExecutableCommand, QueueableCommand};
 
 use crate::buffer::Buffer;
-use crate::{AsyncWidget, Frame, Size, Widget, WidthDb};
+use crate::{AsyncWidget, Frame, Size, Widget, WidthDb, WidthEstimationMethod};
 
 /// Wrapper that manages terminal output.
 ///
@@ -112,11 +112,25 @@ impl Terminal {
         self.frame.widthdb.tab_width
     }
 
+    /// Set the grapheme width estimation method.
+    ///
+    /// For more details, see [`WidthEstimationMethod`].
+    pub fn set_width_estimation_method(&mut self, method: WidthEstimationMethod) {
+        self.frame.widthdb.estimate = method;
+    }
+
+    /// The grapheme width estimation method.
+    ///
+    /// For more details, see [`WidthEstimationMethod`].
+    pub fn width_estimation_method(&self) -> WidthEstimationMethod {
+        self.frame.widthdb.estimate
+    }
+
     /// Enable or disable grapheme width measurements.
     ///
     /// For more details, see [`Self::measuring`].
     pub fn set_measuring(&mut self, active: bool) {
-        self.frame.widthdb.active = active;
+        self.frame.widthdb.measure = active;
     }
 
     /// Whether grapheme widths should be measured or estimated.
@@ -135,7 +149,7 @@ impl Terminal {
     /// Standard Annex #11. This usually works fine, but may break on some emoji
     /// or other less commonly used character sequences.
     pub fn measuring(&self) -> bool {
-        self.frame.widthdb.active
+        self.frame.widthdb.measure
     }
 
     /// Whether any unmeasured graphemes were seen since the last call to
diff --git a/src/widthdb.rs b/src/widthdb.rs
index 53f20ec..bb21ef6 100644
--- a/src/widthdb.rs
+++ b/src/widthdb.rs
@@ -6,14 +6,31 @@ use crossterm::style::Print;
 use crossterm::terminal::{Clear, ClearType};
 use crossterm::QueueableCommand;
 use unicode_segmentation::UnicodeSegmentation;
-use unicode_width::UnicodeWidthChar;
+use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
 
 use crate::wrap;
 
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub enum WidthEstimationMethod {
+    /// Estimate the width of a grapheme using legacy methods.
+    ///
+    /// Different terminal emulators all use different approaches to determine
+    /// grapheme widths, so this method will never be able to give a fully
+    /// correct solution. For that, the only possible approach is measuring the
+    /// actual grapheme width.
+    #[default]
+    Legacy,
+
+    /// Estimate the width of a grapheme using the unicode standard in a
+    /// best-effort manner.
+    Unicode,
+}
+
 /// Measures and stores the with (in terminal coordinates) of graphemes.
 #[derive(Debug)]
 pub struct WidthDb {
-    pub(crate) active: bool,
+    pub(crate) estimate: WidthEstimationMethod,
+    pub(crate) measure: bool,
     pub(crate) tab_width: u8,
     known: HashMap<String, u8>,
     requested: HashSet<String>,
@@ -22,7 +39,8 @@ pub struct WidthDb {
 impl Default for WidthDb {
     fn default() -> Self {
         Self {
-            active: false,
+            estimate: WidthEstimationMethod::default(),
+            measure: false,
             tab_width: 8,
             known: Default::default(),
             requested: Default::default(),
@@ -36,26 +54,6 @@ impl WidthDb {
         self.tab_width - (col % self.tab_width as usize) as u8
     }
 
-    /// Estimate what our terminal emulator thinks the width of a grapheme is.
-    ///
-    /// Different terminal emulators are all broken in different ways, so this
-    /// method will never be able to give a correct solution. For that, the only
-    /// possible method is actually measuring.
-    ///
-    /// Instead, it implements a character-wise width calculation. The hope is
-    /// that dumb terminal emulators do something roughly like this, and smart
-    /// terminal emulators try to emulate dumb ones for compatibility. In
-    /// practice, this counting approach seems to be fairly robust.
-    fn grapheme_width_estimate(grapheme: &str) -> u8 {
-        grapheme
-            .chars()
-            .filter(|c| !c.is_ascii_control())
-            .flat_map(|c| c.width())
-            .sum::<usize>()
-            .try_into()
-            .unwrap_or(u8::MAX)
-    }
-
     /// Determine the width of a grapheme.
     ///
     /// If the grapheme is a tab, the column is used to determine its width.
@@ -67,14 +65,37 @@ impl WidthDb {
         if grapheme == "\t" {
             return self.tab_width_at_column(col);
         }
-        if !self.active {
-            return Self::grapheme_width_estimate(grapheme);
-        }
-        if let Some(width) = self.known.get(grapheme) {
-            *width
-        } else {
+
+        if self.measure {
+            if let Some(width) = self.known.get(grapheme) {
+                return *width;
+            }
             self.requested.insert(grapheme.to_string());
-            Self::grapheme_width_estimate(grapheme)
+        }
+
+        match self.estimate {
+            // A character-wise width calculation is a simple and obvious
+            // approach to compute character widths. The idea is that dumb
+            // terminal emulators tend to do something roughly like this, and
+            // smart terminal emulators try to emulate dumb ones for
+            // compatibility. In practice, this approach seems to be fairly
+            // robust.
+            WidthEstimationMethod::Legacy => grapheme
+                .chars()
+                .filter(|c| !c.is_ascii_control())
+                .flat_map(|c| c.width())
+                .sum::<usize>()
+                .try_into()
+                .unwrap_or(u8::MAX),
+
+            // The unicode width crate considers newlines to have a width of 1
+            // while the rendering code expects it to have a width of 0.
+            WidthEstimationMethod::Unicode => grapheme
+                .split('\n')
+                .map(|s| s.width())
+                .sum::<usize>()
+                .try_into()
+                .unwrap_or(u8::MAX),
         }
     }
 
@@ -107,7 +128,7 @@ impl WidthDb {
     /// Whether any new graphemes have been seen since the last time
     /// [`Self::measure_widths`] was called.
     pub(crate) fn measuring_required(&self) -> bool {
-        self.active && !self.requested.is_empty()
+        self.measure && !self.requested.is_empty()
     }
 
     /// Measure the width of all new graphemes that have been seen since the
@@ -117,7 +138,7 @@ impl WidthDb {
     /// the terminal. After it finishes, the terminal's contents should be
     /// assumed to be garbage and a full redraw should be performed.
     pub(crate) fn measure_widths(&mut self, out: &mut impl Write) -> io::Result<()> {
-        if !self.active {
+        if !self.measure {
             return Ok(());
         }
         for grapheme in self.requested.drain() {