Add unicode-based grapheme width estimation method

This commit is contained in:
Joscha 2025-02-23 17:19:59 +01:00
parent be7eff0979
commit 423dd100c1
3 changed files with 73 additions and 35 deletions

View file

@ -13,6 +13,9 @@ Procedure when bumping the version number:
## Unreleased ## Unreleased
### Added
- Unicode-based grapheme width estimation method
## v0.3.1 - 2025-02-21 ## v0.3.1 - 2025-02-21
### Fixed ### Fixed

View file

@ -16,7 +16,7 @@ use crossterm::terminal::{
use crossterm::{ExecutableCommand, QueueableCommand}; use crossterm::{ExecutableCommand, QueueableCommand};
use crate::buffer::Buffer; use crate::buffer::Buffer;
use crate::{AsyncWidget, Frame, Size, Widget, WidthDb}; use crate::{AsyncWidget, Frame, Size, Widget, WidthDb, WidthEstimationMethod};
/// Wrapper that manages terminal output. /// Wrapper that manages terminal output.
/// ///
@ -112,11 +112,25 @@ impl Terminal {
self.frame.widthdb.tab_width self.frame.widthdb.tab_width
} }
/// Choose how grapheme widths are estimated.
///
/// The chosen method is stored in the frame's width database. For a
/// description of the available methods, see [`WidthEstimationMethod`].
pub fn set_width_estimation_method(&mut self, method: WidthEstimationMethod) {
    let widthdb = &mut self.frame.widthdb;
    widthdb.estimate = method;
}
/// The grapheme width estimation method currently in use.
///
/// Returns the value stored by [`Self::set_width_estimation_method`]. The
/// method is returned by value since [`WidthEstimationMethod`] is `Copy`.
///
/// For more details, see [`WidthEstimationMethod`].
// Reading the setting does not mutate anything, so a shared borrow suffices.
pub fn width_estimation_method(&self) -> WidthEstimationMethod {
    self.frame.widthdb.estimate
}
/// Enable or disable grapheme width measurements. /// Enable or disable grapheme width measurements.
/// ///
/// For more details, see [`Self::measuring`]. /// For more details, see [`Self::measuring`].
pub fn set_measuring(&mut self, active: bool) { pub fn set_measuring(&mut self, active: bool) {
self.frame.widthdb.active = active; self.frame.widthdb.measure = active;
} }
/// Whether grapheme widths should be measured or estimated. /// Whether grapheme widths should be measured or estimated.
@ -135,7 +149,7 @@ impl Terminal {
/// Standard Annex #11. This usually works fine, but may break on some emoji /// Standard Annex #11. This usually works fine, but may break on some emoji
/// or other less commonly used character sequences. /// or other less commonly used character sequences.
pub fn measuring(&self) -> bool { pub fn measuring(&self) -> bool {
self.frame.widthdb.active self.frame.widthdb.measure
} }
/// Whether any unmeasured graphemes were seen since the last call to /// Whether any unmeasured graphemes were seen since the last call to

View file

@ -6,14 +6,31 @@ use crossterm::style::Print;
use crossterm::terminal::{Clear, ClearType}; use crossterm::terminal::{Clear, ClearType};
use crossterm::QueueableCommand; use crossterm::QueueableCommand;
use unicode_segmentation::UnicodeSegmentation; use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthChar; use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
use crate::wrap; use crate::wrap;
/// How the width of a grapheme is estimated.
///
/// The estimate is used whenever measuring is disabled, or when measuring is
/// enabled but the grapheme's width has not been measured yet. Estimated
/// widths that do not fit into a `u8` are clamped to `u8::MAX`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum WidthEstimationMethod {
/// Estimate the width of a grapheme using legacy methods.
///
/// The widths of the grapheme's individual characters are added up, with
/// ASCII control characters being ignored.
///
/// Different terminal emulators all use different approaches to determine
/// grapheme widths, so this method will never be able to give a fully
/// correct solution. For that, the only possible approach is measuring the
/// actual grapheme width.
///
/// This is the default method.
#[default]
Legacy,
/// Estimate the width of a grapheme using the unicode standard in a
/// best-effort manner.
///
/// The width is computed on the grapheme as a string rather than
/// character by character. Newlines are treated as having a width of 0.
Unicode,
}
/// Measures and stores the width (in terminal coordinates) of graphemes. /// Measures and stores the width (in terminal coordinates) of graphemes.
#[derive(Debug)] #[derive(Debug)]
pub struct WidthDb { pub struct WidthDb {
pub(crate) active: bool, pub(crate) estimate: WidthEstimationMethod,
pub(crate) measure: bool,
pub(crate) tab_width: u8, pub(crate) tab_width: u8,
known: HashMap<String, u8>, known: HashMap<String, u8>,
requested: HashSet<String>, requested: HashSet<String>,
@ -22,7 +39,8 @@ pub struct WidthDb {
impl Default for WidthDb { impl Default for WidthDb {
fn default() -> Self { fn default() -> Self {
Self { Self {
active: false, estimate: WidthEstimationMethod::default(),
measure: false,
tab_width: 8, tab_width: 8,
known: Default::default(), known: Default::default(),
requested: Default::default(), requested: Default::default(),
@ -36,26 +54,6 @@ impl WidthDb {
self.tab_width - (col % self.tab_width as usize) as u8 self.tab_width - (col % self.tab_width as usize) as u8
} }
/// Guess how wide the terminal emulator will render a grapheme.
///
/// No estimate can be correct for every terminal emulator, since they all
/// misbehave in their own ways; only an actual measurement is reliable.
///
/// The guess is a plain character-wise sum: every character of the grapheme
/// that is not an ASCII control character contributes its own width, and
/// characters without a width contribute nothing. The assumption is that
/// simple terminal emulators behave roughly like this and more
/// sophisticated ones imitate them for compatibility. In practice, this
/// counting approach seems to be fairly robust.
///
/// Sums that do not fit into a `u8` are clamped to `u8::MAX`.
fn grapheme_width_estimate(grapheme: &str) -> u8 {
    let mut total: usize = 0;
    for ch in grapheme.chars() {
        // ASCII control characters never count towards the width.
        if ch.is_ascii_control() {
            continue;
        }
        if let Some(w) = ch.width() {
            total += w;
        }
    }
    total.try_into().unwrap_or(u8::MAX)
}
/// Determine the width of a grapheme. /// Determine the width of a grapheme.
/// ///
/// If the grapheme is a tab, the column is used to determine its width. /// If the grapheme is a tab, the column is used to determine its width.
@ -67,14 +65,37 @@ impl WidthDb {
if grapheme == "\t" { if grapheme == "\t" {
return self.tab_width_at_column(col); return self.tab_width_at_column(col);
} }
if !self.active {
return Self::grapheme_width_estimate(grapheme); if self.measure {
}
if let Some(width) = self.known.get(grapheme) { if let Some(width) = self.known.get(grapheme) {
*width return *width;
} else { }
self.requested.insert(grapheme.to_string()); self.requested.insert(grapheme.to_string());
Self::grapheme_width_estimate(grapheme) }
match self.estimate {
// A character-wise width calculation is a simple and obvious
// approach to compute character widths. The idea is that dumb
// terminal emulators tend to do something roughly like this, and
// smart terminal emulators try to emulate dumb ones for
// compatibility. In practice, this approach seems to be fairly
// robust.
WidthEstimationMethod::Legacy => grapheme
.chars()
.filter(|c| !c.is_ascii_control())
.flat_map(|c| c.width())
.sum::<usize>()
.try_into()
.unwrap_or(u8::MAX),
// The unicode width crate considers newlines to have a width of 1
// while the rendering code expects it to have a width of 0.
WidthEstimationMethod::Unicode => grapheme
.split('\n')
.map(|s| s.width())
.sum::<usize>()
.try_into()
.unwrap_or(u8::MAX),
} }
} }
@ -107,7 +128,7 @@ impl WidthDb {
/// Whether any new graphemes have been seen since the last time /// Whether any new graphemes have been seen since the last time
/// [`Self::measure_widths`] was called. /// [`Self::measure_widths`] was called.
pub(crate) fn measuring_required(&self) -> bool { pub(crate) fn measuring_required(&self) -> bool {
self.active && !self.requested.is_empty() self.measure && !self.requested.is_empty()
} }
/// Measure the width of all new graphemes that have been seen since the /// Measure the width of all new graphemes that have been seen since the
@ -117,7 +138,7 @@ impl WidthDb {
/// the terminal. After it finishes, the terminal's contents should be /// the terminal. After it finishes, the terminal's contents should be
/// assumed to be garbage and a full redraw should be performed. /// assumed to be garbage and a full redraw should be performed.
pub(crate) fn measure_widths(&mut self, out: &mut impl Write) -> io::Result<()> { pub(crate) fn measure_widths(&mut self, out: &mut impl Write) -> io::Result<()> {
if !self.active { if !self.measure {
return Ok(()); return Ok(());
} }
for grapheme in self.requested.drain() { for grapheme in self.requested.drain() {