From cc3f85e6e14e6250da8a36cfbbc7f5cf31f5111e Mon Sep 17 00:00:00 2001 From: Joscha Date: Tue, 26 Nov 2024 19:38:55 +0100 Subject: [PATCH] Model and render elements --- src/check.rs | 78 ++++++++++++++++ src/element.rs | 92 +++++++++++++++++++ src/elements.rs | 166 ++++++++++++++++++++++++++++++++++ src/lib.rs | 123 +++++++++++++++++++++++++ src/render.rs | 235 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 694 insertions(+) create mode 100644 src/check.rs create mode 100644 src/element.rs create mode 100644 src/elements.rs create mode 100644 src/render.rs diff --git a/src/check.rs b/src/check.rs new file mode 100644 index 0000000..4fc811f --- /dev/null +++ b/src/check.rs @@ -0,0 +1,78 @@ +/// +pub fn is_ascii_alpha(c: char) -> bool { + c.is_ascii_alphabetic() +} + +/// +pub fn is_ascii_alphanumeric(c: char) -> bool { + c.is_ascii_alphanumeric() +} + +/// +/// +/// The rules around what is a valid tag name are complicated. The standard +/// doesn't give an easy answer. Because of this, we're conservative in what we +/// allow. This way, the output we produce should parse correctly in a wide +/// range of circumstances while following the standard. +pub fn is_valid_tag_name(name: &str) -> bool { + !name.is_empty() + && name.chars().take(1).all(is_ascii_alpha) + && name.chars().all(is_ascii_alphanumeric) +} + +/// +/// +/// The rules around what is a valid attribute name are complicated. The +/// standard doesn't give an easy answer. Because of this, we're conservative in +/// what we allow. This way, the output we produce should parse correctly in a +/// wide range of circumstances while following the standard. +pub fn is_valid_attribute_name(name: &str) -> bool { + !name.is_empty() + && name.chars().take(1).all(is_ascii_alpha) + && name + .chars() + .all(|c| is_ascii_alphanumeric(c) || c == '-' || c == '_') +} + +/// https://html.spec.whatwg.org/multipage/syntax.html#cdata-rcdata-restrictions +/// +/// The tag name must be ascii-only. +pub fn is_valid_raw_text(tag_name: &str, text: &str) -> bool { + // In case we ever decide to relax tag name ascii requirements. + assert!(tag_name.is_ascii()); + + // "The text in raw text and escapable raw text elements must not contain + // any occurrences of the string "(); + + // "[...] followed by characters that case-insensitively match the tag + // name of the element [...]" + // + // Note: Since we know that tag names are ascii-only, we can convert + // both to lowercase for a case-insensitive comparison without weird + // unicode shenanigans. + if potential_tag_name.to_ascii_lowercase() != tag_name.to_ascii_lowercase() { + continue; + } + + // "[...] followed by [...]" + let Some(trailing) = text[start + potential_tag_name.len()..].chars().next() else { + continue; + }; + + // "[...] one of U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED + // (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), U+0020 + // SPACE, U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/)." + if matches!(trailing, '\t' | '\n' | '\x0C' | '\r' | ' ' | '>' | '/') { + return false; + } + } + true +} diff --git a/src/element.rs b/src/element.rs new file mode 100644 index 0000000..8da0968 --- /dev/null +++ b/src/element.rs @@ -0,0 +1,92 @@ +use std::collections::BTreeMap; + +/// +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum ElementKind { + Void, + Template, + RawText, + EscapableRawText, + Foreign, + Normal, +} + +pub enum Content { + Raw(String), + Text(String), + Comment(String), + Element(Element), +} + +impl Content { + pub fn raw(str: impl ToString) -> Self { + Self::Raw(str.to_string()) + } + + pub fn text(str: impl ToString) -> Self { + Self::Text(str.to_string()) + } + + pub fn comment(str: impl ToString) -> Self { + Self::Comment(str.to_string()) + } + + pub fn doctype() -> Self { + Self::raw("") + } +} + +impl From for Content { + fn from(value: String) -> Self { + Self::text(value) + } +} + +impl From<&str> for Content { + fn from(value: &str) -> Self { + Self::text(value) + } +} + +impl From for Content { + fn from(value: Element) -> Self { + Self::Element(value) + } +} + +pub struct Element { + pub name: String, + pub kind: ElementKind, + pub attributes: BTreeMap, + pub children: Vec, +} + +impl Element { + pub fn new(name: impl ToString, kind: ElementKind) -> Self { + Self { + name: name.to_string().to_ascii_lowercase(), + kind, + attributes: BTreeMap::new(), + children: vec![], + } + } + + pub fn normal(name: impl ToString) -> Self { + Self::new(name, ElementKind::Normal) + } + + pub fn attr(mut self, name: impl ToString, value: impl ToString) -> Self { + self.attributes + .insert(name.to_string().to_ascii_lowercase(), value.to_string()); + self + } + + pub fn attr_true(self, name: impl ToString) -> Self { + self.attr(name, "") + } + + pub fn child(mut self, child: impl Into) -> Self { + self.children.push(child.into()); + self + } +} diff --git a/src/elements.rs b/src/elements.rs new file mode 100644 index 0000000..2e117e7 --- /dev/null +++ b/src/elements.rs @@ -0,0 +1,166 @@ +//! Definitions for all non-deprecated HTML elements. +//! +//! + +use crate::{Element, ElementKind}; + +macro_rules! element { + ( $name:ident ) => { + element!($name, ElementKind::Normal); + }; + ( $name:ident, $kind:expr ) => { + pub fn $name() -> Element { + Element::new(stringify!($name), $kind) + } + }; +} + +// Main root +element!(html); + +// Document metadata +element!(base, ElementKind::Void); +element!(head); +element!(link, ElementKind::Void); +element!(meta, ElementKind::Void); +element!(style, ElementKind::RawText); +element!(title, ElementKind::EscapableRawText); + +// Sectioning root +element!(body); + +// Content sectioning +element!(address); +element!(article); +element!(aside); +element!(footer); +element!(header); +element!(h1); +element!(h2); +element!(h3); +element!(h4); +element!(h5); +element!(h6); +element!(hgroup); +element!(main); +element!(nav); +element!(section); +element!(search); + +// Text content +element!(blockquote); +element!(dd); +element!(div); +element!(dl); +element!(dt); +element!(figcaption); +element!(figure); +element!(hr, ElementKind::Void); +element!(li); +element!(menu); +element!(ol); +element!(p); +element!(pre); +element!(ul); + +// Inline text semantics +element!(a); +element!(abbr); +element!(b); +element!(bdi); +element!(bdo); +element!(br, ElementKind::Void); +element!(cite); +element!(code); +element!(data); +element!(dfn); +element!(em); +element!(i); +element!(kbd); +element!(mark); +element!(q); +element!(rp); +element!(rt); +element!(ruby); +element!(s); +element!(samp); +element!(small); +element!(span); +element!(strong); +element!(sub); +element!(sup); +element!(time); +element!(u); +element!(var); +element!(wbr, ElementKind::Void); + +// Image and multimedia +element!(area, ElementKind::Void); +element!(audio); +element!(img, ElementKind::Void); +element!(map); +element!(track, ElementKind::Void); +element!(video); + +// Embedded content +element!(embed, ElementKind::Void); +element!(fencedframe); +element!(iframe); +element!(object); +element!(picture); +element!(portal); +element!(source, ElementKind::Void); + +// SVG and MathML +// TODO Proper SVG and MathML support +element!(svg, ElementKind::Foreign); +element!(math, ElementKind::Foreign); + +// Scripting +element!(canvas); +element!(noscript); +element!(script, ElementKind::RawText); + +// Demarcating edits +element!(del); +element!(ins); + +// Table content +element!(caption); +element!(col, ElementKind::Void); +element!(colgroup); +element!(table); +element!(tbody); +element!(td); +element!(tfoot); +element!(th); +element!(thead); +element!(tr); + +// Forms +element!(button); +element!(datalist); +element!(fieldset); +element!(form); +element!(input, ElementKind::Void); +element!(label); +element!(legend); +element!(meter); +element!(optgroup); +element!(option); +element!(output); +element!(progress); +element!(select); +element!(textarea, ElementKind::EscapableRawText); + +// Interactive elements +element!(details); +element!(dialog); +element!(summary); + +// Web Components +element!(slot); +element!(template, ElementKind::Template); + +// Obsolete and deprecated elements +// Intentionally excluded! diff --git a/src/lib.rs b/src/lib.rs index 8b13789..7a186fd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1 +1,124 @@ +//! Create HTML by manipulating elements as structured data. Inspired by the +//! clojure library [hiccup][hiccup]. +//! +//! [hiccup]: https://github.com/weavejester/hiccup +mod check; +mod element; +pub mod elements; +mod render; + +pub use self::{element::*, elements::*, render::*}; + +#[cfg(test)] +mod tests { + use crate::{elements::*, render::Render, Content, Element}; + + #[test] + fn simple_website() { + let els = [ + Content::doctype(), + html() + .child(head().child(title().child("Hello"))) + .child( + body() + .child(h1().child("Hello")) + .child(p().child("Hello ").child(em().child("world")).child("!")), + ) + .into(), + ]; + + assert_eq!( + els.render_to_string().unwrap(), + concat!( + "", + "Hello", + "

Hello

Hello world!

", + "", + ), + ); + } + + #[test] + fn void_elements() { + // Difference between void and non-void + assert_eq!(head().render_to_string().unwrap(), ""); + assert_eq!(input().render_to_string().unwrap(), ""); + + // Void elements must not contain any children + assert!(input().child(p()).render_to_string().is_err()); + } + + #[test] + fn raw_text_elements() { + assert_eq!( + script() + .child("foo ", + ); + + println!( + "{:?}", + script().child("hello world").render_to_string(), + ); + + assert!(script() + .child("hello world") + .render_to_string() + .is_err()); + + assert!(script() + .child("hello & bar") + .render_to_string() + .unwrap(), + "", + ); + + assert!(textarea().child(p()).render_to_string().is_err()); + } + + #[test] + fn attributes() { + assert_eq!( + input() + .attr("name", "tentacles") + .attr("type", "number") + .attr("min", 10) + .attr("max", 100) + .render_to_string() + .unwrap(), + r#""#, + ); + + assert_eq!( + input() + .attr("name", "horns") + .attr_true("checked") + .render_to_string() + .unwrap(), + r#""#, + ); + } + + #[test] + fn always_lowercase() { + assert_eq!( + Element::normal("HTML") + .attr("LANG", "EN") + .render_to_string() + .unwrap(), + r#""#, + ); + } +} diff --git a/src/render.rs b/src/render.rs new file mode 100644 index 0000000..c1d4a0a --- /dev/null +++ b/src/render.rs @@ -0,0 +1,235 @@ +use std::{error, fmt}; + +use crate::{ + check, + element::{Content, Element, ElementKind}, +}; + +#[derive(Debug)] +pub enum ErrorCause { + Format(fmt::Error), + InvalidTagName(String), + InvalidAttrName(String), + InvalidChild, + InvalidRawText(String), +} + +#[derive(Debug)] +pub struct Error { + reverse_path: Vec, + cause: ErrorCause, +} + +impl Error { + pub fn new(cause: ErrorCause) -> Self { + Self { + reverse_path: vec![], + cause, + } + } + + pub fn at(mut self, index: usize, child: &Content) -> Self { + self.reverse_path.push(match child { + Content::Element(el) => format!("{index}:{}", el.name), + _ => index.to_string(), + }); + self + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let path = self + .reverse_path + .iter() + .rev() + .map(|s| s as &str) + .collect::>() + .join("."); + + write!(f, "Render error at {path}: ")?; + + match &self.cause { + ErrorCause::Format(error) => write!(f, "{error}")?, + ErrorCause::InvalidTagName(name) => write!(f, "Invalid tag name {name:?}")?, + ErrorCause::InvalidAttrName(name) => write!(f, "Invalid attribute name {name:?}")?, + ErrorCause::InvalidChild => write!(f, "Invalid child")?, + ErrorCause::InvalidRawText(text) => write!(f, "Invalid raw text {text:?}")?, + } + + Ok(()) + } +} + +impl error::Error for Error {} + +impl From for Error { + fn from(value: fmt::Error) -> Self { + Self::new(ErrorCause::Format(value)) + } +} + +pub type Result = std::result::Result; + +pub trait Render { + fn render(&self, w: &mut W) -> Result<()>; + + fn render_to_string(&self) -> Result { + let mut result = String::new(); + self.render(&mut result)?; + Ok(result) + } +} + +impl Render for [Content] { + fn render(&self, w: &mut W) -> Result<()> { + for content in self { + content.render(w)?; + } + Ok(()) + } +} + +impl Render for Content { + fn render(&self, w: &mut W) -> Result<()> { + match self { + Self::Raw(text) => write!(w, "{text}")?, + Self::Text(text) => render_text(w, text)?, + Self::Comment(text) => render_comment(w, text)?, + Self::Element(element) => element.render(w)?, + } + Ok(()) + } +} + +impl Render for Element { + fn render(&self, w: &mut W) -> Result<()> { + // Checks + if !check::is_valid_tag_name(&self.name) { + return Err(Error::new(ErrorCause::InvalidTagName(self.name.clone()))); + } + for name in self.attributes.keys() { + if !check::is_valid_attribute_name(name) { + return Err(Error::new(ErrorCause::InvalidAttrName(name.clone()))); + } + } + + // Opening tag + write!(w, "<{}", self.name)?; + for (name, value) in &self.attributes { + write!(w, " {name}")?; + if !value.is_empty() { + write!(w, "=")?; + render_attribute_value(w, value)?; + } + } + if self.children.is_empty() { + // Closing early + match self.kind { + ElementKind::Void => write!(w, ">")?, + ElementKind::Foreign => write!(w, " />")?, + _ => write!(w, ">", self.name)?, + } + return Ok(()); + } + write!(w, ">")?; + + // Children + for (i, child) in self.children.iter().enumerate() { + match self.kind { + ElementKind::Void => Err(Error::new(ErrorCause::InvalidChild)), + ElementKind::RawText => match child { + c @ Content::Raw(_) => c.render(w), + Content::Text(text) if check::is_valid_raw_text(&self.name, text) => { + write!(w, "{text}").map_err(|e| e.into()) + } + Content::Text(text) => { + Err(Error::new(ErrorCause::InvalidRawText(text.clone()))) + } + _ => Err(Error::new(ErrorCause::InvalidChild)), + }, + ElementKind::EscapableRawText => match child { + c @ (Content::Raw(_) | Content::Text(_)) => c.render(w), + _ => Err(Error::new(ErrorCause::InvalidChild)), + }, + _ => child.render(w), + } + .map_err(|e| e.at(i, child))?; + } + + // Closing tag + if self.kind != ElementKind::Void { + write!(w, "", self.name)?; + } + + Ok(()) + } +} + +fn render_text(w: &mut W, text: &str) -> Result<()> { + // As far as I can tell, it should be sufficient to escape `&` and `<`. + // `>` is escaped too for symmetry, not for any real reason. + // + // Reasoning: Whenever we're inside tags, we're in one of these states, + // https://html.spec.whatwg.org/multipage/parsing.html#data-state + // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state + // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + + for c in text.chars() { + match c { + '&' => write!(w, "&")?, + '<' => write!(w, "<")?, + '>' => write!(w, ">")?, + c => write!(w, "{c}")?, + } + } + + Ok(()) +} + +fn render_comment(w: &mut W, text: &str) -> Result<()> { + // A comment... + // - must not start with the string ">" + // - must not start with the string "->" + // - must not contain the strings "", or "--!>" + // - must not end with the string "", "==>") + .replace("--!>", "==!>"); + + if text.starts_with(">") || text.starts_with("->") { + write!(w, " ")?; + } + + write!(w, "{text}")?; + + if text.ends_with("(w: &mut W, text: &str) -> Result<()> { + // Quoted attribute values are escaped like text, but the set of characters + // to escape is different. + // + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + + write!(w, "\"")?; + + for c in text.chars() { + match c { + '"' => write!(w, """)?, + c => write!(w, "{c}")?, + } + } + + write!(w, "\"")?; + + Ok(()) +}