Model and render elements

This commit is contained in:
Joscha 2024-11-26 19:38:55 +01:00
parent c8d9cf16f5
commit cc3f85e6e1
5 changed files with 694 additions and 0 deletions

78
src/check.rs Normal file
View file

@ -0,0 +1,78 @@
/// Returns `true` if `c` is an ASCII letter (`A`-`Z` or `a`-`z`).
///
/// <https://infra.spec.whatwg.org/#ascii-alpha>
pub fn is_ascii_alpha(c: char) -> bool {
    matches!(c, 'A'..='Z' | 'a'..='z')
}
/// Returns `true` if `c` is an ASCII digit or an ASCII letter.
///
/// <https://infra.spec.whatwg.org/#ascii-alphanumeric>
pub fn is_ascii_alphanumeric(c: char) -> bool {
    c.is_ascii_digit() || c.is_ascii_alphabetic()
}
/// <https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name>
///
/// The rules around what is a valid tag name are complicated. The standard
/// doesn't give an easy answer. Because of this, we're conservative in what we
/// allow. This way, the output we produce should parse correctly in a wide
/// range of circumstances while following the standard.
pub fn is_valid_tag_name(name: &str) -> bool {
!name.is_empty()
&& name.chars().take(1).all(is_ascii_alpha)
&& name.chars().all(is_ascii_alphanumeric)
}
/// <https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name>
///
/// The rules around what is a valid attribute name are complicated. The
/// standard doesn't give an easy answer. Because of this, we're conservative in
/// what we allow. This way, the output we produce should parse correctly in a
/// wide range of circumstances while following the standard.
pub fn is_valid_attribute_name(name: &str) -> bool {
!name.is_empty()
&& name.chars().take(1).all(is_ascii_alpha)
&& name
.chars()
.all(|c| is_ascii_alphanumeric(c) || c == '-' || c == '_')
}
/// Returns `true` if `text` may appear verbatim inside a raw text (or
/// escapable raw text) element named `tag_name`.
///
/// The text is rejected if it contains `</`, followed by characters that
/// case-insensitively match `tag_name`, followed by one of tab, LF, FF, CR,
/// space, `>` or `/`.
///
/// <https://html.spec.whatwg.org/multipage/syntax.html#cdata-rcdata-restrictions>
///
/// # Panics
///
/// Panics if `tag_name` is not ASCII-only.
pub fn is_valid_raw_text(tag_name: &str, text: &str) -> bool {
    // In case we ever decide to relax tag name ascii requirements.
    assert!(tag_name.is_ascii());
    let name = tag_name.as_bytes();

    // Decides whether whatever follows a "</" forms a forbidden end tag.
    //
    // The tag name is ASCII, so any bytes matching it case-insensitively are
    // ASCII as well, and every terminator of interest is a single ASCII byte.
    // Comparing bytes is therefore equivalent to comparing characters.
    let closes_element = |rest: &str| {
        let bytes = rest.as_bytes();
        let Some(candidate) = bytes.get(..name.len()) else {
            // Fewer bytes left than the tag name has: cannot match.
            return false;
        };
        candidate.eq_ignore_ascii_case(name)
            && matches!(
                bytes.get(name.len()),
                Some(b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'>' | b'/')
            )
    };

    !text
        .match_indices("</")
        .any(|(at, opener)| closes_element(&text[at + opener.len()..]))
}

92
src/element.rs Normal file
View file

@ -0,0 +1,92 @@
use std::collections::BTreeMap;
/// The syntactic kind of an element, which decides how it is rendered and
/// which children it may have.
///
/// <https://html.spec.whatwg.org/multipage/syntax.html#elements-2>
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementKind {
    /// Rendered as a start tag only; any child is a render error.
    Void,
    /// The `template` element. The renderer currently treats it like
    /// [`ElementKind::Normal`].
    Template,
    /// Text children are written unescaped after being checked against the
    /// raw text restrictions; only raw and text children are accepted.
    RawText,
    /// Only raw and text children are accepted; text is escaped as usual.
    EscapableRawText,
    /// Element from a foreign namespace (SVG, MathML). Rendered self-closing
    /// (` />`) when it has no children.
    Foreign,
    /// Any other element; children are rendered without restrictions.
    Normal,
}
/// A single node that can be rendered, either on its own or as the child of
/// an [`Element`].
pub enum Content {
/// Text that is written to the output verbatim, without any escaping.
Raw(String),
/// Plain text. Escaped when rendered, except inside raw text elements where
/// it is validated and written unescaped.
Text(String),
/// The text of an HTML comment.
Comment(String),
/// A nested element.
Element(Element),
}
impl Content {
pub fn raw(str: impl ToString) -> Self {
Self::Raw(str.to_string())
}
pub fn text(str: impl ToString) -> Self {
Self::Text(str.to_string())
}
pub fn comment(str: impl ToString) -> Self {
Self::Comment(str.to_string())
}
pub fn doctype() -> Self {
Self::raw("<!DOCTYPE html>")
}
}
impl From<String> for Content {
fn from(value: String) -> Self {
Self::text(value)
}
}
impl From<&str> for Content {
fn from(value: &str) -> Self {
Self::text(value)
}
}
impl From<Element> for Content {
fn from(value: Element) -> Self {
Self::Element(value)
}
}
/// An HTML element: a tag name, its kind, its attributes and its children.
pub struct Element {
/// Tag name. [`Element::new`] stores it in ASCII lowercase.
pub name: String,
/// Syntactic kind of the element.
pub kind: ElementKind,
/// Attribute values keyed by attribute name. [`Element::attr`] stores names
/// in ASCII lowercase.
pub attributes: BTreeMap<String, String>,
/// Child nodes, in the order they were added.
pub children: Vec<Content>,
}
impl Element {
    /// Creates an element of the given kind without attributes or children.
    ///
    /// The name is converted to ASCII lowercase.
    pub fn new(name: impl ToString, kind: ElementKind) -> Self {
        let mut name = name.to_string();
        name.make_ascii_lowercase();
        Self {
            name,
            kind,
            attributes: BTreeMap::new(),
            children: Vec::new(),
        }
    }

    /// Creates an [`ElementKind::Normal`] element.
    pub fn normal(name: impl ToString) -> Self {
        Self::new(name, ElementKind::Normal)
    }

    /// Sets an attribute, replacing any previous value stored under the same
    /// name.
    ///
    /// The attribute name is converted to ASCII lowercase.
    pub fn attr(mut self, name: impl ToString, value: impl ToString) -> Self {
        let mut key = name.to_string();
        key.make_ascii_lowercase();
        self.attributes.insert(key, value.to_string());
        self
    }

    /// Sets an attribute with an empty value.
    pub fn attr_true(self, name: impl ToString) -> Self {
        self.attr(name, "")
    }

    /// Appends a child.
    pub fn child(mut self, child: impl Into<Content>) -> Self {
        let content = child.into();
        self.children.push(content);
        self
    }
}

166
src/elements.rs Normal file
View file

@ -0,0 +1,166 @@
//! Definitions for all non-deprecated HTML elements.
//!
//! <https://developer.mozilla.org/en-US/docs/Web/HTML/Element>
use crate::{Element, ElementKind};
/// Defines a public constructor function named after an HTML element.
///
/// `element!(name)` defines a constructor for an [`ElementKind::Normal`]
/// element; `element!(name, kind)` uses the given kind instead. The tag name
/// is the function name.
macro_rules! element {
    ( $name:ident ) => {
        element!($name, ElementKind::Normal);
    };
    ( $name:ident, $kind:expr ) => {
        // Give every generated constructor its own rustdoc entry.
        #[doc = concat!("Create a `<", stringify!($name), ">` element without attributes or children.")]
        pub fn $name() -> Element {
            Element::new(stringify!($name), $kind)
        }
    };
}
// One constructor per element, grouped like the MDN element reference linked
// in the module docs. Elements declared without an explicit kind are
// `ElementKind::Normal`.

// Main root
element!(html);
// Document metadata
element!(base, ElementKind::Void);
element!(head);
element!(link, ElementKind::Void);
element!(meta, ElementKind::Void);
// `style` content is written unescaped; `title` content is escaped text only.
element!(style, ElementKind::RawText);
element!(title, ElementKind::EscapableRawText);
// Sectioning root
element!(body);
// Content sectioning
element!(address);
element!(article);
element!(aside);
element!(footer);
element!(header);
element!(h1);
element!(h2);
element!(h3);
element!(h4);
element!(h5);
element!(h6);
element!(hgroup);
element!(main);
element!(nav);
element!(section);
element!(search);
// Text content
element!(blockquote);
element!(dd);
element!(div);
element!(dl);
element!(dt);
element!(figcaption);
element!(figure);
element!(hr, ElementKind::Void);
element!(li);
element!(menu);
element!(ol);
element!(p);
element!(pre);
element!(ul);
// Inline text semantics
element!(a);
element!(abbr);
element!(b);
element!(bdi);
element!(bdo);
element!(br, ElementKind::Void);
element!(cite);
element!(code);
element!(data);
element!(dfn);
element!(em);
element!(i);
element!(kbd);
element!(mark);
element!(q);
element!(rp);
element!(rt);
element!(ruby);
element!(s);
element!(samp);
element!(small);
element!(span);
element!(strong);
element!(sub);
element!(sup);
element!(time);
element!(u);
element!(var);
element!(wbr, ElementKind::Void);
// Image and multimedia
element!(area, ElementKind::Void);
element!(audio);
element!(img, ElementKind::Void);
element!(map);
element!(track, ElementKind::Void);
element!(video);
// Embedded content
element!(embed, ElementKind::Void);
element!(fencedframe);
element!(iframe);
element!(object);
element!(picture);
element!(portal);
element!(source, ElementKind::Void);
// SVG and MathML
// TODO Proper SVG and MathML support
// Only the two root elements are defined; both are foreign elements.
element!(svg, ElementKind::Foreign);
element!(math, ElementKind::Foreign);
// Scripting
element!(canvas);
element!(noscript);
element!(script, ElementKind::RawText);
// Demarcating edits
element!(del);
element!(ins);
// Table content
element!(caption);
element!(col, ElementKind::Void);
element!(colgroup);
element!(table);
element!(tbody);
element!(td);
element!(tfoot);
element!(th);
element!(thead);
element!(tr);
// Forms
element!(button);
element!(datalist);
element!(fieldset);
element!(form);
element!(input, ElementKind::Void);
element!(label);
element!(legend);
element!(meter);
element!(optgroup);
element!(option);
element!(output);
element!(progress);
element!(select);
element!(textarea, ElementKind::EscapableRawText);
// Interactive elements
element!(details);
element!(dialog);
element!(summary);
// Web Components
element!(slot);
element!(template, ElementKind::Template);
// Obsolete and deprecated elements
// Intentionally excluded!

View file

@ -1 +1,124 @@
//! Create HTML by manipulating elements as structured data. Inspired by the
//! Clojure library [hiccup][hiccup].
//!
//! [hiccup]: https://github.com/weavejester/hiccup
mod check;
mod element;
pub mod elements;
mod render;
pub use self::{element::*, elements::*, render::*};
/// Rendering tests covering whole documents, the element kinds with special
/// child rules, attribute output and name normalisation.
#[cfg(test)]
mod tests {
    use crate::{elements::*, render::Render, Content, Element};

    /// A small but complete document renders to the expected markup.
    #[test]
    fn simple_website() {
        let els = [
            Content::doctype(),
            html()
                .child(head().child(title().child("Hello")))
                .child(
                    body()
                        .child(h1().child("Hello"))
                        .child(p().child("Hello ").child(em().child("world")).child("!")),
                )
                .into(),
        ];
        assert_eq!(
            els.render_to_string().unwrap(),
            concat!(
                "<!DOCTYPE html><html>",
                "<head><title>Hello</title></head>",
                "<body><h1>Hello</h1><p>Hello <em>world</em>!</p></body>",
                "</html>",
            ),
        );
    }

    #[test]
    fn void_elements() {
        // Difference between void and non-void
        assert_eq!(head().render_to_string().unwrap(), "<head></head>");
        assert_eq!(input().render_to_string().unwrap(), "<input>");
        // Void elements must not contain any children
        assert!(input().child(p()).render_to_string().is_err());
    }

    #[test]
    fn raw_text_elements() {
        // Text inside raw text elements is written unescaped
        assert_eq!(
            script()
                .child("foo <script> & </style> bar")
                .render_to_string()
                .unwrap(),
            "<script>foo <script> & </style> bar</script>",
        );
        // Text that would close the element early is rejected,
        // case-insensitively
        assert!(script()
            .child("hello </script> world")
            .render_to_string()
            .is_err());
        assert!(script()
            .child("hello </ScRiPt ... world")
            .render_to_string()
            .is_err());
    }

    #[test]
    fn escaped_text_elements() {
        assert_eq!(
            textarea()
                .child("foo <p> & bar")
                .render_to_string()
                .unwrap(),
            "<textarea>foo &lt;p&gt; &amp; bar</textarea>",
        );
        // Escapable raw text elements must not contain elements
        assert!(textarea().child(p()).render_to_string().is_err());
    }

    #[test]
    fn attributes() {
        // Attributes are rendered sorted by name
        assert_eq!(
            input()
                .attr("name", "tentacles")
                .attr("type", "number")
                .attr("min", 10)
                .attr("max", 100)
                .render_to_string()
                .unwrap(),
            r#"<input max="100" min="10" name="tentacles" type="number">"#,
        );
        // Attributes with an empty value are rendered as the bare name
        assert_eq!(
            input()
                .attr("name", "horns")
                .attr_true("checked")
                .render_to_string()
                .unwrap(),
            r#"<input checked name="horns">"#,
        );
    }

    #[test]
    fn always_lowercase() {
        // Tag and attribute names are lowercased, attribute values are not
        assert_eq!(
            Element::normal("HTML")
                .attr("LANG", "EN")
                .render_to_string()
                .unwrap(),
            r#"<html lang="EN"></html>"#,
        );
    }
}

235
src/render.rs Normal file
View file

@ -0,0 +1,235 @@
use std::{error, fmt};
use crate::{
check,
element::{Content, Element, ElementKind},
};
/// The reason a render failed.
#[derive(Debug)]
pub enum ErrorCause {
/// Writing to the underlying writer failed.
Format(fmt::Error),
/// An element's tag name was rejected; carries the offending name.
InvalidTagName(String),
/// An attribute name was rejected; carries the offending name.
InvalidAttrName(String),
/// A child is not allowed inside its parent element's kind.
InvalidChild,
/// Text inside a raw text element failed the raw text check; carries the
/// offending text.
InvalidRawText(String),
}
/// A render error together with the location of the content that caused it.
#[derive(Debug)]
pub struct Error {
// Path segments from the failing content outwards (innermost first); they are
// reversed when the error is displayed.
reverse_path: Vec<String>,
// What went wrong.
cause: ErrorCause,
}
impl Error {
    /// Creates an error with the given cause and an empty path.
    pub fn new(cause: ErrorCause) -> Self {
        Self {
            reverse_path: Vec::new(),
            cause,
        }
    }

    /// Records that the error occurred inside `child`, the `index`-th child
    /// of the element currently being rendered.
    ///
    /// Element children are recorded as `index:name`, all other content as
    /// just the index.
    pub fn at(mut self, index: usize, child: &Content) -> Self {
        let segment = if let Content::Element(el) = child {
            format!("{index}:{}", el.name)
        } else {
            index.to_string()
        };
        self.reverse_path.push(segment);
        self
    }
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let path = self
.reverse_path
.iter()
.rev()
.map(|s| s as &str)
.collect::<Vec<_>>()
.join(".");
write!(f, "Render error at {path}: ")?;
match &self.cause {
ErrorCause::Format(error) => write!(f, "{error}")?,
ErrorCause::InvalidTagName(name) => write!(f, "Invalid tag name {name:?}")?,
ErrorCause::InvalidAttrName(name) => write!(f, "Invalid attribute name {name:?}")?,
ErrorCause::InvalidChild => write!(f, "Invalid child")?,
ErrorCause::InvalidRawText(text) => write!(f, "Invalid raw text {text:?}")?,
}
Ok(())
}
}
// Uses the default methods only; the cause is reported through `Display`.
impl error::Error for Error {}
impl From<fmt::Error> for Error {
fn from(value: fmt::Error) -> Self {
Self::new(ErrorCause::Format(value))
}
}
/// Result type of render operations; the error is always a render [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
/// Something that can be rendered as HTML.
pub trait Render {
    /// Writes the rendered form of `self` to `w`.
    fn render<W: fmt::Write>(&self, w: &mut W) -> Result<()>;

    /// Renders `self` into a newly allocated string.
    fn render_to_string(&self) -> Result<String> {
        let mut out = String::new();
        self.render(&mut out).map(|()| out)
    }
}
/// Renders every item in order, stopping at the first error.
impl Render for [Content] {
    fn render<W: fmt::Write>(&self, w: &mut W) -> Result<()> {
        self.iter()
            .try_for_each(|content| content.render(&mut *w))
    }
}
impl Render for Content {
fn render<W: fmt::Write>(&self, w: &mut W) -> Result<()> {
match self {
Self::Raw(text) => write!(w, "{text}")?,
Self::Text(text) => render_text(w, text)?,
Self::Comment(text) => render_comment(w, text)?,
Self::Element(element) => element.render(w)?,
}
Ok(())
}
}
impl Render for Element {
fn render<W: fmt::Write>(&self, w: &mut W) -> Result<()> {
// Checks
if !check::is_valid_tag_name(&self.name) {
return Err(Error::new(ErrorCause::InvalidTagName(self.name.clone())));
}
for name in self.attributes.keys() {
if !check::is_valid_attribute_name(name) {
return Err(Error::new(ErrorCause::InvalidAttrName(name.clone())));
}
}
// Opening tag
write!(w, "<{}", self.name)?;
for (name, value) in &self.attributes {
write!(w, " {name}")?;
if !value.is_empty() {
write!(w, "=")?;
render_attribute_value(w, value)?;
}
}
if self.children.is_empty() {
// Closing early
match self.kind {
ElementKind::Void => write!(w, ">")?,
ElementKind::Foreign => write!(w, " />")?,
_ => write!(w, "></{}>", self.name)?,
}
return Ok(());
}
write!(w, ">")?;
// Children
for (i, child) in self.children.iter().enumerate() {
match self.kind {
ElementKind::Void => Err(Error::new(ErrorCause::InvalidChild)),
ElementKind::RawText => match child {
c @ Content::Raw(_) => c.render(w),
Content::Text(text) if check::is_valid_raw_text(&self.name, text) => {
write!(w, "{text}").map_err(|e| e.into())
}
Content::Text(text) => {
Err(Error::new(ErrorCause::InvalidRawText(text.clone())))
}
_ => Err(Error::new(ErrorCause::InvalidChild)),
},
ElementKind::EscapableRawText => match child {
c @ (Content::Raw(_) | Content::Text(_)) => c.render(w),
_ => Err(Error::new(ErrorCause::InvalidChild)),
},
_ => child.render(w),
}
.map_err(|e| e.at(i, child))?;
}
// Closing tag
if self.kind != ElementKind::Void {
write!(w, "</{}>", self.name)?;
}
Ok(())
}
}
/// Writes `text` to `w`, replacing `&`, `<` and `>` with character
/// references.
fn render_text<W: fmt::Write>(w: &mut W, text: &str) -> Result<()> {
    // As far as I can tell, escaping `&` and `<` should be sufficient; `>` is
    // only escaped for symmetry.
    //
    // Reasoning: Whenever we're inside tags, we're in one of these states,
    // https://html.spec.whatwg.org/multipage/parsing.html#data-state
    // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
    // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    for c in text.chars() {
        let entity = match c {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            _ => {
                w.write_char(c)?;
                continue;
            }
        };
        w.write_str(entity)?;
    }
    Ok(())
}
/// Writes `text` to `w` as an HTML comment (`<!--` ... `-->`).
///
/// Sequences that are not allowed inside a comment are rewritten or padded
/// with a space, so the comment text may differ slightly from `text`.
fn render_comment<W: fmt::Write>(w: &mut W, text: &str) -> Result<()> {
    // A comment...
    // - must not start with the string ">"
    // - must not start with the string "->"
    // - must not contain the strings "<!--", "-->", or "--!>"
    // - must not end with the string "<!-"
    //
    // https://html.spec.whatwg.org/multipage/syntax.html#comments
    let text = text
        .replace("<!--", "<!==")
        .replace("-->", "==>")
        .replace("--!>", "==!>");

    // Without the delimiters the text would end up in the document as plain,
    // unescaped markup instead of as a comment.
    write!(w, "<!--")?;
    if text.starts_with('>') || text.starts_with("->") {
        write!(w, " ")?;
    }
    write!(w, "{text}")?;
    if text.ends_with("<!-") {
        write!(w, " ")?;
    }
    write!(w, "-->")?;
    Ok(())
}
/// Writes `text` to `w` as a double-quoted attribute value, including the
/// surrounding quotes.
fn render_attribute_value<W: fmt::Write>(w: &mut W, text: &str) -> Result<()> {
    // Quoted attribute values are escaped like text, but the set of characters
    // to escape is different:
    // - `"` would end the value early.
    // - `&` starts a character reference. Left unescaped, a value such as
    //   `&lt;` would be read back as `<`, and an ambiguous ampersand is not
    //   allowed in attribute values at all.
    //
    // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
    write!(w, "\"")?;
    for c in text.chars() {
        match c {
            '&' => write!(w, "&amp;")?,
            '"' => write!(w, "&quot;")?,
            c => write!(w, "{c}")?,
        }
    }
    write!(w, "\"")?;
    Ok(())
}