/* * parser.rs - An IRCv3 parser. * * Written on Wednesday, 4 September 2024 by oldfashionedcow * * Copyright 2024 Cow * * This file is part of Beryllium. * * Beryllium is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Beryllium is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with Beryllium. If not, see . * * SPDX-License-Identifier: AGPL-3.0-or-later */ use smallvec::SmallVec; use std::fmt; use thiserror::Error; #[must_use] pub (crate) fn bytesep (s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>) { let mut index = 0; let len = s.len (); while index < len { if s[index] == delim { let (first, second) = s.split_at_mut (index); return (first, Some (&mut second[1..])); } index += 1; } (s, None) } #[derive (Debug, Error)] pub enum ParseError { #[error ("Cannot parse command-less line!")] EmptyCommand, #[error ("Cannot split line without separator!")] NoSeparator, #[error ("Cannot parse tags with missing key!")] MissingKey, #[error ("Cannot parse line with missing parameter!")] MissingParameter, #[error ("Cannot parse tags with trailing semicolon!")] TrailingSemicolon, #[error ("Cannot parse line with missing tags!")] MissingTags, #[error ("Non last params cannot have spaces!")] NonLastParamSpaces, #[error ("Non last params cannot start with colon!")] NonLastParamColon, #[error ("Cannot parse line with missing delimiter!")] MissingDelim, } pub (crate) const IRC_TAG_MAX: usize = 6; pub (crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [ "batch", "msgid", "+draft/react", "+draft/reply", "time", "+typing", ]; pub (crate) fn unescape_tag (input: &mut str) -> Result<&str, std::str::Utf8Error> { let bytes = unsafe { input.as_bytes_mut () }; let mut source_pos = 0; let mut dest_pos = 0; while source_pos < bytes.len () { if bytes[source_pos] == b'\\' && source_pos + 1 < bytes.len () { source_pos += 1; let unescaped_char = match bytes[source_pos] { b':' => b';', b's' => b' ', b'r' => b'\r', b'n' => b'\n', _ => bytes[source_pos], }; bytes[dest_pos] = unescaped_char; } else { bytes[dest_pos] = bytes[source_pos]; } source_pos += 1; dest_pos += 1; } std::str::from_utf8 (&bytes[..dest_pos]) } #[derive(Debug)] pub struct Message<'a> { tags: [Option<&'a str>; IRC_TAG_MAX], line: String, source_index: Option<(usize, usize)>, command_index: (usize, usize), param_indexes: SmallVec<[(usize, usize); 15]>, } impl<'a> fmt::Display for Message<'a> { fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut tags_iter = self.tags.iter ().flatten (); if let Some (first_tag) = tags_iter.next () { write! (f, "@{}", first_tag)?; for tag in tags_iter { write! (f, ";{}", tag)?; } write! (f, " ")?; } if let Some ((start, end)) = self.source_index { write! (f, ":{} ", &self.line[start..end])?; } write! (f, "{}", self.command ())?; for param in self.params () { write! (f, " {}", param)?; } Ok (()) } } impl<'a> Message<'a> { pub fn from (line: &'a mut [u8]) -> Result { let mut message = Message { tags: [None; IRC_TAG_MAX], line: String::with_capacity (line.len ()), source_index: None, command_index: (0, 0), param_indexes: SmallVec::new (), }; if let Some (first) = line.get_mut (0) { if *first == b'@' { if let (tags, Some (line)) = bytesep (line, b' ') { message.line = String::from_utf8_lossy (line).to_string (); message.parse_tags (&mut tags[1..])?; // While this comes at the cost of a realloc, it is worth it // as we will be storing the entire line in the struct. message.line.shrink_to_fit (); } else { // As there is no space delimiter in the byte slice, // we can be sure that there are either missing tags // or a missing command, and hence return early. return Err (ParseError::MissingDelim); } } else { // Earlier, line was created with the size of the length of // the byte array, so there is no need to shrink to fit. message.line = String::from_utf8_lossy (line).to_string (); } } else { // We know at this point that the message is completly empty // and hence can return early. return Err (ParseError::EmptyCommand); } let mut source_offset: usize = 0; if message.line.starts_with (':') { // Parse prefix source_offset += 1; let space_index = match message.line.find (' ') { Some (space_index) => space_index, None => return Err (ParseError::EmptyCommand), // If there is no space delimiter, // the message doesn't contain a // command, so we can return early. }; message.source_index = Some ((1, space_index)); } message.command_index.0 = match message.line.as_str () [(source_offset + message.source_index.unwrap_or_default ().1)..] .find (|arg0: char| char::is_ascii_alphanumeric (&arg0)) { Some (command_index) => { source_offset + message.source_index.unwrap_or_default ().1 + command_index } None => return Err (ParseError::EmptyCommand), // If there is no alphanumeric char // after the source, the message // doesn't contain a valid command, // so we can return early. }; match message.line.as_str ()[message.command_index.0..].find (' ') { Some (command_index) => { message.command_index.1 = message.command_index.0 + command_index } None => { message.command_index.1 = message.line.len (); // If there is no space after the command, the rest of the // entire message is the command, and we can return early. return Ok (message); } } let mut command_offset = match message.line.as_str ()[message.command_index.1..] .find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ()) { Some (command_index) => message.command_index.1 + command_index, None => { // If there are no more non-whitespace ascii characters after // the command, we've reached the end of the message. message.command_index.1 = message.line.len (); return Ok (message); } }; loop { if message.line.as_str ()[command_offset..].starts_with (':') { message .param_indexes .push ((command_offset + 1, message.line.len ())); break; } match message.line.as_str ()[command_offset..] .find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ()) { Some (index) => match message.line.as_str ()[(command_offset + index)..].find (' ') { Some (param_index) => { message .param_indexes .push ((command_offset + index, command_offset + index + param_index)); command_offset = command_offset + index + param_index + 1; command_offset += message.line[command_offset..] .chars () .take_while (|&ch| ch == ' ') .count (); } None => { message .param_indexes .push ((command_offset, message.line.len ())); // There are no more valid characters making up a parameter, // so we can break as there are no more parameters left to store. break; } }, None => { // There are no more valid characters making up a parameter, // so we can break as there are no more parameters left to store. break; } } } Ok (message) } fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError> { match bytesep (tags, b';') { (head, Some (tail)) => { if head.is_empty () { // The head component being empty indicates // an empty split between tags, i.e. ";;", // meaning that at least one tag is missing. return Err (ParseError::MissingTags); } // The specification states that IRCv3 tags // should be UTF8, and tags that are invalid // UTF8 may be dropped by clients. if let Ok (tag) = std::str::from_utf8_mut (head) { let (key, value) = if let Some (index) = tag.find ('=') { let (key, value) = tag.split_at_mut (index); if key.is_empty () { // The key component being empty indicates // an empty key, i.e. "=value", which is invalid. return Err (ParseError::MissingKey); } let key: &str = key; // Make key immutable (key, unescape_tag (&mut value[1..]).unwrap_or_default()) } else { let tag: &str = tag; // Make tag immutable (tag, "") }; for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) { if tag != &key { continue; } self.tags[i] = Some (value); } } self.parse_tags(tail) } (head, None) => { if head.is_empty () { // The tail component being empty indicates a // trailing semicolon, i.e. "foo;", meaning // that at least the last tag is missing. return Err (ParseError::MissingTags); } match std::str::from_utf8_mut (head) { Ok (tag) => { let (key, value) = if let Some (index) = tag.find ('=') { let (key, value) = tag.split_at_mut (index); if key.is_empty () { // The key component being empty indicates // an empty key, i.e. "=value", which is invalid. return Err (ParseError::MissingKey); } let key: &str = key; // Make key immutable (key, unescape_tag (&mut value[1..]).unwrap_or_default ()) } else { let tag: &str = tag; // Make tag immutable (tag, "") }; for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) { if tag != &key { continue; } self.tags[i] = Some (value); } Ok (()) } // The specification states that IRCv3 tags // should be UTF8, and tags that are invalid // UTF8 may be dropped by clients. Err (_) => Ok (()), } } } } #[must_use] pub fn command (&self) -> &str { &self.line[self.command_index.0..self.command_index.1] } #[must_use] pub fn param_len (&self) -> usize { self.param_indexes.len () } #[must_use] pub fn param_at (&self, index: usize) -> Option<&str> { self.param_indexes .get (index) .map (|(start, end)| &self.line[*start..*end]) } pub fn params (&self) -> impl Iterator + '_ { self.param_indexes .iter () .map (|(start, end)| &self.line[*start..*end]) } }