| /*
|
| * parser.rs - An IRCv3 parser.
|
| *
|
| * Written on Wednesday, 4 September 2024 by oldfashionedcow
|
| *
|
| * Copyright 2024 Cow
|
| *
|
| * This file is part of Beryllium.
|
| *
|
| * Beryllium is free software: you can redistribute it and/or modify
|
| * it under the terms of the GNU Affero General Public License as published by
|
| * the Free Software Foundation, either version 3 of the License, or
|
| * (at your option) any later version.
|
| *
|
| * Beryllium is distributed in the hope that it will be useful,
|
| * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| * GNU Affero General Public License for more details.
|
| *
|
| * You should have received a copy of the GNU Affero General Public License
|
| * along with Beryllium. If not, see <http://www.gnu.org/licenses/>.
|
| *
|
| * SPDX-License-Identifier: AGPL-3.0-or-later
|
| */
|
|
|
| use smallvec::SmallVec;
|
| use std::fmt;
|
| use thiserror::Error;
|
|
|
/// Splits `s` around the first occurrence of `delim`.
///
/// Returns the bytes that precede the delimiter and, when the delimiter is
/// present, the bytes that follow it; the delimiter byte itself belongs to
/// neither half. When `delim` does not occur in `s`, the whole slice is
/// handed back together with `None`.
#[must_use]
pub(crate) fn bytesep(s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>) {
    let split_at = s.iter().position(|&byte| byte == delim);

    match split_at {
        Some(at) => {
            let (head, tail) = s.split_at_mut(at);
            // `tail` starts with the delimiter; skip it.
            (head, Some(&mut tail[1..]))
        }
        None => (s, None),
    }
}
|
|
|
| #[derive (Debug, Error)]
|
| pub enum ParseError
|
| {
|
| #[error ("Cannot parse command-less line!")]
|
| EmptyCommand,
|
| #[error ("Cannot split line without separator!")]
|
| NoSeparator,
|
| #[error ("Cannot parse tags with missing key!")]
|
| MissingKey,
|
| #[error ("Cannot parse line with missing parameter!")]
|
| MissingParameter,
|
| #[error ("Cannot parse tags with trailing semicolon!")]
|
| TrailingSemicolon,
|
| #[error ("Cannot parse line with missing tags!")]
|
| MissingTags,
|
| #[error ("Non last params cannot have spaces!")]
|
| NonLastParamSpaces,
|
| #[error ("Non last params cannot start with colon!")]
|
| NonLastParamColon,
|
| #[error ("Cannot parse line with missing delimiter!")]
|
| MissingDelim,
|
| }
|
|
|
/// Number of tag keys the parser keeps; also the length of [`IRC_TAG_MAP`].
pub(crate) const IRC_TAG_MAX: usize = 6;

/// Tag keys recognised by the parser.
///
/// The position of a key in this table is the slot in which the matching
/// tag value is stored on a parsed message.
pub(crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [
    "batch",
    "msgid",
    "+draft/react",
    "+draft/reply",
    "time",
    "+typing",
];
|
|
|
/// Unescapes an IRCv3 tag value in place and returns the unescaped text.
///
/// Recognised escape sequences are `\:` (semicolon), `\s` (space), `\r`
/// (carriage return), `\n` (line feed) and `\\` (backslash). A backslash in
/// front of any other character is dropped and the character is kept. A lone
/// backslash at the very end of the value produces no output.
///
/// The buffer behind `input` is rewritten: the unescaped text occupies its
/// start and every byte after it is overwritten with NUL, so `input` remains
/// valid UTF-8 once the returned borrow ends.
///
/// # Errors
///
/// Returns a [`std::str::Utf8Error`] if the compacted bytes are not valid
/// UTF-8. This is not expected to happen, because unescaping only removes or
/// replaces single ASCII bytes of an already valid string.
pub(crate) fn unescape_tag(input: &mut str) -> Result<&str, std::str::Utf8Error> {
    // SAFETY: `as_bytes_mut` requires the buffer to hold valid UTF-8 again when
    // the borrow ends. The loop below only copies bytes towards the front and
    // drops or replaces ASCII bytes, so the compacted prefix stays valid UTF-8,
    // and the unused tail is filled with ASCII NUL bytes before returning.
    let bytes = unsafe { input.as_bytes_mut() };
    let mut source_pos = 0;
    let mut dest_pos = 0;

    while source_pos < bytes.len() {
        let current = bytes[source_pos];
        source_pos += 1;

        let output = if current == b'\\' {
            match bytes.get(source_pos).copied() {
                Some(escaped) => {
                    source_pos += 1;
                    match escaped {
                        b':' => b';',
                        b's' => b' ',
                        b'r' => b'\r',
                        b'n' => b'\n',
                        // `\\` and invalid escapes: keep the character itself.
                        other => other,
                    }
                }
                // A lone trailing backslash yields no output character.
                None => break,
            }
        } else {
            current
        };

        bytes[dest_pos] = output;
        dest_pos += 1;
    }

    // Leftover bytes could start in the middle of a multi-byte character;
    // overwrite them so the whole buffer is valid UTF-8.
    bytes[dest_pos..].fill(b'\0');

    std::str::from_utf8(&bytes[..dest_pos])
}
|
|
|
| #[derive(Debug)]
|
| pub struct Message<'a>
|
| {
|
| tags: [Option<&'a str>; IRC_TAG_MAX],
|
| line: String,
|
| source_index: Option<(usize, usize)>,
|
| command_index: (usize, usize),
|
| param_indexes: SmallVec<[(usize, usize); 15]>,
|
| }
|
|
|
| impl<'a> fmt::Display for Message<'a>
|
| {
|
| fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
|
| {
|
| let mut tags_iter = self.tags.iter ().flatten ();
|
|
|
| if let Some (first_tag) = tags_iter.next () {
|
| write! (f, "@{}", first_tag)?;
|
|
|
| for tag in tags_iter {
|
| write! (f, ";{}", tag)?;
|
| }
|
|
|
| write! (f, " ")?;
|
| }
|
|
|
| if let Some ((start, end)) = self.source_index {
|
| write! (f, ":{} ", &self.line[start..end])?;
|
| }
|
|
|
| write! (f, "{}", self.command ())?;
|
|
|
| for param in self.params () {
|
| write! (f, " {}", param)?;
|
| }
|
|
|
| Ok (())
|
| }
|
| }
|
|
|
| impl<'a> Message<'a> {
|
| pub fn from (line: &'a mut [u8]) -> Result<Self, ParseError>
|
| {
|
| let mut message = Message {
|
| tags: [None; IRC_TAG_MAX],
|
| line: String::with_capacity (line.len ()),
|
| source_index: None,
|
| command_index: (0, 0),
|
| param_indexes: SmallVec::new (),
|
| };
|
|
|
| if let Some (first) = line.get_mut (0) {
|
| if *first == b'@' {
|
| if let (tags, Some (line)) = bytesep (line, b' ') {
|
| message.line = String::from_utf8_lossy (line).to_string ();
|
| message.parse_tags (&mut tags[1..])?;
|
| // While this comes at the cost of a realloc, it is worth it
|
| // as we will be storing the entire line in the struct.
|
| message.line.shrink_to_fit ();
|
| } else {
|
| // As there is no space delimiter in the byte slice,
|
| // we can be sure that there are either missing tags
|
| // or a missing command, and hence return early.
|
| return Err (ParseError::MissingDelim);
|
| }
|
| } else {
|
| // Earlier, line was created with the size of the length of
|
| // the byte array, so there is no need to shrink to fit.
|
| message.line = String::from_utf8_lossy (line).to_string ();
|
| }
|
| } else {
|
| // We know at this point that the message is completly empty
|
| // and hence can return early.
|
| return Err (ParseError::EmptyCommand);
|
| }
|
|
|
| let mut source_offset: usize = 0;
|
|
|
| if message.line.starts_with (':') {
|
| // Parse prefix
|
| source_offset += 1;
|
| let space_index = match message.line.find (' ') {
|
| Some (space_index) => space_index,
|
| None => return Err (ParseError::EmptyCommand), // If there is no space delimiter,
|
| // the message doesn't contain a
|
| // command, so we can return early.
|
| };
|
|
|
| message.source_index = Some ((1, space_index));
|
| }
|
|
|
| message.command_index.0 = match message.line.as_str ()
|
| [(source_offset + message.source_index.unwrap_or_default ().1)..]
|
| .find (|arg0: char| char::is_ascii_alphanumeric (&arg0))
|
| {
|
| Some (command_index) => {
|
| source_offset + message.source_index.unwrap_or_default ().1 + command_index
|
| }
|
| None => return Err (ParseError::EmptyCommand), // If there is no alphanumeric char
|
| // after the source, the message
|
| // doesn't contain a valid command,
|
| // so we can return early.
|
| };
|
|
|
| match message.line.as_str ()[message.command_index.0..].find (' ') {
|
| Some (command_index) => {
|
| message.command_index.1 = message.command_index.0 + command_index
|
| }
|
| None => {
|
| message.command_index.1 = message.line.len (); // If there is no space after the command, the rest of the
|
| // entire message is the command, and we can return early.
|
| return Ok (message);
|
| }
|
| }
|
|
|
| let mut command_offset = match message.line.as_str ()[message.command_index.1..]
|
| .find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ())
|
| {
|
| Some (command_index) => message.command_index.1 + command_index,
|
| None => {
|
| // If there are no more non-whitespace ascii characters after
|
| // the command, we've reached the end of the message.
|
| message.command_index.1 = message.line.len ();
|
| return Ok (message);
|
| }
|
| };
|
|
|
| loop {
|
| if message.line.as_str ()[command_offset..].starts_with (':') {
|
| message
|
| .param_indexes
|
| .push ((command_offset + 1, message.line.len ()));
|
| break;
|
| }
|
|
|
| match message.line.as_str ()[command_offset..]
|
| .find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ())
|
| {
|
| Some (index) => match message.line.as_str ()[(command_offset + index)..].find (' ') {
|
| Some (param_index) => {
|
| message
|
| .param_indexes
|
| .push ((command_offset + index, command_offset + index + param_index));
|
| command_offset = command_offset + index + param_index + 1;
|
| command_offset += message.line[command_offset..]
|
| .chars ()
|
| .take_while (|&ch| ch == ' ')
|
| .count ();
|
| }
|
| None => {
|
| message
|
| .param_indexes
|
| .push ((command_offset, message.line.len ()));
|
| // There are no more valid characters making up a parameter,
|
| // so we can break as there are no more parameters left to store.
|
| break;
|
| }
|
| },
|
|
|
| None => {
|
| // There are no more valid characters making up a parameter,
|
| // so we can break as there are no more parameters left to store.
|
| break;
|
| }
|
| }
|
| }
|
|
|
| Ok (message)
|
| }
|
|
|
| fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError>
|
| {
|
| match bytesep (tags, b';') {
|
| (head, Some (tail)) => {
|
| if head.is_empty () {
|
| // The head component being empty indicates
|
| // an empty split between tags, i.e. ";;",
|
| // meaning that at least one tag is missing.
|
| return Err (ParseError::MissingTags);
|
| }
|
|
|
| // The specification states that IRCv3 tags
|
| // should be UTF8, and tags that are invalid
|
| // UTF8 may be dropped by clients.
|
| if let Ok (tag) = std::str::from_utf8_mut (head) {
|
| let (key, value) = if let Some (index) = tag.find ('=') {
|
| let (key, value) = tag.split_at_mut (index);
|
| if key.is_empty () {
|
| // The key component being empty indicates
|
| // an empty key, i.e. "=value", which is invalid.
|
| return Err (ParseError::MissingKey);
|
| }
|
| let key: &str = key; // Make key immutable
|
| (key, unescape_tag (&mut value[1..]).unwrap_or_default())
|
| } else {
|
| let tag: &str = tag; // Make tag immutable
|
| (tag, "")
|
| };
|
|
|
| for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) {
|
| if tag != &key {
|
| continue;
|
| }
|
| self.tags[i] = Some (value);
|
| }
|
| }
|
|
|
| self.parse_tags(tail)
|
| }
|
| (head, None) => {
|
| if head.is_empty () {
|
| // The tail component being empty indicates a
|
| // trailing semicolon, i.e. "foo;", meaning
|
| // that at least the last tag is missing.
|
| return Err (ParseError::MissingTags);
|
| }
|
| match std::str::from_utf8_mut (head) {
|
| Ok (tag) => {
|
| let (key, value) = if let Some (index) = tag.find ('=') {
|
| let (key, value) = tag.split_at_mut (index);
|
| if key.is_empty () {
|
| // The key component being empty indicates
|
| // an empty key, i.e. "=value", which is invalid.
|
| return Err (ParseError::MissingKey);
|
| }
|
| let key: &str = key; // Make key immutable
|
| (key, unescape_tag (&mut value[1..]).unwrap_or_default ())
|
| } else {
|
| let tag: &str = tag; // Make tag immutable
|
| (tag, "")
|
| };
|
|
|
| for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) {
|
| if tag != &key {
|
| continue;
|
| }
|
| self.tags[i] = Some (value);
|
| }
|
| Ok (())
|
| }
|
| // The specification states that IRCv3 tags
|
| // should be UTF8, and tags that are invalid
|
| // UTF8 may be dropped by clients.
|
| Err (_) => Ok (()),
|
| }
|
| }
|
| }
|
| }
|
|
|
| #[must_use]
|
| pub fn command (&self) -> &str
|
| {
|
| &self.line[self.command_index.0..self.command_index.1]
|
| }
|
|
|
| #[must_use]
|
| pub fn param_len (&self) -> usize
|
| {
|
| self.param_indexes.len ()
|
| }
|
|
|
| #[must_use]
|
| pub fn param_at (&self, index: usize) -> Option<&str>
|
| {
|
| self.param_indexes
|
| .get (index)
|
| .map (|(start, end)| &self.line[*start..*end])
|
| }
|
|
|
| pub fn params (&self) -> impl Iterator<Item = &str> + '_
|
| {
|
| self.param_indexes
|
| .iter ()
|
| .map (|(start, end)| &self.line[*start..*end])
|
| }
|
| }
|