/*
* parser.rs - An IRCv3 parser.
*
* Written on Wednesday, 4 September 2024 by oldfashionedcow
*
* Copyright 2024 Cow
*
* This file is part of Beryllium.
*
* Beryllium is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Beryllium is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Beryllium. If not, see <https://www.gnu.org/licenses/>.
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
use smallvec::SmallVec;
use std::fmt;
use thiserror::Error;
/// Splits `s` at the first occurrence of `delim`.
///
/// Returns the bytes in front of the delimiter and, when the delimiter is
/// present, `Some` holding the bytes behind it; the delimiter byte itself is
/// part of neither half. When `delim` does not occur in `s`, the whole slice
/// is handed back together with `None`.
#[must_use]
pub (crate) fn bytesep (s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>)
{
    match s.iter ().position (|&byte| byte == delim) {
        Some (at) => {
            let (head, rest) = s.split_at_mut (at);
            // `rest` begins with the delimiter, so skip that single byte.
            (head, Some (&mut rest[1..]))
        }
        None => (s, None),
    }
}
/// Errors that can be reported while parsing a raw IRC line.
///
/// The user-facing text of each variant comes from its `#[error]` attribute.
#[derive (Debug, Error)]
pub enum ParseError
{
/// No command could be located: the input was empty, a source-prefixed
/// line contained no space, or no ASCII alphanumeric character followed
/// the optional source.
#[error ("Cannot parse command-less line!")]
EmptyCommand,
/// NOTE(review): not constructed by the parser code visible in this file;
/// presumably used by other callers -- confirm.
#[error ("Cannot split line without separator!")]
NoSeparator,
/// A tag started with `=`, i.e. it had a value but an empty key.
#[error ("Cannot parse tags with missing key!")]
MissingKey,
/// NOTE(review): not constructed by the parser code visible in this file;
/// presumably used by other callers -- confirm.
#[error ("Cannot parse line with missing parameter!")]
MissingParameter,
/// NOTE(review): not constructed by the parser code visible in this file;
/// the tag parser reports a trailing semicolon as `MissingTags` instead.
#[error ("Cannot parse tags with trailing semicolon!")]
TrailingSemicolon,
/// The tag section contained an empty entry, e.g. `;;`, a trailing `;`,
/// or nothing at all after the leading `@`.
#[error ("Cannot parse line with missing tags!")]
MissingTags,
/// NOTE(review): not constructed by the parser code visible in this file;
/// presumably used when serialising or validating parameters -- confirm.
#[error ("Non last params cannot have spaces!")]
NonLastParamSpaces,
/// NOTE(review): not constructed by the parser code visible in this file;
/// presumably used when serialising or validating parameters -- confirm.
#[error ("Non last params cannot start with colon!")]
NonLastParamColon,
/// A line started with `@` (tags) but contained no space separating the
/// tag section from the rest of the message.
#[error ("Cannot parse line with missing delimiter!")]
MissingDelim,
}
/// Number of tag keys the parser keeps; this is the length of both
/// `IRC_TAG_MAP` and the `tags` array stored in `Message`.
pub (crate) const IRC_TAG_MAX: usize = 6;
/// Tag keys whose values are retained while parsing.
///
/// The position of a key in this table is the slot its value occupies in
/// `Message::tags`; tags whose key is not listed here are not stored.
pub (crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [
"batch",
"msgid",
"+draft/react",
"+draft/reply",
"time",
"+typing",
];
/// Unescapes an IRCv3 tag value in place and returns the unescaped prefix.
///
/// The following escape sequences are translated:
///
/// | escaped | result          |
/// |---------|-----------------|
/// | `\:`    | `;`             |
/// | `\s`    | space           |
/// | `\r`    | carriage return |
/// | `\n`    | line feed       |
/// | `\\`    | `\`             |
///
/// A backslash followed by any other character yields just that character,
/// and a lone backslash at the very end of the value produces no output.
///
/// The buffer behind `input` is rewritten: the result occupies its start and
/// the bytes that are no longer needed behind it are overwritten with NUL.
///
/// # Errors
///
/// Returns the [`std::str::Utf8Error`] from re-validating the compacted
/// bytes. Compaction only removes ASCII backslashes and substitutes ASCII
/// bytes, so this is not expected to happen for a valid `input`.
pub (crate) fn unescape_tag (input: &mut str) -> Result<&str, std::str::Utf8Error>
{
    // SAFETY: bytes are copied front to back in their original order; the only
    // bytes dropped are ASCII backslashes and the only bytes substituted are
    // ASCII bytes replaced by other ASCII bytes, so multi-byte sequences stay
    // intact in the compacted prefix. The leftover tail is overwritten with
    // NUL bytes below, so the whole buffer is valid UTF-8 again before the
    // mutable byte view is released.
    let bytes = unsafe { input.as_bytes_mut () };
    let len = bytes.len ();
    let mut source_pos = 0;
    let mut dest_pos = 0;
    while source_pos < len {
        let mut byte = bytes[source_pos];
        source_pos += 1;
        if byte == b'\\' {
            if source_pos == len {
                // A lone trailing backslash escapes nothing and is dropped.
                break;
            }
            byte = match bytes[source_pos] {
                b':' => b';',
                b's' => b' ',
                b'r' => b'\r',
                b'n' => b'\n',
                // Covers `\\` as well as unknown escapes: keep the character,
                // drop the backslash.
                other => other,
            };
            source_pos += 1;
        }
        // `dest_pos` never overtakes `source_pos`, so unread input is never
        // clobbered.
        bytes[dest_pos] = byte;
        dest_pos += 1;
    }
    // Blank the stale tail so the underlying `str` does not end in the middle
    // of a multi-byte sequence.
    for stale in &mut bytes[dest_pos..] {
        *stale = 0;
    }
    std::str::from_utf8 (&bytes[..dest_pos])
}
/// A parsed IRC message.
///
/// The text of the message (everything except the tag section) is owned by
/// `line`; source, command and parameters are stored as byte ranges into it.
/// Tag values borrow from the caller's input buffer for the lifetime `'a`.
#[derive(Debug)]
pub struct Message<'a>
{
// Values of the recognised tags; slot `i` belongs to the key `IRC_TAG_MAP[i]`
// and is `None` when that tag was not stored.
tags: [Option<&'a str>; IRC_TAG_MAX],
// The message without its tag section, converted lossily to UTF-8.
line: String,
// Byte range `(start, end)` of the source inside `line`, excluding the
// leading ':'; `None` when the line has no source prefix.
source_index: Option<(usize, usize)>,
// Byte range `(start, end)` of the command inside `line`.
command_index: (usize, usize),
// Byte ranges `(start, end)` of the parameters inside `line`, in order of
// appearance; up to 15 ranges are kept inline by the `SmallVec`.
param_indexes: SmallVec<[(usize, usize); 15]>,
}
impl<'a> fmt::Display for Message<'a>
{
    /// Writes the message as `[@tags ][:source ]command[ param]*`.
    ///
    /// Stored tag values are joined with `;` behind a leading `@`, the source
    /// (if any) is prefixed with `:`, and every parameter is preceded by a
    /// single space.
    ///
    /// NOTE(review): only the stored tag *values* are written (no keys), and
    /// the last parameter is written without a leading `:` even when it
    /// contains spaces, so the output is not guaranteed to parse back into
    /// the same message -- confirm whether a wire-format round trip is
    /// expected here.
    fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        // The first stored tag is introduced by '@', every later one by ';'.
        let mut lead = '@';
        for value in self.tags.iter ().flatten () {
            write! (f, "{}{}", lead, value)?;
            lead = ';';
        }
        if lead == ';' {
            // At least one tag was written: separate the section from the rest.
            f.write_str (" ")?;
        }

        if let Some ((from, to)) = self.source_index {
            write! (f, ":{} ", &self.line[from..to])?;
        }

        f.write_str (self.command ())?;
        self.params ()
            .try_for_each (|param| write! (f, " {}", param))
    }
}
impl<'a> Message<'a> {
pub fn from (line: &'a mut [u8]) -> Result
{
let mut message = Message {
tags: [None; IRC_TAG_MAX],
line: String::with_capacity (line.len ()),
source_index: None,
command_index: (0, 0),
param_indexes: SmallVec::new (),
};
if let Some (first) = line.get_mut (0) {
if *first == b'@' {
if let (tags, Some (line)) = bytesep (line, b' ') {
message.line = String::from_utf8_lossy (line).to_string ();
message.parse_tags (&mut tags[1..])?;
// While this comes at the cost of a realloc, it is worth it
// as we will be storing the entire line in the struct.
message.line.shrink_to_fit ();
} else {
// As there is no space delimiter in the byte slice,
// we can be sure that there are either missing tags
// or a missing command, and hence return early.
return Err (ParseError::MissingDelim);
}
} else {
// Earlier, line was created with the size of the length of
// the byte array, so there is no need to shrink to fit.
message.line = String::from_utf8_lossy (line).to_string ();
}
} else {
// We know at this point that the message is completly empty
// and hence can return early.
return Err (ParseError::EmptyCommand);
}
let mut source_offset: usize = 0;
if message.line.starts_with (':') {
// Parse prefix
source_offset += 1;
let space_index = match message.line.find (' ') {
Some (space_index) => space_index,
None => return Err (ParseError::EmptyCommand), // If there is no space delimiter,
// the message doesn't contain a
// command, so we can return early.
};
message.source_index = Some ((1, space_index));
}
message.command_index.0 = match message.line.as_str ()
[(source_offset + message.source_index.unwrap_or_default ().1)..]
.find (|arg0: char| char::is_ascii_alphanumeric (&arg0))
{
Some (command_index) => {
source_offset + message.source_index.unwrap_or_default ().1 + command_index
}
None => return Err (ParseError::EmptyCommand), // If there is no alphanumeric char
// after the source, the message
// doesn't contain a valid command,
// so we can return early.
};
match message.line.as_str ()[message.command_index.0..].find (' ') {
Some (command_index) => {
message.command_index.1 = message.command_index.0 + command_index
}
None => {
message.command_index.1 = message.line.len (); // If there is no space after the command, the rest of the
// entire message is the command, and we can return early.
return Ok (message);
}
}
let mut command_offset = match message.line.as_str ()[message.command_index.1..]
.find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ())
{
Some (command_index) => message.command_index.1 + command_index,
None => {
// If there are no more non-whitespace ascii characters after
// the command, we've reached the end of the message.
message.command_index.1 = message.line.len ();
return Ok (message);
}
};
loop {
if message.line.as_str ()[command_offset..].starts_with (':') {
message
.param_indexes
.push ((command_offset + 1, message.line.len ()));
break;
}
match message.line.as_str ()[command_offset..]
.find (|c: char| c.is_ascii () && !c.is_ascii_whitespace ())
{
Some (index) => match message.line.as_str ()[(command_offset + index)..].find (' ') {
Some (param_index) => {
message
.param_indexes
.push ((command_offset + index, command_offset + index + param_index));
command_offset = command_offset + index + param_index + 1;
command_offset += message.line[command_offset..]
.chars ()
.take_while (|&ch| ch == ' ')
.count ();
}
None => {
message
.param_indexes
.push ((command_offset, message.line.len ()));
// There are no more valid characters making up a parameter,
// so we can break as there are no more parameters left to store.
break;
}
},
None => {
// There are no more valid characters making up a parameter,
// so we can break as there are no more parameters left to store.
break;
}
}
}
Ok (message)
}
/// Parses a `;`-separated tag section (without its leading `@`) and stores
/// the values of the tags listed in `IRC_TAG_MAP`.
///
/// Each tag is either `key` or `key=value`; values are unescaped in place
/// inside `tags`, and a tag without a value is stored as the empty string.
/// Tags with an unknown key are skipped. Tags that are not valid UTF-8 are
/// skipped as well, since the specification allows clients to drop them.
/// If a key occurs more than once, the last occurrence wins.
///
/// # Errors
///
/// * [`ParseError::MissingTags`] if any entry is empty, i.e. the section is
///   empty, contains `;;`, or ends with `;`.
/// * [`ParseError::MissingKey`] if an entry starts with `=`.
fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError>
{
    // Walk the entries iteratively: the input is untrusted, so the number of
    // tags must not translate into call-stack depth.
    let mut remaining = Some (tags);
    while let Some (chunk) = remaining.take () {
        let (head, tail) = bytesep (chunk, b';');
        remaining = tail;

        if head.is_empty () {
            // An empty entry means ";;", a trailing ';' or no tags at all.
            return Err (ParseError::MissingTags);
        }

        let tag = match std::str::from_utf8_mut (head) {
            Ok (tag) => tag,
            // Invalid UTF-8: drop this tag and carry on with the next one.
            Err (_) => continue,
        };

        let (key, value): (&str, &str) = match tag.find ('=') {
            // "=value" has no key, which is invalid.
            Some (0) => return Err (ParseError::MissingKey),
            Some (index) => {
                let (key, value) = tag.split_at_mut (index);
                // `value` still starts with the '=' separator; skip it.
                (&*key, unescape_tag (&mut value[1..]).unwrap_or_default ())
            }
            None => (&*tag, ""),
        };

        if let Some (slot) = IRC_TAG_MAP.iter ().position (|known| *known == key) {
            self.tags[slot] = Some (value);
        }
    }
    Ok (())
}
/// Returns the command of the message as a slice of the stored line.
#[must_use]
pub fn command (&self) -> &str
{
    let (start, end) = self.command_index;
    &self.line[start..end]
}
/// Returns the number of parameters stored for this message.
#[must_use]
pub fn param_len (&self) -> usize
{
self.param_indexes.len ()
}
/// Returns the parameter at position `index` (zero-based), or `None` when
/// the message has no parameter at that position.
#[must_use]
pub fn param_at (&self, index: usize) -> Option<&str>
{
    let &(start, end) = self.param_indexes.get (index)?;
    Some (&self.line[start..end])
}
pub fn params (&self) -> impl Iterator- + '_
{
self.param_indexes
.iter ()
.map (|(start, end)| &self.line[*start..*end])
}
}