// New paste Repaste Download
/*
* parser.rs - An IRCv3 parser.
*
* Written on Wednesday, 4 September 2024 by oldfashionedcow
*
* Copyright 2024 Cow
*
* This file is part of Beryllium.
*
* Beryllium is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Beryllium is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Beryllium.  If not, see <http://www.gnu.org/licenses/>.
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
use smallvec::SmallVec;
use std::fmt;
use thiserror::Error;
/// Splits `s` in place around the first occurrence of `delim`.
///
/// Returns the bytes before the delimiter and, when the delimiter was found,
/// `Some` of the bytes after it (the delimiter itself belongs to neither
/// half). When `delim` does not occur, the whole slice is returned unchanged
/// together with `None`.
#[must_use]
pub (crate) fn bytesep (s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>)
{
    match s.iter ().position (|&byte| byte == delim) {
        Some (at) => {
            let (head, rest) = s.split_at_mut (at);
            // `rest` starts with the delimiter, so skip over it.
            (head, Some (&mut rest[1..]))
        }
        None => (s, None),
    }
}
/// Errors that can be produced while parsing a raw IRC line.
///
/// NOTE(review): only `EmptyCommand`, `MissingDelim`, `MissingKey` and
/// `MissingTags` are constructed by the code visible in this file; the other
/// variants are presumably used elsewhere or reserved - confirm before
/// removing any of them.
#[derive (Debug, Error)]
pub enum ParseError
{
    /// The line is empty, or no command could be located in it.
    #[error ("Cannot parse command-less line!")]
    EmptyCommand,
    /// Not constructed in the visible code.
    #[error ("Cannot split line without separator!")]
    NoSeparator,
    /// A tag of the form `=value`, i.e. with an empty key, was encountered.
    #[error ("Cannot parse tags with missing key!")]
    MissingKey,
    /// Not constructed in the visible code.
    #[error ("Cannot parse line with missing parameter!")]
    MissingParameter,
    /// Not constructed in the visible code; the tag parser in this file
    /// reports a trailing semicolon as `MissingTags` instead.
    #[error ("Cannot parse tags with trailing semicolon!")]
    TrailingSemicolon,
    /// The tag section contains an empty entry (e.g. `;;`, a trailing `;`,
    /// or a bare `@`).
    #[error ("Cannot parse line with missing tags!")]
    MissingTags,
    /// Not constructed in the visible code.
    #[error ("Non last params cannot have spaces!")]
    NonLastParamSpaces,
    /// Not constructed in the visible code.
    #[error ("Non last params cannot start with colon!")]
    NonLastParamColon,
    /// The line starts with `@` but contains no space separating the tags
    /// from the rest of the message.
    #[error ("Cannot parse line with missing delimiter!")]
    MissingDelim,
}
/// Number of message tags the parser recognises; also the length of the
/// per-message tag value array.
pub (crate) const IRC_TAG_MAX: usize = 6;
/// Keys of the recognised message tags.
///
/// The position of a key in this table is the index of the slot in
/// `Message::tags` that holds the value for that key. Tags whose key is not
/// listed here are ignored by the tag parser.
pub (crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [
    "batch",
    "msgid",
    "+draft/react",
    "+draft/reply",
    "time",
    "+typing",
];
/// Unescapes an IRCv3 tag value in place and returns the unescaped prefix.
///
/// The recognised escape sequences are `\:` (semicolon), `\s` (space),
/// `\\` (backslash), `\r` (carriage return) and `\n` (line feed). For any
/// other escaped character the backslash is dropped and the character is
/// kept. A lone backslash at the very end of the value produces no output,
/// as recommended by the IRCv3 message-tags specification.
///
/// The result is never longer than the input, so the input buffer is reused:
/// the returned slice is a prefix of `input`, and the bytes of `input` after
/// that prefix are overwritten with NUL bytes.
///
/// # Errors
///
/// Returns the `Utf8Error` from re-validating the unescaped bytes. Given the
/// transformation performed here this is not expected to happen for a valid
/// `str` input.
pub (crate) fn unescape_tag (input: &mut str) -> Result<&str, std::str::Utf8Error>
{
    // SAFETY: the loop below only removes ASCII backslashes and replaces
    // ASCII bytes with other ASCII bytes; every byte >= 0x80 is copied
    // unchanged and in order, so multi-byte sequences stay intact in the
    // compacted prefix. The stale tail is filled with NUL bytes before the
    // borrow ends, so the whole buffer is valid UTF-8 again when control
    // returns to safe code.
    let bytes = unsafe { input.as_bytes_mut () };
    let mut read = 0;
    let mut write = 0;
    while read < bytes.len () {
        let byte = bytes[read];
        read += 1;
        if byte != b'\\' {
            bytes[write] = byte;
            write += 1;
            continue;
        }
        if read == bytes.len () {
            // Lone trailing backslash: emit nothing.
            break;
        }
        bytes[write] = match bytes[read] {
            b':' => b';',
            b's' => b' ',
            b'r' => b'\r',
            b'n' => b'\n',
            // Covers `\\` as well as unknown escapes: keep the character,
            // drop the backslash.
            other => other,
        };
        read += 1;
        write += 1;
    }
    // The left-over tail may begin in the middle of a multi-byte character;
    // blank it so that `input` as a whole remains a valid `str`.
    bytes[write..].fill (b'\0');
    std::str::from_utf8 (&bytes[..write])
}
/// A parsed IRC message.
///
/// Tag values borrow from the byte buffer that was handed to the parser
/// (lifetime `'a`); everything after the tag section is copied into `line`
/// and addressed through byte ranges.
#[derive(Debug)]
pub struct Message<'a>
{
    // Values of the recognised tags, indexed like `IRC_TAG_MAP`. `None`
    // means the tag was not present; a tag without a value is stored as "".
    tags: [Option<&'a str>; IRC_TAG_MAX],
    // Owned, lossily UTF-8 decoded text of the message without the tag
    // section. All index pairs below are byte offsets into this string.
    line: String,
    // (start, end) of the source/prefix, excluding the leading ':'.
    source_index: Option<(usize, usize)>,
    // (start, end) of the command.
    command_index: (usize, usize),
    // (start, end) of each parameter, in order of appearance.
    param_indexes: SmallVec<[(usize, usize); 15]>,
}
impl<'a> fmt::Display for Message<'a>
{
    fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        let mut tags_iter = self.tags.iter ().flatten ();
        if let Some (first_tag) = tags_iter.next () {
            write! (f, "@{}", first_tag)?;
            for tag in tags_iter {
                write! (f, ";{}", tag)?;
            }
            write! (f, " ")?;
        }
        if let Some ((start, end)) = self.source_index {
            write! (f, ":{} ", &self.line[start..end])?;
        }
        write! (f, "{}", self.command ())?;
        for param in self.params () {
            write! (f, " {}", param)?;
        }
        Ok (())
    }
}
impl<'a> Message<'a> {
    /// Parses a single raw IRC line.
    ///
    /// The expected shape is `[@tags ][:source ]command[ params...]`. The tag
    /// section is unescaped in place inside `line`, and the tag values stored
    /// in the message borrow from it. Everything after the tag section is
    /// lossily decoded as UTF-8 and copied into the message.
    ///
    /// Line terminators are not stripped here.
    ///
    /// # Errors
    ///
    /// * `ParseError::EmptyCommand` - the line is empty, the source is not
    ///   followed by a space, or no ASCII alphanumeric character that could
    ///   start a command was found.
    /// * `ParseError::MissingDelim` - the line starts with `@` but contains
    ///   no space after the tag section.
    /// * `ParseError::MissingTags` / `ParseError::MissingKey` - the tag
    ///   section is malformed, see `parse_tags`.
    pub fn from (line: &'a mut [u8]) -> Result<Self, ParseError>
    {
        let first = match line.first () {
            Some (byte) => *byte,
            // The message is completely empty.
            None => return Err (ParseError::EmptyCommand),
        };
        // `String::new` does not allocate; the real text is assigned below.
        let mut message = Message {
            tags: [None; IRC_TAG_MAX],
            line: String::new (),
            source_index: None,
            command_index: (0, 0),
            param_indexes: SmallVec::new (),
        };
        if first == b'@' {
            match bytesep (line, b' ') {
                (tags, Some (rest)) => {
                    // More than one space may separate the tags from the
                    // rest of the message; skip the extra ones so that a
                    // following ':' source marker is still at index 0.
                    let skip = rest.iter ().take_while (|&&byte| byte == b' ').count ();
                    message.line = String::from_utf8_lossy (&rest[skip..]).into_owned ();
                    // tags[0] is the '@' marker itself.
                    message.parse_tags (&mut tags[1..])?;
                }
                // Without a space there are either no tags or no command.
                (_, None) => return Err (ParseError::MissingDelim),
            }
        } else {
            message.line = String::from_utf8_lossy (line).into_owned ();
        }
        // Lossy decoding may over-allocate; the string is kept for the
        // lifetime of the message, so release the slack.
        message.line.shrink_to_fit ();

        // Optional source: ":<source> ".
        let mut search_start = 0;
        if message.line.starts_with (':') {
            let space_index = match message.line.find (' ') {
                Some (space_index) => space_index,
                // No space after the source means there is no command.
                None => return Err (ParseError::EmptyCommand),
            };
            message.source_index = Some ((1, space_index));
            search_start = space_index + 1;
        }

        // Command: starts at the first ASCII alphanumeric character and runs
        // up to the next space or the end of the line.
        let command_start = match message.line[search_start..]
            .find (|c: char| c.is_ascii_alphanumeric ())
        {
            Some (offset) => search_start + offset,
            None => return Err (ParseError::EmptyCommand),
        };
        let line_len = message.line.len ();
        let command_end = match message.line[command_start..].find (' ') {
            Some (offset) => command_start + offset,
            None => line_len,
        };
        message.command_index = (command_start, command_end);

        // Parameters: space separated; one starting with ':' is the trailing
        // parameter and extends to the end of the line.
        let mut cursor = command_end;
        loop {
            // Skip separating whitespace. Non-ASCII characters are part of a
            // parameter and must not be skipped.
            let start = match message.line[cursor..].find (|c: char| !c.is_ascii_whitespace ()) {
                Some (skipped) => cursor + skipped,
                // Only whitespace (or nothing) is left.
                None => break,
            };
            if message.line[start..].starts_with (':') {
                message.param_indexes.push ((start + 1, line_len));
                break;
            }
            match message.line[start..].find (' ') {
                Some (length) => {
                    message.param_indexes.push ((start, start + length));
                    cursor = start + length;
                }
                None => {
                    // Last parameter: runs to the end of the line.
                    message.param_indexes.push ((start, line_len));
                    break;
                }
            }
        }
        Ok (message)
    }

    /// Parses the tag section (without the leading `@`) and stores the values
    /// of all recognised tags.
    ///
    /// # Errors
    ///
    /// * `ParseError::MissingTags` - the section contains an empty entry:
    ///   it is empty altogether, contains `;;`, or ends with `;`.
    /// * `ParseError::MissingKey` - an entry has the form `=value`.
    fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError>
    {
        // Iterating instead of recursing keeps the stack depth independent
        // of the number of tags in the (untrusted) input.
        for raw in tags.split_mut (|&byte| byte == b';') {
            if raw.is_empty () {
                return Err (ParseError::MissingTags);
            }
            self.store_tag (raw)?;
        }
        Ok (())
    }

    /// Parses one `key[=value]` entry and records its value if the key is
    /// listed in `IRC_TAG_MAP`.
    fn store_tag (&mut self, raw: &'a mut [u8]) -> Result<(), ParseError>
    {
        // The specification states that IRCv3 tags should be UTF-8, and that
        // tags which are not may be dropped by clients.
        let tag = match std::str::from_utf8_mut (raw) {
            Ok (tag) => tag,
            Err (_) => return Ok (()),
        };
        let (key, value) = match tag.find ('=') {
            Some (index) => {
                let (key, value) = tag.split_at_mut (index);
                if key.is_empty () {
                    // "=value" has no key, which is invalid.
                    return Err (ParseError::MissingKey);
                }
                let key: &str = key; // Make key immutable
                // value[0] is the '=' separator.
                (key, unescape_tag (&mut value[1..]).unwrap_or_default ())
            }
            None => {
                let tag: &str = tag; // Make tag immutable
                (tag, "")
            }
        };
        if let Some (slot) = IRC_TAG_MAP.iter ().position (|known| *known == key) {
            self.tags[slot] = Some (value);
        }
        Ok (())
    }

    /// Returns the command of the message.
    #[must_use]
    pub fn command (&self) -> &str
    {
        &self.line[self.command_index.0..self.command_index.1]
    }

    /// Returns the number of parameters.
    #[must_use]
    pub fn param_len (&self) -> usize
    {
        self.param_indexes.len ()
    }

    /// Returns the parameter at `index`, or `None` when `index` is out of
    /// range. A trailing parameter is returned without its leading `:`.
    #[must_use]
    pub fn param_at (&self, index: usize) -> Option<&str>
    {
        self.param_indexes
            .get (index)
            .map (|(start, end)| &self.line[*start..*end])
    }

    /// Iterates over all parameters in order of appearance.
    pub fn params (&self) -> impl Iterator<Item = &str> + '_
    {
        self.param_indexes
            .iter ()
            .map (|(start, end)| &self.line[*start..*end])
    }
}
// Filename: src/main.rs. Size: 14kb. View raw, hex, or download this file.
//
// This paste expires on 2026-01-03 22:48:24.235644+00:00. Pasted through v1-api.