View paste MTHQ

New paste Repaste Download

	`/*`
	`* parser.rs - An IRCv3 parser.`
	`*`
	`* Written on Wednesday, 4 September 2024 by oldfashionedcow`
	`*`
	`* Copyright 2024 Cow`
	`*`
	`* This file is part of Beryllium.`
	`*`
	`* Beryllium is free software: you can redistribute it and/or modify`
	`* it under the terms of the GNU Affero General Public License as published by`
	`* the Free Software Foundation, either version 3 of the License, or`
	`* (at your option) any later version.`
	`*`
	`* Beryllium is distributed in the hope that it will be useful,`
	`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
	`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
	`* GNU Affero General Public License for more details.`
	`*`
	`* You should have received a copy of the GNU Affero General Public License`
	`* along with Beryllium. If not, see <http://www.gnu.org/licenses/>.`
	`*`
	`* SPDX-License-Identifier: AGPL-3.0-or-later`
	`*/`

	`use smallvec::SmallVec;`
	`use std::fmt;`
	`use thiserror::Error;`

	`#[must_use]`
	`pub (crate) fn bytesep (s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>)`
	`{`
	`let mut index = 0;`
	`let len = s.len ();`

	`while index < len {`
	`if s[index] == delim {`
	`let (first, second) = s.split_at_mut (index);`
	`return (first, Some (&mut second[1..]));`
	`}`
	`index += 1;`
	`}`

	`(s, None)`
	`}`

	`#[derive (Debug, Error)]`
	`pub enum ParseError`
	`{`
	`#[error ("Cannot parse command-less line!")]`
	`EmptyCommand,`
	`#[error ("Cannot split line without separator!")]`
	`NoSeparator,`
	`#[error ("Cannot parse tags with missing key!")]`
	`MissingKey,`
	`#[error ("Cannot parse line with missing parameter!")]`
	`MissingParameter,`
	`#[error ("Cannot parse tags with trailing semicolon!")]`
	`TrailingSemicolon,`
	`#[error ("Cannot parse line with missing tags!")]`
	`MissingTags,`
	`#[error ("Non last params cannot have spaces!")]`
	`NonLastParamSpaces,`
	`#[error ("Non last params cannot start with colon!")]`
	`NonLastParamColon,`
	`#[error ("Cannot parse line with missing delimiter!")]`
	`MissingDelim,`
	`}`

	`pub (crate) const IRC_TAG_MAX: usize = 6;`
	`pub (crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [`
	`"batch",`
	`"msgid",`
	`"+draft/react",`
	`"+draft/reply",`
	`"time",`
	`"+typing",`
	`];`

	`pub (crate) fn unescape_tag (input: &mut str) -> Result<&str, std::str::Utf8Error>`
	`{`
	`let bytes = unsafe { input.as_bytes_mut () };`
	`let mut source_pos = 0;`
	`let mut dest_pos = 0;`

	`while source_pos < bytes.len () {`
	`if bytes[source_pos] == b'\\' && source_pos + 1 < bytes.len () {`
	`source_pos += 1;`
	`let unescaped_char = match bytes[source_pos] {`
	`b':' => b';',`
	`b's' => b' ',`
	`b'r' => b'\r',`
	`b'n' => b'\n',`
	`_ => bytes[source_pos],`
	`};`
	`bytes[dest_pos] = unescaped_char;`
	`} else {`
	`bytes[dest_pos] = bytes[source_pos];`
	`}`
	`source_pos += 1;`
	`dest_pos += 1;`
	`}`

	`std::str::from_utf8 (&bytes[..dest_pos])`
	`}`

	`#[derive(Debug)]`
	`pub struct Message<'a>`
	`{`
	`tags: [Option<&'a str>; IRC_TAG_MAX],`
	`line: String,`
	`source_index: Option<(usize, usize)>,`
	`command_index: (usize, usize),`
	`param_indexes: SmallVec<[(usize, usize); 15]>,`
	`}`

	`impl<'a> fmt::Display for Message<'a>`
	`{`
	`fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result`
	`{`
	`let mut tags_iter = self.tags.iter ().flatten ();`

	`if let Some (first_tag) = tags_iter.next () {`
	`write! (f, "@{}", first_tag)?;`

	`for tag in tags_iter {`
	`write! (f, ";{}", tag)?;`
	`}`

	`write! (f, " ")?;`
	`}`

	`if let Some ((start, end)) = self.source_index {`
	`write! (f, ":{} ", &self.line[start..end])?;`
	`}`

	`write! (f, "{}", self.command ())?;`

	`for param in self.params () {`
	`write! (f, " {}", param)?;`
	`}`

	`Ok (())`
	`}`
	`}`

	`impl<'a> Message<'a> {`
	`pub fn from (line: &'a mut [u8]) -> Result<Self, ParseError>`
	`{`
	`let mut message = Message {`
	`tags: [None; IRC_TAG_MAX],`
	`line: String::with_capacity (line.len ()),`
	`source_index: None,`
	`command_index: (0, 0),`
	`param_indexes: SmallVec::new (),`
	`};`

	`if let Some (first) = line.get_mut (0) {`
	`if *first == b'@' {`
	`if let (tags, Some (line)) = bytesep (line, b' ') {`
	`message.line = String::from_utf8_lossy (line).to_string ();`
	`message.parse_tags (&mut tags[1..])?;`
	`// While this comes at the cost of a realloc, it is worth it`
	`// as we will be storing the entire line in the struct.`
	`message.line.shrink_to_fit ();`
	`} else {`
	`// As there is no space delimiter in the byte slice,`
	`// we can be sure that there are either missing tags`
	`// or a missing command, and hence return early.`
	`return Err (ParseError::MissingDelim);`
	`}`
	`} else {`
	`// Earlier, line was created with the size of the length of`
	`// the byte array, so there is no need to shrink to fit.`
	`message.line = String::from_utf8_lossy (line).to_string ();`
	`}`
	`} else {`
	`// We know at this point that the message is completly empty`
	`// and hence can return early.`
	`return Err (ParseError::EmptyCommand);`
	`}`

	`let mut source_offset: usize = 0;`

	`if message.line.starts_with (':') {`
	`// Parse prefix`
	`source_offset += 1;`
	`let space_index = match message.line.find (' ') {`
	`Some (space_index) => space_index,`
	`None => return Err (ParseError::EmptyCommand), // If there is no space delimiter,`
	`// the message doesn't contain a`
	`// command, so we can return early.`
	`};`

	`message.source_index = Some ((1, space_index));`
	`}`

	`message.command_index.0 = match message.line.as_str ()`
	`[(source_offset + message.source_index.unwrap_or_default ().1)..]`
	`.find (\|arg0: char\| char::is_ascii_alphanumeric (&arg0))`
	`{`
	`Some (command_index) => {`
	`source_offset + message.source_index.unwrap_or_default ().1 + command_index`
	`}`
	`None => return Err (ParseError::EmptyCommand), // If there is no alphanumeric char`
	`// after the source, the message`
	`// doesn't contain a valid command,`
	`// so we can return early.`
	`};`

	`match message.line.as_str ()[message.command_index.0..].find (' ') {`
	`Some (command_index) => {`
	`message.command_index.1 = message.command_index.0 + command_index`
	`}`
	`None => {`
	`message.command_index.1 = message.line.len (); // If there is no space after the command, the rest of the`
	`// entire message is the command, and we can return early.`
	`return Ok (message);`
	`}`
	`}`

	`let mut command_offset = match message.line.as_str ()[message.command_index.1..]`
	`.find (\|c: char\| c.is_ascii () && !c.is_ascii_whitespace ())`
	`{`
	`Some (command_index) => message.command_index.1 + command_index,`
	`None => {`
	`// If there are no more non-whitespace ascii characters after`
	`// the command, we've reached the end of the message.`
	`message.command_index.1 = message.line.len ();`
	`return Ok (message);`
	`}`
	`};`

	`loop {`
	`if message.line.as_str ()[command_offset..].starts_with (':') {`
	`message`
	`.param_indexes`
	`.push ((command_offset + 1, message.line.len ()));`
	`break;`
	`}`

	`match message.line.as_str ()[command_offset..]`
	`.find (\|c: char\| c.is_ascii () && !c.is_ascii_whitespace ())`
	`{`
	`Some (index) => match message.line.as_str ()[(command_offset + index)..].find (' ') {`
	`Some (param_index) => {`
	`message`
	`.param_indexes`
	`.push ((command_offset + index, command_offset + index + param_index));`
	`command_offset = command_offset + index + param_index + 1;`
	`command_offset += message.line[command_offset..]`
	`.chars ()`
	`.take_while (\|&ch\| ch == ' ')`
	`.count ();`
	`}`
	`None => {`
	`message`
	`.param_indexes`
	`.push ((command_offset, message.line.len ()));`
	`// There are no more valid characters making up a parameter,`
	`// so we can break as there are no more parameters left to store.`
	`break;`
	`}`
	`},`

	`None => {`
	`// There are no more valid characters making up a parameter,`
	`// so we can break as there are no more parameters left to store.`
	`break;`
	`}`
	`}`
	`}`

	`Ok (message)`
	`}`

	`fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError>`
	`{`
	`match bytesep (tags, b';') {`
	`(head, Some (tail)) => {`
	`if head.is_empty () {`
	`// The head component being empty indicates`
	`// an empty split between tags, i.e. ";;",`
	`// meaning that at least one tag is missing.`
	`return Err (ParseError::MissingTags);`
	`}`

	`// The specification states that IRCv3 tags`
	`// should be UTF8, and tags that are invalid`
	`// UTF8 may be dropped by clients.`
	`if let Ok (tag) = std::str::from_utf8_mut (head) {`
	`let (key, value) = if let Some (index) = tag.find ('=') {`
	`let (key, value) = tag.split_at_mut (index);`
	`if key.is_empty () {`
	`// The key component being empty indicates`
	`// an empty key, i.e. "=value", which is invalid.`
	`return Err (ParseError::MissingKey);`
	`}`
	`let key: &str = key; // Make key immutable`
	`(key, unescape_tag (&mut value[1..]).unwrap_or_default())`
	`} else {`
	`let tag: &str = tag; // Make tag immutable`
	`(tag, "")`
	`};`

	`for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) {`
	`if tag != &key {`
	`continue;`
	`}`
	`self.tags[i] = Some (value);`
	`}`
	`}`

	`self.parse_tags(tail)`
	`}`
	`(head, None) => {`
	`if head.is_empty () {`
	`// The tail component being empty indicates a`
	`// trailing semicolon, i.e. "foo;", meaning`
	`// that at least the last tag is missing.`
	`return Err (ParseError::MissingTags);`
	`}`
	`match std::str::from_utf8_mut (head) {`
	`Ok (tag) => {`
	`let (key, value) = if let Some (index) = tag.find ('=') {`
	`let (key, value) = tag.split_at_mut (index);`
	`if key.is_empty () {`
	`// The key component being empty indicates`
	`// an empty key, i.e. "=value", which is invalid.`
	`return Err (ParseError::MissingKey);`
	`}`
	`let key: &str = key; // Make key immutable`
	`(key, unescape_tag (&mut value[1..]).unwrap_or_default ())`
	`} else {`
	`let tag: &str = tag; // Make tag immutable`
	`(tag, "")`
	`};`

	`for (i, tag) in IRC_TAG_MAP.iter ().enumerate ().take (IRC_TAG_MAX) {`
	`if tag != &key {`
	`continue;`
	`}`
	`self.tags[i] = Some (value);`
	`}`
	`Ok (())`
	`}`
	`// The specification states that IRCv3 tags`
	`// should be UTF8, and tags that are invalid`
	`// UTF8 may be dropped by clients.`
	`Err (_) => Ok (()),`
	`}`
	`}`
	`}`
	`}`

	`#[must_use]`
	`pub fn command (&self) -> &str`
	`{`
	`&self.line[self.command_index.0..self.command_index.1]`
	`}`

	`#[must_use]`
	`pub fn param_len (&self) -> usize`
	`{`
	`self.param_indexes.len ()`
	`}`

	`#[must_use]`
	`pub fn param_at (&self, index: usize) -> Option<&str>`
	`{`
	`self.param_indexes`
	`.get (index)`
	`.map (\|(start, end)\| &self.line[start..end])`
	`}`

	`pub fn params (&self) -> impl Iterator<Item = &str> + '_`
	`{`
	`self.param_indexes`
	`.iter ()`
	`.map (\|(start, end)\| &self.line[start..end])`
	`}`
	`}`

/*
 * parser.rs - An IRCv3 parser.
 *
 * Written on Wednesday, 4 September 2024 by oldfashionedcow
 *
 * Copyright 2024 Cow
 *
 * This file is part of Beryllium.
 *
 * Beryllium is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Beryllium is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Beryllium.  If not, see <http://www.gnu.org/licenses/>.
 *
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

use smallvec::SmallVec;
use std::fmt;
use thiserror::Error;

/// Splits `s` around the first occurrence of `delim`.
///
/// Returns the bytes in front of the delimiter together with, when the
/// delimiter was found, the bytes behind it. The delimiter itself is part
/// of neither half. When `delim` does not occur, the whole slice is handed
/// back unchanged and the second element is `None`.
#[must_use]
pub (crate) fn bytesep (s: &mut [u8], delim: u8) -> (&mut [u8], Option<&mut [u8]>)
{
    match s.iter ().position (|&byte| byte == delim) {
        Some (at) => {
            let (head, rest) = s.split_at_mut (at);
            // `rest` begins with the delimiter, so skip its first byte.
            (head, Some (&mut rest[1..]))
        }
        None => (s, None),
    }
}

/// Reasons why a raw IRC line could not be turned into a `Message`.
///
/// The user-facing text of each variant comes from its `#[error]` attribute.
/// Several variants are not constructed by any code shown in this file; they
/// are kept as part of the public error type.
#[derive (Debug, Error)]
pub enum ParseError
{
    /// No command could be located: the input was empty, a `:source` prefix
    /// was not followed by a space, or no ASCII alphanumeric character
    /// follows the optional source.
    #[error ("Cannot parse command-less line!")]
    EmptyCommand,
    /// Not constructed by the code shown in this file.
    #[error ("Cannot split line without separator!")]
    NoSeparator,
    /// A tag starts with `=`, i.e. it has a value but an empty key.
    #[error ("Cannot parse tags with missing key!")]
    MissingKey,
    /// Not constructed by the code shown in this file.
    #[error ("Cannot parse line with missing parameter!")]
    MissingParameter,
    /// Not constructed by the code shown in this file; an empty tag after a
    /// trailing semicolon is reported as `MissingTags` instead.
    #[error ("Cannot parse tags with trailing semicolon!")]
    TrailingSemicolon,
    /// The tag section contains an empty entry (e.g. `@`, `a;;b` or `a;`).
    #[error ("Cannot parse line with missing tags!")]
    MissingTags,
    /// Not constructed by the code shown in this file.
    #[error ("Non last params cannot have spaces!")]
    NonLastParamSpaces,
    /// Not constructed by the code shown in this file.
    #[error ("Non last params cannot start with colon!")]
    NonLastParamColon,
    /// The line starts with `@` but contains no space separating the tag
    /// section from the rest of the message.
    #[error ("Cannot parse line with missing delimiter!")]
    MissingDelim,
}

/// Number of message tags the parser recognises and keeps.
pub (crate) const IRC_TAG_MAX: usize = 6;
/// Keys of the recognised message tags.
///
/// The position of a key in this table is the slot of `Message::tags` in
/// which the value of that tag is stored; tags with any other key are
/// parsed but not kept.
pub (crate) const IRC_TAG_MAP: [&str; IRC_TAG_MAX] = [
    "batch",
    "msgid",
    "+draft/react",
    "+draft/reply",
    "time",
    "+typing",
];

/// Unescapes an IRCv3 tag value in place and returns the unescaped text.
///
/// The following rewrites are applied:
///
/// * `\:` becomes `;`, `\s` becomes a space, `\r` becomes CR, `\n` becomes LF;
/// * a backslash followed by any other byte yields just that byte
///   (so `\\` becomes a single backslash);
/// * a lone backslash at the very end of the value produces no output.
///
/// The result is never longer than the input, so it is compacted towards
/// the front of `input` and the returned slice borrows that front part.
/// Bytes of `input` left over behind the result are overwritten with NUL.
///
/// # Errors
///
/// Returns the `Utf8Error` reported by `std::str::from_utf8` if the
/// compacted bytes are not valid UTF-8.
pub (crate) fn unescape_tag (input: &mut str) -> Result<&str, std::str::Utf8Error>
{
    // SAFETY: the buffer must hold valid UTF-8 again once this borrow ends.
    // The loop only removes ASCII backslashes or replaces the ASCII byte
    // following one with another ASCII byte, so multi-byte sequences are
    // copied intact, and the stale tail behind the compacted result is
    // overwritten with NUL bytes before returning.
    let bytes = unsafe { input.as_bytes_mut () };
    let len = bytes.len ();
    let mut source_pos = 0;
    let mut dest_pos = 0;

    while source_pos < len {
        let byte = bytes[source_pos];
        source_pos += 1;

        if byte != b'\\' {
            bytes[dest_pos] = byte;
            dest_pos += 1;
            continue;
        }

        if source_pos == len {
            // Lone trailing backslash: the specification asks for no output.
            break;
        }

        let escaped = bytes[source_pos];
        source_pos += 1;
        bytes[dest_pos] = match escaped {
            b':' => b';',
            b's' => b' ',
            b'r' => b'\r',
            b'n' => b'\n',
            // `\\` and unknown escapes both collapse to the escaped byte.
            other => other,
        };
        dest_pos += 1;
    }

    // Compaction may have left the second half of a multi-byte character
    // behind the result; clear it so the whole `str` stays valid UTF-8.
    for stale in &mut bytes[dest_pos..] {
        *stale = 0;
    }

    std::str::from_utf8 (&bytes[..dest_pos])
}

/// A parsed IRC message.
///
/// The message text is kept once in `line`; the source, the command and the
/// parameters are stored as byte ranges into that string rather than as
/// separate strings. Tag values borrow from the caller's input buffer for
/// the lifetime `'a`.
#[derive(Debug)]
pub struct Message<'a>
{
    /// Values of the recognised tags, indexed in parallel with
    /// `IRC_TAG_MAP`; `None` for tags that were not present.
    tags: [Option<&'a str>; IRC_TAG_MAX],
    /// Owned, lossily UTF-8-decoded copy of the message text that follows
    /// the tag section (the whole input when there are no tags).
    line: String,
    /// Byte range `(start, end)` in `line` of the source prefix without its
    /// leading `:`, or `None` when the message has no source.
    source_index: Option<(usize, usize)>,
    /// Byte range `(start, end)` in `line` of the command.
    command_index: (usize, usize),
    /// Byte ranges `(start, end)` in `line` of the parameters, in order.
    param_indexes: SmallVec<[(usize, usize); 15]>,
}

impl<'a> fmt::Display for Message<'a>
{
    fn fmt (&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
    {
        let mut tags_iter = self.tags.iter ().flatten ();

if let Some (first_tag) = tags_iter.next () {
            write! (f, "@{}", first_tag)?;

for tag in tags_iter {
                write! (f, ";{}", tag)?;
            }

write! (f, " ")?;
        }

if let Some ((start, end)) = self.source_index {
            write! (f, ":{} ", &self.line[start..end])?;
        }

write! (f, "{}", self.command ())?;

for param in self.params () {
            write! (f, " {}", param)?;
        }

Ok (())
    }
}

impl<'a> Message<'a> {
    /// Parses one raw IRC line into a `Message`.
    ///
    /// The expected layout is `[@tags SPACE] [:source SPACE] command
    /// [SPACE params...]`. The tag section, when present, is unescaped in
    /// place inside `line` and the stored tag values borrow from it; the
    /// rest of the line is copied into the message, replacing invalid UTF-8
    /// sequences with U+FFFD. A line terminator is not stripped by this
    /// function.
    ///
    /// # Errors
    ///
    /// * `ParseError::EmptyCommand` - the input is empty, a `:source`
    ///   prefix is not followed by a space, or no ASCII alphanumeric
    ///   character follows the optional source.
    /// * `ParseError::MissingDelim` - the line starts with `@` but has no
    ///   space after the tag section.
    /// * `ParseError::MissingTags` / `ParseError::MissingKey` - the tag
    ///   section is malformed, see `parse_tags`.
    pub fn from (line: &'a mut [u8]) -> Result<Self, ParseError>
    {
        if line.is_empty () {
            // A completely empty message cannot contain a command.
            return Err (ParseError::EmptyCommand);
        }

        let mut message = Message {
            tags: [None; IRC_TAG_MAX],
            line: String::new (),
            source_index: None,
            command_index: (0, 0),
            param_indexes: SmallVec::new (),
        };

        let rest: &[u8] = if line[0] == b'@' {
            match bytesep (line, b' ') {
                (tags, Some (after_tags)) => {
                    // Skip the leading '@' of the tag section.
                    message.parse_tags (&mut tags[1..])?;
                    // The separator may be more than one space; without
                    // this a following ":source" would not be recognised.
                    let skipped = after_tags
                        .iter ()
                        .take_while (|&&byte| byte == b' ')
                        .count ();
                    &after_tags[skipped..]
                }
                // Without a space there are either no tags or no command.
                (_, None) => return Err (ParseError::MissingDelim),
            }
        } else {
            &*line
        };
        message.line = String::from_utf8_lossy (rest).into_owned ();

        // Optional ":source " prefix.
        let mut search_from = 0;
        if message.line.starts_with (':') {
            // No space after the source means there is no command at all.
            let space = message.line.find (' ').ok_or (ParseError::EmptyCommand)?;
            message.source_index = Some ((1, space));
            search_from = space + 1;
        }

        // The command starts at the first ASCII alphanumeric character
        // after the source and runs up to the next space (or the end).
        let command_start = message.line[search_from..]
            .find (|c: char| c.is_ascii_alphanumeric ())
            .map (|offset| search_from + offset)
            .ok_or (ParseError::EmptyCommand)?;
        let command_end = message.line[command_start..]
            .find (' ')
            .map_or (message.line.len (), |offset| command_start + offset);
        message.command_index = (command_start, command_end);

        message.parse_params (command_end);

        Ok (message)
    }

    /// Records the byte ranges of all parameters found at or after `start`.
    ///
    /// Parameters are separated by one or more spaces. A parameter that
    /// starts with `:` is the trailing parameter: it extends to the end of
    /// the line (spaces included) and is stored without the colon.
    fn parse_params (&mut self, start: usize)
    {
        let len = self.line.len ();
        let mut cursor = start;

        loop {
            cursor += self.line.as_bytes ()[cursor..]
                .iter ()
                .take_while (|&&byte| byte == b' ')
                .count ();
            if cursor == len {
                break;
            }

            if self.line.as_bytes ()[cursor] == b':' {
                self.param_indexes.push ((cursor + 1, len));
                break;
            }

            let end = self.line[cursor..]
                .find (' ')
                .map_or (len, |offset| cursor + offset);
            self.param_indexes.push ((cursor, end));
            cursor = end;
        }
    }

    /// Parses a `;`-separated tag section (without the leading `@`) and
    /// stores the values of the tags listed in `IRC_TAG_MAP`.
    ///
    /// # Errors
    ///
    /// * `ParseError::MissingTags` - an entry is empty, as in `a;;b`, a
    ///   trailing `a;` or an empty tag section.
    /// * `ParseError::MissingKey` - an entry has a value but no key, as in
    ///   `=value`.
    fn parse_tags (&mut self, tags: &'a mut [u8]) -> Result<(), ParseError>
    {
        let mut remaining = Some (tags);

        while let Some (current) = remaining.take () {
            let (head, tail) = bytesep (current, b';');
            if head.is_empty () {
                return Err (ParseError::MissingTags);
            }

            self.store_tag (head)?;
            remaining = tail;
        }

        Ok (())
    }

    /// Handles a single non-empty `key[=value]` entry of the tag section.
    ///
    /// The value is unescaped in place and kept only if `key` is one of
    /// the keys in `IRC_TAG_MAP`; a tag without `=` is stored with an empty
    /// value. Entries that are not valid UTF-8 are silently skipped.
    fn store_tag (&mut self, raw: &'a mut [u8]) -> Result<(), ParseError>
    {
        // The specification states that IRCv3 tags should be UTF-8, and
        // that tags which are invalid UTF-8 may be dropped by clients.
        let tag = match std::str::from_utf8_mut (raw) {
            Ok (tag) => tag,
            Err (_) => return Ok (()),
        };

        let (key, value): (&str, &'a str) = match tag.find ('=') {
            // "=value": there is nothing in front of the '='.
            Some (0) => return Err (ParseError::MissingKey),
            Some (index) => {
                let (key, value) = tag.split_at_mut (index);
                // `value` still starts with the '='; skip it.
                (&*key, unescape_tag (&mut value[1..]).unwrap_or_default ())
            }
            None => (&*tag, ""),
        };

        if let Some (slot) = IRC_TAG_MAP.iter ().position (|known| *known == key) {
            self.tags[slot] = Some (value);
        }

        Ok (())
    }

    /// Returns the command of the message (e.g. `PRIVMSG` or a numeric).
    #[must_use]
    pub fn command (&self) -> &str
    {
        let (start, end) = self.command_index;
        &self.line[start..end]
    }

    /// Returns the number of parameters of the message.
    #[must_use]
    pub fn param_len (&self) -> usize
    {
        self.param_indexes.len ()
    }

    /// Returns the parameter at position `index`, or `None` when the
    /// message has fewer parameters.
    #[must_use]
    pub fn param_at (&self, index: usize) -> Option<&str>
    {
        self.param_indexes
            .get (index)
            .map (|&(start, end)| &self.line[start..end])
    }

    /// Iterates over all parameters in the order they appear in the line.
    pub fn params (&self) -> impl Iterator<Item = &str> + '_
    {
        self.param_indexes
            .iter ()
            .map (|&(start, end)| &self.line[start..end])
    }
}

Filename: src/main.rs. Size: 14kb. View raw, hex, or download this file.

This paste expires on 2026-01-03 22:48:24.235644+00:00. Pasted through v1-api.

Are you sure you'd like to remove this file?