use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions}; use fastembed::similarity::cosine_similarity; use serde::Deserialize; /// A named tag with example texts used for semantic similarity scoring. /// /// # Example /// /// ```rust /// use search_hub::tagging::TagDef; /// /// let tag = TagDef { /// name: "rust".into(), /// examples: vec!["Rust ownership".into(), "cargo build system".into()], /// }; /// assert_eq!(tag.name, "rust"); /// ``` #[derive(Debug, Clone, Deserialize)] pub struct TagDef { /// The tag label (e.g. "rust", "web"). pub name: String, /// Example phrases that exemplify this tag for embedding comparison. pub examples: Vec, } /// Return the hardcoded default set of 25 tags with 3 example texts each. /// /// # Example /// /// ```rust /// use search_hub::tagging::default_tags; /// /// let tags = default_tags(); /// assert_eq!(tags.len(), 25); /// assert_eq!(tags[0].name, "rust"); /// ``` pub fn default_tags() -> Vec { vec![ TagDef { name: "rust".into(), examples: vec![ "Rust ownership and borrow checker enforcing memory safety at compile time".into(), "pattern matching with enums and the Result type for error handling".into(), "cargo build system, crates.io ecosystem, and procedural macros".into(), ]}, TagDef { name: "python".into(), examples: vec![ "Python indentation-based syntax, list comprehensions, and generator expressions".into(), "dynamic typing, duck typing, and Python's data model protocols".into(), "pip packaging, virtual environments, and Python import system".into(), ]}, TagDef { name: "web".into(), examples: vec![ "HTML semantic markup, accessibility attributes, and document structure".into(), "CSS layout with flexbox and grid, responsive design with media queries".into(), "DOM manipulation, event bubbling, and Web API interfaces in the browser".into(), ]}, TagDef { name: "audio".into(), examples: vec![ "music streaming, albums, playlists, and artist discovery".into(), "podcast episodes, RSS feeds, and audio content distribution".into(), "radio stations, live broadcasts, and audio programming".into(), ]}, TagDef { name: "backend".into(), examples: vec![ "HTTP server routing, request handling, and response middleware chains".into(), "connection pooling, ORM patterns, and server-side template rendering".into(), "backend service architecture, message queues, and inter-service communication".into(), ]}, TagDef { name: "devops".into(), examples: vec![ "container images, Dockerfiles, and Kubernetes pod orchestration".into(), "infrastructure provisioning with Terraform and configuration management".into(), "CI/CD build pipelines, artifact management, and deployment strategies".into(), ]}, TagDef { name: "data".into(), examples: vec![ "data frame operations, statistical analysis, and numerical computing".into(), "data visualization with plotting libraries and charting techniques".into(), "ETL workflows, data cleaning, and batch processing pipelines".into(), ]}, TagDef { name: "ai".into(), examples: vec![ "transformer attention mechanisms, tokenization, and embedding layers".into(), "gradient descent, backpropagation, and neural network loss functions".into(), "model quantization, fine-tuning strategies, and inference optimization".into(), ]}, TagDef { name: "linux".into(), examples: vec![ "file permission bits, process management, and signal handling".into(), "piping stdout, redirecting file descriptors, and shell expansion rules".into(), "package managers, init systems, and systemd unit files".into(), ]}, TagDef { name: "security".into(), examples: vec![ "authentication tokens, OAuth flows, and JWT session handling".into(), "input sanitization, parameterized queries, and XSS/CSRF prevention".into(), "certificate authorities, TLS handshakes, and mTLS configurations".into(), ]}, TagDef { name: "design".into(), examples: vec![ "design tokens, component libraries, and design system consistency".into(), "typographic scale, whitespace rhythm, and visual hierarchy principles".into(), "color contrast, WCAG accessibility ratios, and responsive breakpoints".into(), ]}, TagDef { name: "mobile".into(), examples: vec![ "touch gesture handling, viewport sizing, and responsive mobile layouts".into(), "app lifecycle, push notifications, and background task management".into(), "native platform APIs, mobile sensors, and cross-platform mobile frameworks".into(), ]}, TagDef { name: "gaming".into(), examples: vec![ "game loop architecture, frame-rate independence, and delta time".into(), "physics simulation, collision detection, and spatial partitioning".into(), "shader programs, GPU rendering pipeline, and 3D transformations".into(), ]}, TagDef { name: "tutorial".into(), examples: vec![ "beginner-friendly walkthroughs with code examples and expected output".into(), "learning objectives, prerequisite knowledge, and progressive skill building".into(), "interactive code playgrounds, exercises, and quiz-based reinforcement".into(), ]}, TagDef { name: "news".into(), examples: vec![ "version bumps, deprecation timelines, and migration announcements".into(), "community announcements, conference talks, and ecosystem updates".into(), "release notes, changelogs, and feature release highlights".into(), ]}, TagDef { name: "video".into(), examples: vec![ "video streaming platforms, channels, and content creation".into(), "video editing, encoding formats, and transcoding workflows".into(), "live streaming, video on demand, and media playback".into(), ]}, TagDef { name: "tools".into(), examples: vec![ "text editor configuration, IDE plugins, and developer workflow tooling".into(), "version control workflows, git branching strategies, and merge patterns".into(), "debugger breakpoints, profiling tools, and performance tracing utilities".into(), ]}, TagDef { name: "database".into(), examples: vec![ "SQL table schemas, foreign key relationships, and constraint design".into(), "index structures, query plan analysis, and query performance tuning".into(), "ACID transactions, isolation levels, and connection pool configuration".into(), ]}, TagDef { name: "cli".into(), examples: vec![ "command argument parsing, subcommand patterns, and flag conventions".into(), "terminal output formatting, colored logging, and progress indicators".into(), "stdin/stdout pipes, exit codes, and shell completion scripts".into(), ]}, TagDef { name: "social".into(), examples: vec![ "social media platforms, feeds, and community discussions".into(), "user profiles, followers, and content sharing features".into(), "messaging systems, real-time chat, and social networking APIs".into(), ]}, TagDef { name: "testing".into(), examples: vec![ "unit test assertions, test fixtures, and parametrized test cases".into(), "mocking external dependencies, test doubles, and fake implementations".into(), "integration tests, end-to-end testing, and continuous testing in CI".into(), ]}, TagDef { name: "javascript".into(), examples: vec![ "JavaScript closures, prototypal inheritance, and the event loop".into(), "async/await patterns, Promise chaining, and callback conventions".into(), "ES modules, npm packages, and JavaScript bundler tooling".into(), ]}, TagDef { name: "api".into(), examples: vec![ "RESTful resource design, URL patterns, and HTTP method semantics".into(), "request validation, error response formatting, and status code conventions".into(), "API versioning, rate limiting, and OpenAPI specification documents".into(), ]}, TagDef { name: "documentation".into(), examples: vec![ "API reference docs, docstrings, and inline code annotations".into(), "architecture decision records and design documentation practices".into(), "README writing, project wikis, and onboarding guides for contributors".into(), ]}, TagDef { name: "productivity".into(), examples: vec![ "habit tracking, time management, and personal workflow optimization".into(), "note-taking systems, knowledge base management, and personal wikis".into(), "task organization, prioritization frameworks, and automation of repetitive work".into(), ]}, ] } /// Engine that embeds content and scores it against tag prototypes using cosine similarity. /// /// # Example /// /// ```ignore /// let tags = search_hub::tagging::default_tags(); /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40) /// .expect("failed to init tagging engine"); /// let matched = engine.tags_for("the rust programming language borrow checker", 3) /// .expect("tagging failed"); /// assert!(matched.contains(&"rust".to_string())); /// ``` pub struct TaggingEngine { model: TextEmbedding, tag_examples: Vec<(String, Vec>)>, threshold: f32, } impl TaggingEngine { /// Create a new tagging engine from the given tag definitions. /// /// Downloads the ONNX embedding model on first run (cached afterwards). /// /// # Parameters /// /// * `tags` - Slice of `TagDef` entries (from config or `default_tags()`). /// * `threshold` - Minimum cosine-similarity score (0.0 to 1.0) for a tag /// to be assigned. Default 0.40 in `tags_for()` but can /// be overridden per-call with `tags_for_with_threshold()`. /// /// # Returns /// /// A `TaggingEngine` ready to score content. /// /// # Errors /// /// Returns an error if the embedding model cannot be loaded or the /// tag examples fail to embed. /// /// # Example /// /// ```ignore /// let tags = search_hub::tagging::default_tags(); /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.60) /// .expect("model init"); /// ``` pub fn new(tags: &[TagDef], threshold: f32) -> anyhow::Result { let mut model = TextEmbedding::try_new( TextInitOptions::new(EmbeddingModel::BGESmallENV15) .with_show_download_progress(true), )?; let mut all_examples: Vec = Vec::new(); let mut tag_indices: Vec<(usize, &str)> = Vec::new(); for (ti, tag) in tags.iter().enumerate() { for example in &tag.examples { tag_indices.push((ti, &tag.name)); all_examples.push(format!("passage: {}", example)); } } let embeddings = model.embed(all_examples, None)?; let mut tag_examples: Vec<(String, Vec>)> = tags .iter() .map(|t| (t.name.clone(), Vec::new())) .collect(); for ((ti, _name), emb) in tag_indices.iter().zip(embeddings.iter()) { tag_examples[*ti].1.push(emb.clone()); } Ok(Self { model, tag_examples, threshold }) } fn truncate(content: &str, max_chars: usize) -> &str { let end = content.char_indices() .take(max_chars) .last() .map(|(i, c)| i + c.len_utf8()) .unwrap_or(content.len()); &content[..end.min(content.len())] } fn score_content(&mut self, content: &str) -> anyhow::Result> { let truncated = Self::truncate(content, 2000); let emb = self.model.embed( vec![format!("passage: {}", truncated)], None, )?; if emb.is_empty() { return Ok(Vec::new()); } let query_emb = &emb[0]; let mut scores: Vec<(usize, f32)> = self.tag_examples .iter() .enumerate() .map(|(i, (_, examples))| { let max_sim = examples .iter() .map(|proto| cosine_similarity(query_emb, proto)) .fold(f32::NEG_INFINITY, f32::max); (i, max_sim) }) .collect(); scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); Ok(scores .into_iter() .map(|(i, score)| (self.tag_examples[i].0.clone(), score)) .collect()) } /// Score `content` against all tag prototypes and return tags above the /// configured threshold. /// /// # Parameters /// /// * `content` - The text to tag (e.g. page body converted to Markdown). /// * `max_tags` - Maximum number of tags to return. /// /// # Returns /// /// A `Vec` of tag names matching the content, sorted by score /// descending. /// /// # Errors /// /// Returns an error if the embedding model fails to process the content. /// /// # Example /// /// ```ignore /// let tags = search_hub::tagging::default_tags(); /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40) /// .expect("model init"); /// let matched = engine.tags_for("the rust programming language", 3) /// .expect("tagging failed"); /// println!("{:?}", matched); /// ``` pub fn tags_for(&mut self, content: &str, max_tags: usize) -> anyhow::Result> { Ok(self .tags_for_with_threshold(content, max_tags, self.threshold)? .into_iter() .map(|(tag, _)| tag) .collect()) } /// Score `content` and return tag-score pairs above a custom threshold. /// /// # Parameters /// /// * `content` - The text to tag. /// * `max_tags` - Maximum number of tags to return. /// * `threshold` - Minimum cosine-similarity score (0.0 to 1.0). /// /// # Returns /// /// A `Vec<(String, f32)>` of (tag_name, score) matching the content, /// sorted by score descending. /// /// # Errors /// /// Returns an error if the embedding model fails to process the content. /// /// # Example /// /// ```ignore /// let tags = search_hub::tagging::default_tags(); /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags) /// .expect("model init"); /// let matched = engine.tags_for_with_threshold("rust programming", 5, 0.30) /// .expect("tagging failed"); /// for (tag, score) in &matched { /// println!("{}: {:.3}", tag, score); /// } /// ``` pub fn tags_for_with_threshold( &mut self, content: &str, max_tags: usize, threshold: f32, ) -> anyhow::Result> { let scored = self.score_content(content)?; Ok(scored .into_iter() .filter(|(_, score)| *score >= threshold) .take(max_tags) .collect()) } }