use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions};
use fastembed::similarity::cosine_similarity;
use serde::Deserialize;

/// A tag label together with the example phrases that characterise it.
///
/// `TaggingEngine::new` embeds every entry of `examples` and later compares
/// content embeddings against them, so the phrases act as the tag's
/// prototypes. The type derives `Deserialize`, which allows tag definitions
/// to be read from serialized data instead of using [`default_tags`].
///
/// # Example
///
/// ```rust
/// use search_hub::tagging::TagDef;
///
/// let tag = TagDef {
///     name: "rust".into(),
///     examples: vec!["Rust ownership".into(), "cargo build system".into()],
/// };
/// assert_eq!(tag.name, "rust");
/// ```
#[derive(Debug, Clone, Deserialize)]
pub struct TagDef {
    /// The tag label returned to callers (e.g. "rust", "web").
    pub name: String,
    /// Example phrases that exemplify this tag; each one is embedded
    /// separately and used as a comparison target.
    pub examples: Vec<String>,
}

/// Build the built-in tag set: 25 tags, each with exactly 3 example phrases.
///
/// The data lives in a fixed-size table whose type pins both counts at
/// compile time; the table is converted into owned [`TagDef`] values on every
/// call, in table order.
///
/// # Example
///
/// ```rust
/// use search_hub::tagging::default_tags;
///
/// let tags = default_tags();
/// assert_eq!(tags.len(), 25);
/// assert_eq!(tags[0].name, "rust");
/// ```
pub fn default_tags() -> Vec<TagDef> {
    // (tag name, example phrases) in the order callers observe.
    const TABLE: [(&str, [&str; 3]); 25] = [
        ("rust", [
            "Rust ownership and borrow checker enforcing memory safety at compile time",
            "pattern matching with enums and the Result type for error handling",
            "cargo build system, crates.io ecosystem, and procedural macros",
        ]),
        ("python", [
            "Python indentation-based syntax, list comprehensions, and generator expressions",
            "dynamic typing, duck typing, and Python's data model protocols",
            "pip packaging, virtual environments, and Python import system",
        ]),
        ("web", [
            "HTML semantic markup, accessibility attributes, and document structure",
            "CSS layout with flexbox and grid, responsive design with media queries",
            "DOM manipulation, event bubbling, and Web API interfaces in the browser",
        ]),
        ("audio", [
            "music streaming, albums, playlists, and artist discovery",
            "podcast episodes, RSS feeds, and audio content distribution",
            "radio stations, live broadcasts, and audio programming",
        ]),
        ("backend", [
            "HTTP server routing, request handling, and response middleware chains",
            "connection pooling, ORM patterns, and server-side template rendering",
            "backend service architecture, message queues, and inter-service communication",
        ]),
        ("devops", [
            "container images, Dockerfiles, and Kubernetes pod orchestration",
            "infrastructure provisioning with Terraform and configuration management",
            "CI/CD build pipelines, artifact management, and deployment strategies",
        ]),
        ("data", [
            "data frame operations, statistical analysis, and numerical computing",
            "data visualization with plotting libraries and charting techniques",
            "ETL workflows, data cleaning, and batch processing pipelines",
        ]),
        ("ai", [
            "transformer attention mechanisms, tokenization, and embedding layers",
            "gradient descent, backpropagation, and neural network loss functions",
            "model quantization, fine-tuning strategies, and inference optimization",
        ]),
        ("linux", [
            "file permission bits, process management, and signal handling",
            "piping stdout, redirecting file descriptors, and shell expansion rules",
            "package managers, init systems, and systemd unit files",
        ]),
        ("security", [
            "authentication tokens, OAuth flows, and JWT session handling",
            "input sanitization, parameterized queries, and XSS/CSRF prevention",
            "certificate authorities, TLS handshakes, and mTLS configurations",
        ]),
        ("design", [
            "design tokens, component libraries, and design system consistency",
            "typographic scale, whitespace rhythm, and visual hierarchy principles",
            "color contrast, WCAG accessibility ratios, and responsive breakpoints",
        ]),
        ("mobile", [
            "touch gesture handling, viewport sizing, and responsive mobile layouts",
            "app lifecycle, push notifications, and background task management",
            "native platform APIs, mobile sensors, and cross-platform mobile frameworks",
        ]),
        ("gaming", [
            "game loop architecture, frame-rate independence, and delta time",
            "physics simulation, collision detection, and spatial partitioning",
            "shader programs, GPU rendering pipeline, and 3D transformations",
        ]),
        ("tutorial", [
            "beginner-friendly walkthroughs with code examples and expected output",
            "learning objectives, prerequisite knowledge, and progressive skill building",
            "interactive code playgrounds, exercises, and quiz-based reinforcement",
        ]),
        ("news", [
            "version bumps, deprecation timelines, and migration announcements",
            "community announcements, conference talks, and ecosystem updates",
            "release notes, changelogs, and feature release highlights",
        ]),
        ("video", [
            "video streaming platforms, channels, and content creation",
            "video editing, encoding formats, and transcoding workflows",
            "live streaming, video on demand, and media playback",
        ]),
        ("tools", [
            "text editor configuration, IDE plugins, and developer workflow tooling",
            "version control workflows, git branching strategies, and merge patterns",
            "debugger breakpoints, profiling tools, and performance tracing utilities",
        ]),
        ("database", [
            "SQL table schemas, foreign key relationships, and constraint design",
            "index structures, query plan analysis, and query performance tuning",
            "ACID transactions, isolation levels, and connection pool configuration",
        ]),
        ("cli", [
            "command argument parsing, subcommand patterns, and flag conventions",
            "terminal output formatting, colored logging, and progress indicators",
            "stdin/stdout pipes, exit codes, and shell completion scripts",
        ]),
        ("social", [
            "social media platforms, feeds, and community discussions",
            "user profiles, followers, and content sharing features",
            "messaging systems, real-time chat, and social networking APIs",
        ]),
        ("testing", [
            "unit test assertions, test fixtures, and parametrized test cases",
            "mocking external dependencies, test doubles, and fake implementations",
            "integration tests, end-to-end testing, and continuous testing in CI",
        ]),
        ("javascript", [
            "JavaScript closures, prototypal inheritance, and the event loop",
            "async/await patterns, Promise chaining, and callback conventions",
            "ES modules, npm packages, and JavaScript bundler tooling",
        ]),
        ("api", [
            "RESTful resource design, URL patterns, and HTTP method semantics",
            "request validation, error response formatting, and status code conventions",
            "API versioning, rate limiting, and OpenAPI specification documents",
        ]),
        ("documentation", [
            "API reference docs, docstrings, and inline code annotations",
            "architecture decision records and design documentation practices",
            "README writing, project wikis, and onboarding guides for contributors",
        ]),
        ("productivity", [
            "habit tracking, time management, and personal workflow optimization",
            "note-taking systems, knowledge base management, and personal wikis",
            "task organization, prioritization frameworks, and automation of repetitive work",
        ]),
    ];

    TABLE
        .iter()
        .map(|&(label, phrases)| TagDef {
            name: label.to_owned(),
            examples: phrases.iter().map(|&text| text.to_owned()).collect(),
        })
        .collect()
}

/// Engine that embeds content and scores it against tag prototypes using cosine similarity.
///
/// The engine owns the embedding model plus one pre-computed embedding per
/// tag example. Scoring methods take `&mut self` because embedding text
/// requires mutable access to the model.
///
/// # Example
///
/// ```ignore
/// let tags = search_hub::tagging::default_tags();
/// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40)
///     .expect("failed to init tagging engine");
/// let matched = engine.tags_for("the rust programming language borrow checker", 3)
///     .expect("tagging failed");
/// assert!(matched.contains(&"rust".to_string()));
/// ```
pub struct TaggingEngine {
    // Embedding model used both for the tag examples (once, in `new`) and
    // for every piece of content that is scored.
    model: TextEmbedding,
    // One entry per tag, in the order the tags were supplied:
    // (tag name, embedding vector of each of that tag's examples).
    tag_examples: Vec<(String, Vec<Vec<f32>>)>,
    // Minimum similarity score applied by `tags_for`.
    threshold: f32,
}

impl TaggingEngine {
    /// Create a new tagging engine from the given tag definitions.
    ///
    /// Loads the `BGESmallENV15` embedding model (download progress display
    /// is enabled, so the model may be fetched when it is not yet available
    /// locally), embeds every example phrase of every tag in one batch, and
    /// stores the resulting vectors grouped by tag.
    ///
    /// # Parameters
    ///
    /// * `tags`      - Slice of `TagDef` entries (from config or `default_tags()`).
    /// * `threshold` - Minimum cosine-similarity score a tag must reach to be
    ///                  returned by `tags_for()`. The value is stored as given;
    ///                  `tags_for_with_threshold()` ignores it and takes its
    ///                  own threshold per call.
    ///
    /// # Returns
    ///
    /// A `TaggingEngine` ready to score content.
    ///
    /// # Errors
    ///
    /// Returns an error if the embedding model cannot be loaded, if the tag
    /// examples fail to embed, or if the model returns a different number of
    /// vectors than the number of examples submitted.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let tags = search_hub::tagging::default_tags();
    /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.60)
    ///     .expect("model init");
    /// ```
    pub fn new(tags: &[TagDef], threshold: f32) -> anyhow::Result<Self> {
        let mut model = TextEmbedding::try_new(
            TextInitOptions::new(EmbeddingModel::BGESmallENV15)
                .with_show_download_progress(true),
        )?;

        // Flatten all examples into a single batch, tag by tag, so the model
        // is invoked once. The same "passage: " prefix is applied to scored
        // content, keeping both sides of the comparison consistent.
        let all_examples: Vec<String> = tags
            .iter()
            .flat_map(|tag| tag.examples.iter())
            .map(|example| format!("passage: {}", example))
            .collect();
        let expected = all_examples.len();

        let embeddings = model.embed(all_examples, None)?;

        // The regrouping below relies on a one-to-one, order-preserving
        // mapping between inputs and outputs; fail loudly instead of
        // silently assigning vectors to the wrong tag.
        anyhow::ensure!(
            embeddings.len() == expected,
            "embedding model returned {} vectors for {} tag examples",
            embeddings.len(),
            expected
        );

        // Hand each tag its own consecutive run of vectors, moving them out
        // of the batch result rather than cloning.
        let mut remaining = embeddings.into_iter();
        let tag_examples: Vec<(String, Vec<Vec<f32>>)> = tags
            .iter()
            .map(|tag| {
                let prototypes = remaining.by_ref().take(tag.examples.len()).collect();
                (tag.name.clone(), prototypes)
            })
            .collect();

        Ok(Self { model, tag_examples, threshold })
    }

    /// Return the longest prefix of `content` holding at most `max_chars`
    /// Unicode scalar values (`char`s).
    ///
    /// The cut always lands on a `char` boundary, so slicing cannot panic on
    /// multi-byte text. When `content` has `max_chars` characters or fewer it
    /// is returned unchanged; `max_chars == 0` yields the empty string.
    fn truncate(content: &str, max_chars: usize) -> &str {
        // The byte offset of the character at index `max_chars` is exactly
        // where the prefix must end; if there is no such character the whole
        // string already fits.
        match content.char_indices().nth(max_chars) {
            Some((cut, _)) => &content[..cut],
            None => content,
        }
    }

    /// Embed `content` and rank every tag by its best-matching example.
    ///
    /// Only the first 2000 characters of `content` are embedded. A tag's
    /// score is the highest cosine similarity between the content embedding
    /// and any of that tag's example embeddings; a tag without examples
    /// scores negative infinity. The result lists every tag as
    /// `(name, score)`, highest score first, with ties kept in tag order.
    ///
    /// Returns an empty list if the model produces no embedding, and an
    /// error if embedding fails.
    fn score_content(&mut self, content: &str) -> anyhow::Result<Vec<(String, f32)>> {
        let snippet = Self::truncate(content, 2000);
        let emb = self.model.embed(vec![format!("passage: {}", snippet)], None)?;
        let query_emb = match emb.first() {
            Some(vector) => vector,
            None => return Ok(Vec::new()),
        };

        let mut ranked: Vec<(String, f32)> = Vec::with_capacity(self.tag_examples.len());
        for (label, prototypes) in &self.tag_examples {
            let mut best = f32::NEG_INFINITY;
            for proto in prototypes {
                best = f32::max(best, cosine_similarity(query_emb, proto));
            }
            ranked.push((label.clone(), best));
        }

        // Stable sort, descending by score.
        ranked.sort_by(|lhs, rhs| {
            rhs.1
                .partial_cmp(&lhs.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        Ok(ranked)
    }

    /// Tag `content` using the threshold the engine was constructed with.
    ///
    /// This is `tags_for_with_threshold` called with the stored threshold,
    /// with the scores stripped from the result.
    ///
    /// # Parameters
    ///
    /// * `content`  - The text to tag (e.g. page body converted to Markdown).
    /// * `max_tags` - Maximum number of tags to return.
    ///
    /// # Returns
    ///
    /// The names of the matching tags, best match first; at most `max_tags`
    /// entries.
    ///
    /// # Errors
    ///
    /// Returns an error if the embedding model fails to process the content.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let tags = search_hub::tagging::default_tags();
    /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40)
    ///     .expect("model init");
    /// let matched = engine.tags_for("the rust programming language", 3)
    ///     .expect("tagging failed");
    /// println!("{:?}", matched);
    /// ```
    pub fn tags_for(&mut self, content: &str, max_tags: usize) -> anyhow::Result<Vec<String>> {
        let cutoff = self.threshold;
        let scored = self.tags_for_with_threshold(content, max_tags, cutoff)?;

        let mut names = Vec::with_capacity(scored.len());
        for (name, _score) in scored {
            names.push(name);
        }
        Ok(names)
    }

    /// Tag `content` with an explicit threshold and report the scores.
    ///
    /// The threshold stored in the engine is not consulted; only the
    /// `threshold` argument decides which tags qualify.
    ///
    /// # Parameters
    ///
    /// * `content`   - The text to tag.
    /// * `max_tags`  - Maximum number of tags to return.
    /// * `threshold` - Minimum cosine-similarity score a tag must reach
    ///                  (inclusive).
    ///
    /// # Returns
    ///
    /// Up to `max_tags` `(tag_name, score)` pairs whose score is at least
    /// `threshold`, highest score first.
    ///
    /// # Errors
    ///
    /// Returns an error if the embedding model fails to process the content.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let tags = search_hub::tagging::default_tags();
    /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40)
    ///     .expect("model init");
    /// let matched = engine.tags_for_with_threshold("rust programming", 5, 0.30)
    ///     .expect("tagging failed");
    /// for (tag, score) in &matched {
    ///     println!("{}: {:.3}", tag, score);
    /// }
    /// ```
    pub fn tags_for_with_threshold(
        &mut self,
        content: &str,
        max_tags: usize,
        threshold: f32,
    ) -> anyhow::Result<Vec<(String, f32)>> {
        let mut kept: Vec<(String, f32)> = Vec::new();
        for (tag, score) in self.score_content(content)? {
            // Stop as soon as the quota is filled (also covers max_tags == 0).
            if kept.len() == max_tags {
                break;
            }
            if score >= threshold {
                kept.push((tag, score));
            }
        }
        Ok(kept)
    }
}
