// pinakes/crates/pinakes-core/src/links.rs

//! Markdown link extraction and management for Obsidian-style bidirectional
//! links.
//!
//! This module provides:
//! - Wikilink extraction (`[[target]]` and `[[target|display]]`)
//! - Embed extraction (`![[target]]`)
//! - Markdown link extraction (`[text](path)` for internal links)
//! - Link resolution strategies
//! - Context extraction for backlink previews

use std::{path::Path, sync::LazyLock};

use regex::Regex;
use uuid::Uuid;

use crate::model::{LinkType, MarkdownLink, MediaId};

// Each regex is compiled lazily, the first time its static is dereferenced,
// and then reused by every later call, which avoids recompilation per call.
// The result is stored as an `Option` so that a failed compilation yields
// `None` (the extractors below then return no links) rather than panicking.

/// Matches `[[target]]` and `[[target|display]]`. Capture group 1 is the
/// target; the optional group 2 is the display text. The pattern also matches
/// the `[[...]]` part of an embed, so the caller has to check for a preceding
/// `!` itself.
static WIKILINK_RE: LazyLock<Option<Regex>> =
  LazyLock::new(|| Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok());

/// Matches `![[target]]` and `![[target|display]]`, with the same capture
/// groups as `WIKILINK_RE`.
static EMBED_RE: LazyLock<Option<Regex>> =
  LazyLock::new(|| Regex::new(r"!\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok());

/// Matches `[text](path)`. Capture group 1 is the link text and group 2 is
/// everything between the parentheses. Images (`![alt](src)`) match too, so
/// the caller has to check for a preceding `!` itself.
static MARKDOWN_LINK_RE: LazyLock<Option<Regex>> =
  LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").ok());

/// Configuration for context extraction around links.
///
/// Character budget named for the text before a link; consumed by
/// `extract_context` when it trims a context string.
const CONTEXT_CHARS_BEFORE: usize = 50;
/// Character budget named for the text after a link; consumed by
/// `extract_context` when it trims a context string.
const CONTEXT_CHARS_AFTER: usize = 50;

/// Collect every internal link found in a markdown document.
///
/// The result lists, in this order:
/// 1. wikilinks — `[[target]]` and `[[target|display text]]`
/// 2. embeds — `![[target]]`
/// 3. markdown links — `[text](path)` (internal paths only, no http/https)
///
/// Every returned link carries `source_media_id` as its source.
#[must_use]
pub fn extract_links(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  extract_wikilinks(source_media_id, content)
    .into_iter()
    .chain(extract_embeds(source_media_id, content))
    .chain(extract_markdown_links(source_media_id, content))
    .collect()
}

/// Collect every wikilink in `content`.
///
/// Recognises `[[target]]` and `[[target|display text]]`. A match directly
/// preceded by `!` is an embed (`![[...]]`) and is skipped here. The link text
/// is the display text when present, otherwise the target. Targets are left
/// unresolved (`target_media_id` is `None`).
fn extract_wikilinks(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  let mut found = Vec::new();
  let Some(pattern) = WIKILINK_RE.as_ref() else {
    return found;
  };

  for (idx, text) in content.lines().enumerate() {
    // Stored line numbers are 1-indexed and saturate at `i32::MAX`.
    let line_number =
      i32::try_from(idx).unwrap_or(i32::MAX).saturating_add(1);

    for caps in pattern.captures_iter(text) {
      let (Some(whole), Some(target_group)) = (caps.get(0), caps.get(1))
      else {
        continue;
      };

      // A `!` right before the opening brackets makes this an embed, not a
      // wikilink.
      if text[..whole.start()].ends_with('!') {
        continue;
      }

      let target = target_group.as_str().trim();
      let label = caps.get(2).map_or_else(
        || target.to_string(),
        |alias| alias.as_str().trim().to_string(),
      );
      let context =
        extract_context(content, idx, whole.start(), whole.end());

      found.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target.to_string(),
        target_media_id: None, // Filled in by a later resolution step
        link_type: LinkType::Wikilink,
        link_text: Some(label),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  found
}

/// Collect every embed in `content`.
///
/// Recognises `![[target]]` (an optional `|display text` part is accepted as
/// well). The link text is the display text when present, otherwise the
/// target. Targets are left unresolved (`target_media_id` is `None`).
fn extract_embeds(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  let mut found = Vec::new();
  let Some(pattern) = EMBED_RE.as_ref() else {
    return found;
  };

  for (idx, text) in content.lines().enumerate() {
    // Stored line numbers are 1-indexed and saturate at `i32::MAX`.
    let line_number =
      i32::try_from(idx).unwrap_or(i32::MAX).saturating_add(1);

    for caps in pattern.captures_iter(text) {
      let (Some(whole), Some(target_group)) = (caps.get(0), caps.get(1))
      else {
        continue;
      };

      let target = target_group.as_str().trim();
      let label = caps.get(2).map_or_else(
        || target.to_string(),
        |alias| alias.as_str().trim().to_string(),
      );
      let context =
        extract_context(content, idx, whole.start(), whole.end());

      found.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target.to_string(),
        target_media_id: None,
        link_type: LinkType::Embed,
        link_text: Some(label),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  found
}

/// Collect every internal `[text](path)` link in `content`.
///
/// Skipped matches:
/// - images, i.e. a match directly preceded by `!`
/// - paths starting with `http://`, `https://`, `mailto:` or `ftp://`
/// - anchor-only paths (starting with `#`)
///
/// For the remaining links, anything from the first `#` onwards is dropped
/// from the stored target path. Targets are left unresolved
/// (`target_media_id` is `None`).
fn extract_markdown_links(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  // Prefixes that mark a link as external.
  const EXTERNAL_PREFIXES: [&str; 4] =
    ["http://", "https://", "mailto:", "ftp://"];

  let mut found = Vec::new();
  let Some(pattern) = MARKDOWN_LINK_RE.as_ref() else {
    return found;
  };

  for (idx, text) in content.lines().enumerate() {
    // Stored line numbers are 1-indexed and saturate at `i32::MAX`.
    let line_number =
      i32::try_from(idx).unwrap_or(i32::MAX).saturating_add(1);

    for caps in pattern.captures_iter(text) {
      let Some(whole) = caps.get(0) else {
        continue;
      };

      // `![alt](image.png)` is an image, not a link: the character right
      // before the opening `[` is `!`.
      if text[..whole.start()].ends_with('!') {
        continue;
      }

      let (Some(label_group), Some(path_group)) = (caps.get(1), caps.get(2))
      else {
        continue;
      };
      let label = label_group.as_str().trim();
      let path = path_group.as_str().trim();

      // External and anchor-only links never point at another file.
      let is_external = EXTERNAL_PREFIXES
        .iter()
        .any(|prefix| path.starts_with(*prefix));
      if is_external || path.starts_with('#') {
        continue;
      }

      // Only the part before the anchor is used for resolution.
      let target_path =
        path.split_once('#').map_or(path, |(before, _)| before);

      let context =
        extract_context(content, idx, whole.start(), whole.end());

      found.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target_path.to_string(),
        target_media_id: None,
        link_type: LinkType::MarkdownLink,
        link_text: Some(label.to_string()),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  found
}

/// Extract surrounding context for a link.
///
/// `line_num` is the 0-indexed line of `content` that holds the link;
/// `start` and `end` are the byte offsets of the link within that line.
///
/// The result depends on the line:
/// - `line_num` past the last line: an empty string.
/// - A line shorter than 30 bytes that is not the first line: the trimmed
///   previous, current and next lines joined by single spaces, capped at
///   `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20` characters.
/// - A line of at most `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER` bytes:
///   the line unchanged.
/// - A longer line: the link itself plus up to `CONTEXT_CHARS_BEFORE`
///   characters before it and up to `CONTEXT_CHARS_AFTER` characters after
///   it, so the link is always part of its own context.
///
/// Offsets past the end of the line are clamped. Offsets that do not fall on
/// a character boundary fall back to the leading
/// `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER` characters of the line.
fn extract_context(
  content: &str,
  line_num: usize,
  start: usize,
  end: usize,
) -> String {
  let max_len = CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER;

  // Walk the line iterator only as far as needed instead of collecting every
  // line of the document for each link.
  let mut lines = content.lines().skip(line_num.saturating_sub(1));
  let prev = if line_num > 0 { lines.next() } else { None };
  let Some(line) = lines.next() else {
    return String::new();
  };

  // A short line says little on its own, so widen the context to the
  // neighbouring lines. This only applies when a previous line exists.
  if let Some(prev) = prev.filter(|_| line.len() < 30) {
    let next = lines.next().unwrap_or("");
    return format!("{} {} {}", prev.trim(), line.trim(), next.trim())
      .chars()
      .take(max_len + 20)
      .collect();
  }

  if line.len() <= max_len {
    return line.to_string();
  }

  // Long line: keep a window around the link rather than the start of the
  // line, which may be far away from the link.
  let end = end.min(line.len());
  let start = start.min(end);
  if !(line.is_char_boundary(start) && line.is_char_boundary(end)) {
    return line.chars().take(max_len).collect();
  }

  // Byte offset of the character `CONTEXT_CHARS_BEFORE` characters before the
  // link, or of the first character when fewer are available.
  let window_start = line[..start]
    .char_indices()
    .rev()
    .take(CONTEXT_CHARS_BEFORE)
    .last()
    .map_or(start, |(offset, _)| offset);
  // Byte offset just past the `CONTEXT_CHARS_AFTER`-th character after the
  // link, or the end of the line when fewer are available.
  let window_end = line[end..]
    .char_indices()
    .nth(CONTEXT_CHARS_AFTER)
    .map_or(line.len(), |(offset, _)| end + offset);

  line[window_start..window_end].to_string()
}

/// Link resolution strategies for finding target media items.
///
/// NOTE(review): nothing in the visible part of this file constructs or
/// matches on this enum; `resolve_link_candidates` returns plain paths without
/// a strategy tag. Presumably it is consumed elsewhere in the crate — confirm
/// before changing variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResolutionStrategy {
  /// Direct path match
  DirectPath,
  /// Relative to source directory
  RelativeToSource,
  /// Filename with .md extension added
  FilenameWithMd,
  /// Filename-only search (Obsidian-style)
  FilenameOnly,
}

/// Resolve a link target to possible file paths.
///
/// Returns a list of candidate paths to check, in order of preference and
/// without duplicates:
///
/// 1. If `target` contains a path separator (`/` or `\`): the target itself
///    when it is absolute, otherwise the target joined onto each of
///    `root_dirs`.
/// 2. The target relative to the directory of `source_path`, followed (unless
///    the target already ends in `.md`, case-insensitively) by the same path
///    with its extension replaced by `md` and with `.md` appended to the full
///    file name.
/// 3. The target, with `.md` appended if missing, joined onto each of
///    `root_dirs`.
///
/// `target` is trimmed first. A blank target yields no candidates: joining an
/// empty name would otherwise produce the source directory itself, a sibling
/// `<source dir>.md` file, and `<root>/.md`, none of which the link refers to.
///
/// No file-system access happens here; the caller checks which candidates
/// exist.
#[must_use]
pub fn resolve_link_candidates(
  target: &str,
  source_path: &Path,
  root_dirs: &[std::path::PathBuf],
) -> Vec<std::path::PathBuf> {
  // Clean up the target path
  let target = target.trim();
  if target.is_empty() {
    return Vec::new();
  }

  // Computed once; both the relative and the root-dir steps need it.
  let has_md_extension = target.to_ascii_lowercase().ends_with(".md");
  let mut candidates = Vec::new();

  // 1. Direct path - if it looks like a path
  if target.contains('/') || target.contains('\\') {
    let direct = std::path::PathBuf::from(target);
    if direct.is_absolute() {
      candidates.push(direct);
    } else {
      // Relative to each root dir
      candidates.extend(root_dirs.iter().map(|root| root.join(&direct)));
    }
  }

  // 2. Relative to source file's directory
  if let Some(source_dir) = source_path.parent() {
    let relative = source_dir.join(target);
    candidates.push(relative.clone());

    // Also try with .md extension
    if !has_md_extension {
      // `with_extension` replaces an existing extension (`a.v2` -> `a.md`)...
      candidates.push(relative.with_extension("md"));
      // ...while this variant appends to the full name (`a.v2` -> `a.v2.md`).
      let mut with_md = relative.clone();
      with_md.set_file_name(format!(
        "{}.md",
        relative.file_name().unwrap_or_default().to_string_lossy()
      ));
      candidates.push(with_md);
    }
  }

  // 3. Filename with .md extension in root dirs
  let target_with_md = if has_md_extension {
    target.to_string()
  } else {
    format!("{target}.md")
  };
  candidates.extend(root_dirs.iter().map(|root| root.join(&target_with_md)));

  // 4. Remove duplicates while preserving order
  let mut seen = std::collections::HashSet::new();
  candidates.retain(|p| seen.insert(p.clone()));

  candidates
}

/// Read the `aliases` entry from a document's YAML frontmatter.
///
/// Obsidian uses the `aliases` field in frontmatter to define alternative
/// names for a note that can be used in wikilinks.
///
/// Both forms are accepted: a list (non-string items are dropped) and a
/// single string. An empty vector is returned when parsing fails, when there
/// is no frontmatter data, when the data is not a map, when `aliases` is
/// absent, or when it holds any other kind of value.
#[must_use]
pub fn extract_aliases(content: &str) -> Vec<String> {
  let matter = gray_matter::Matter::<gray_matter::engine::YAML>::new();
  let Ok(parsed) = matter.parse(content) else {
    return Vec::new();
  };

  let Some(gray_matter::Pod::Hash(fields)) = parsed.data else {
    return Vec::new();
  };

  match fields.get("aliases") {
    Some(gray_matter::Pod::Array(items)) => {
      items
        .iter()
        .filter_map(|item| {
          match item {
            gray_matter::Pod::String(name) => Some(name.clone()),
            _ => None,
          }
        })
        .collect()
    },
    // Single alias written as a plain string
    Some(gray_matter::Pod::String(single)) => vec![single.clone()],
    _ => Vec::new(),
  }
}

// Unit tests for link extraction, candidate resolution and alias parsing.
#[cfg(test)]
mod tests {
  use super::*;

  // Fixed source id shared by all tests (the nil UUID).
  fn test_media_id() -> MediaId {
    MediaId(Uuid::nil())
  }

  // A bare `[[target]]` yields one wikilink whose text falls back to the
  // target.
  #[test]
  fn test_extract_simple_wikilink() {
    let content = "This is a [[simple link]] in text.";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 1);
    assert_eq!(links[0].target_path, "simple link");
    assert_eq!(links[0].link_type, LinkType::Wikilink);
    assert_eq!(links[0].link_text, Some("simple link".to_string()));
  }

  // `[[target|display]]` keeps the target and uses the display part as text.
  #[test]
  fn test_extract_wikilink_with_display() {
    let content = "Check out [[target note|this article]] for more.";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 1);
    assert_eq!(links[0].target_path, "target note");
    assert_eq!(links[0].link_text, Some("this article".to_string()));
  }

  // `![[target]]` is reported once, as an embed and not also as a wikilink.
  #[test]
  fn test_extract_embed() {
    let content = "Here is an image: ![[image.png]]";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 1);
    assert_eq!(links[0].target_path, "image.png");
    assert_eq!(links[0].link_type, LinkType::Embed);
  }

  // `[text](path)` with a relative path is reported as a markdown link.
  #[test]
  fn test_extract_markdown_link() {
    let content = "Read [the documentation](docs/README.md) for details.";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 1);
    assert_eq!(links[0].target_path, "docs/README.md");
    assert_eq!(links[0].link_type, LinkType::MarkdownLink);
    assert_eq!(links[0].link_text, Some("the documentation".to_string()));
  }

  // `https://` and `mailto:` targets are not internal links.
  #[test]
  fn test_skip_external_links() {
    let content = "Visit [our site](https://example.com) or [email \
                   us](mailto:test@test.com).";
    let links = extract_links(test_media_id(), content);

    assert!(links.is_empty());
  }

  // A document mixing all three link kinds reports each of them.
  #[test]
  fn test_multiple_links() {
    let content = r"
# My Note

This links to [[Note A]] and also [[Note B|Note B Title]].

We also have a markdown link to [config](./config.md).

And an embedded image: ![[diagram.png]]
";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 4);

    let types: Vec<_> = links.iter().map(|l| l.link_type).collect();
    assert!(types.contains(&LinkType::Wikilink));
    assert!(types.contains(&LinkType::Embed));
    assert!(types.contains(&LinkType::MarkdownLink));
  }

  // Reported line numbers are 1-indexed.
  #[test]
  fn test_line_numbers() {
    let content = "Line 1\n[[link on line 2]]\nLine 3";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 1);
    assert_eq!(links[0].line_number, Some(2));
  }

  // A bare note name produces at least one candidate ending in `.md`.
  #[test]
  fn test_resolve_candidates() {
    let source_path = std::path::Path::new("/notes/projects/readme.md");
    let root_dirs = vec![std::path::PathBuf::from("/notes")];

    let candidates =
      resolve_link_candidates("My Note", source_path, &root_dirs);

    // Should include relative path and .md variations
    assert!(!candidates.is_empty());
    assert!(
      candidates
        .iter()
        .any(|p| p.to_string_lossy().contains("My Note.md"))
    );
  }

  // A YAML list under `aliases` is returned in document order.
  #[test]
  fn test_extract_aliases() {
    let content = r"---
title: My Note
aliases:
  - Alternative Name
  - Another Alias
---

# Content here
";
    let aliases = extract_aliases(content);
    assert_eq!(aliases, vec!["Alternative Name", "Another Alias"]);
  }

  // A scalar `aliases` value is returned as a one-element list.
  #[test]
  fn test_extract_single_alias() {
    let content = r"---
title: My Note
aliases: Single Alias
---

# Content
";
    let aliases = extract_aliases(content);
    assert_eq!(aliases, vec!["Single Alias"]);
  }

  // A wikilink and an embed on the same line are each reported exactly once,
  // with the right type.
  #[test]
  fn test_wikilink_not_matching_embed() {
    let content = "A wikilink [[note]] and an embed ![[image.png]]";
    let links = extract_links(test_media_id(), content);

    assert_eq!(links.len(), 2);
    let wikilinks: Vec<_> = links
      .iter()
      .filter(|l| l.link_type == LinkType::Wikilink)
      .collect();
    let embeds: Vec<_> = links
      .iter()
      .filter(|l| l.link_type == LinkType::Embed)
      .collect();

    assert_eq!(wikilinks.len(), 1);
    assert_eq!(embeds.len(), 1);
    assert_eq!(wikilinks[0].target_path, "note");
    assert_eq!(embeds[0].target_path, "image.png");
  }

  // Markdown images share the `[..](..)` shape but must never be reported,
  // even when they sit on the same line as real links.
  #[test]
  fn test_exclude_markdown_images() {
    // Test that markdown images ![alt](image.png) are NOT extracted as links
    let content = r"
# My Note

Here's a regular link: [documentation](docs/guide.md)

Here's an image: ![Screenshot](images/screenshot.png)

Another link: [config](config.toml)

Multiple images:
![Logo](logo.png) and ![Banner](banner.jpg)

Mixed: [link](file.md) then ![image](pic.png) then [another](other.md)
";
    let links = extract_links(test_media_id(), content);

    // Should only extract the 4 markdown links, not the 4 images
    assert_eq!(
      links.len(),
      4,
      "Should extract 4 links, not images. Got: {links:#?}"
    );

    // Verify all extracted items are MarkdownLink type (not images)
    for link in &links {
      assert_eq!(
        link.link_type,
        LinkType::MarkdownLink,
        "Link '{}' should be MarkdownLink type",
        link.target_path
      );
    }

    // Verify correct targets were extracted (links, not images)
    let targets: Vec<&str> =
      links.iter().map(|l| l.target_path.as_str()).collect();
    assert!(
      targets.contains(&"docs/guide.md"),
      "Should contain docs/guide.md"
    );
    assert!(
      targets.contains(&"config.toml"),
      "Should contain config.toml"
    );
    assert!(targets.contains(&"file.md"), "Should contain file.md");
    assert!(targets.contains(&"other.md"), "Should contain other.md");

    // Verify images were NOT extracted
    assert!(
      !targets.contains(&"images/screenshot.png"),
      "Should NOT contain screenshot.png (it's an image)"
    );
    assert!(
      !targets.contains(&"logo.png"),
      "Should NOT contain logo.png (it's an image)"
    );
    assert!(
      !targets.contains(&"banner.jpg"),
      "Should NOT contain banner.jpg (it's an image)"
    );
    assert!(
      !targets.contains(&"pic.png"),
      "Should NOT contain pic.png (it's an image)"
    );
  }

  // An image whose `!` is the first byte of the line is still skipped.
  #[test]
  fn test_edge_case_image_at_line_start() {
    // Test edge case: image at the very start of a line
    let content = "![Image at start](start.png)\n[Link](file.md)";
    let links = extract_links(test_media_id(), content);

    assert_eq!(
      links.len(),
      1,
      "Should only extract the link, not the image"
    );
    assert_eq!(links[0].target_path, "file.md");
    assert_eq!(links[0].link_type, LinkType::MarkdownLink);
  }
}