//! Markdown link extraction and management for Obsidian-style bidirectional //! links. //! //! This module provides: //! - Wikilink extraction (`[[target]]` and `[[target|display]]`) //! - Embed extraction (`![[target]]`) //! - Markdown link extraction (`[text](path)` for internal links) //! - Link resolution strategies //! - Context extraction for backlink previews use std::{path::Path, sync::LazyLock}; use regex::Regex; use uuid::Uuid; use crate::model::{LinkType, MarkdownLink, MediaId}; // Compile regexes once at startup to avoid recompilation on every call. // Stored as Option so that initialization failure is handled gracefully // rather than panicking. static WIKILINK_RE: LazyLock> = LazyLock::new(|| Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok()); static EMBED_RE: LazyLock> = LazyLock::new(|| Regex::new(r"!\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok()); static MARKDOWN_LINK_RE: LazyLock> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").ok()); /// Configuration for context extraction around links const CONTEXT_CHARS_BEFORE: usize = 50; const CONTEXT_CHARS_AFTER: usize = 50; /// Extract all markdown links from file content. /// /// This extracts: /// - Wikilinks: `[[target]]` and `[[target|display text]]` /// - Embeds: `![[target]]` /// - Markdown links: `[text](path)` (internal paths only, no http/https) #[must_use] pub fn extract_links( source_media_id: MediaId, content: &str, ) -> Vec { let mut links = Vec::new(); // Extract wikilinks: [[target]] or [[target|display]] links.extend(extract_wikilinks(source_media_id, content)); // Extract embeds: ![[target]] links.extend(extract_embeds(source_media_id, content)); // Extract markdown links: [text](path) links.extend(extract_markdown_links(source_media_id, content)); links } /// Extract wikilinks from content. 
/// Matches: `[[target]]` or `[[target|display text]]` but NOT `![[...]]`
/// (embeds)
///
/// The target and the optional display text are trimmed. When no display
/// text is given, the target doubles as the link text. Links whose target is
/// blank after trimming are skipped, since they cannot point at anything.
/// Line numbers are 1-indexed. Returns an empty list if the wikilink pattern
/// failed to compile.
fn extract_wikilinks(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  let Some(re) = WIKILINK_RE.as_ref() else {
    return Vec::new();
  };

  let mut links = Vec::new();

  for (line_num, line) in content.lines().enumerate() {
    // 1-indexed; saturates instead of overflowing on absurdly long inputs.
    let line_number =
      i32::try_from(line_num).unwrap_or(i32::MAX).saturating_add(1);

    for cap in re.captures_iter(line) {
      let (Some(full_match), Some(target_match)) = (cap.get(0), cap.get(1))
      else {
        continue;
      };

      // A `!` directly before `[[` makes this an embed, not a wikilink.
      let is_embed = line
        .get(..full_match.start())
        .is_some_and(|head| head.ends_with('!'));
      if is_embed {
        continue;
      }

      let target = target_match.as_str().trim();
      if target.is_empty() {
        // `[[ ]]` has nothing to resolve.
        continue;
      }

      let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
      let context = extract_context(
        content,
        line_num,
        full_match.start(),
        full_match.end(),
      );

      links.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target.to_string(),
        target_media_id: None, // Will be resolved later
        link_type: LinkType::Wikilink,
        link_text: Some(display_text.unwrap_or_else(|| target.to_string())),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  links
}

/// Extract embeds from content.
/// Matches: `![[target]]`
///
/// An optional `|display` part is captured as the link text; without it the
/// target doubles as the link text. Embeds whose target is blank after
/// trimming are skipped. Line numbers are 1-indexed. Returns an empty list if
/// the embed pattern failed to compile.
fn extract_embeds(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  let Some(re) = EMBED_RE.as_ref() else {
    return Vec::new();
  };

  let mut links = Vec::new();

  for (line_num, line) in content.lines().enumerate() {
    // 1-indexed; saturates instead of overflowing on absurdly long inputs.
    let line_number =
      i32::try_from(line_num).unwrap_or(i32::MAX).saturating_add(1);

    for cap in re.captures_iter(line) {
      let (Some(full_match), Some(target_match)) = (cap.get(0), cap.get(1))
      else {
        continue;
      };

      let target = target_match.as_str().trim();
      if target.is_empty() {
        // `![[ ]]` has nothing to resolve.
        continue;
      }

      let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
      let context = extract_context(
        content,
        line_num,
        full_match.start(),
        full_match.end(),
      );

      links.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target.to_string(),
        target_media_id: None,
        link_type: LinkType::Embed,
        link_text: Some(display_text.unwrap_or_else(|| target.to_string())),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  links
}

/// Returns `true` when `path` starts with one of the URL schemes that mark a
/// link as external (`http://`, `https://`, `mailto:`, `ftp://`).
///
/// The comparison ignores ASCII case because URL schemes are
/// case-insensitive, so `HTTPS://example.com` is external as well.
fn is_external_link(path: &str) -> bool {
  const EXTERNAL_PREFIXES: [&str; 4] =
    ["http://", "https://", "mailto:", "ftp://"];

  EXTERNAL_PREFIXES.iter().any(|prefix| {
    // `get` yields `None` when `path` is shorter than the prefix or the cut
    // would split a multi-byte character; neither can be a match.
    path
      .get(..prefix.len())
      .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
  })
}

/// Extract markdown links from content.
/// Matches: `[text](path)` but only for internal paths (no http/https)
///
/// Skipped entirely: images (`![alt](path)`), external links, anchor-only
/// links (`#section`) and links whose path is blank. A trailing `#anchor` is
/// removed from the stored target path. Line numbers are 1-indexed. Returns
/// an empty list if the markdown link pattern failed to compile.
fn extract_markdown_links(
  source_media_id: MediaId,
  content: &str,
) -> Vec<MarkdownLink> {
  let Some(re) = MARKDOWN_LINK_RE.as_ref() else {
    return Vec::new();
  };

  let mut links = Vec::new();

  for (line_num, line) in content.lines().enumerate() {
    // 1-indexed; saturates instead of overflowing on absurdly long inputs.
    let line_number =
      i32::try_from(line_num).unwrap_or(i32::MAX).saturating_add(1);

    for cap in re.captures_iter(line) {
      let (Some(full_match), Some(text_match), Some(path_match)) =
        (cap.get(0), cap.get(1), cap.get(2))
      else {
        continue;
      };

      // Skip markdown images: ![alt](image.png)
      // The character immediately before '[' is '!' in that case.
      let is_image = line
        .get(..full_match.start())
        .is_some_and(|head| head.ends_with('!'));
      if is_image {
        continue;
      }

      let text = text_match.as_str().trim();
      let path = path_match.as_str().trim();

      // Skip external links and anchor-only links
      if is_external_link(path) || path.starts_with('#') {
        continue;
      }

      // Remove any anchor from the path for resolution
      let target_path = path.split_once('#').map_or(path, |(file, _)| file);
      if target_path.is_empty() {
        // `[text]( )` has nothing to resolve.
        continue;
      }

      let context = extract_context(
        content,
        line_num,
        full_match.start(),
        full_match.end(),
      );

      links.push(MarkdownLink {
        id: Uuid::now_v7(),
        source_media_id,
        target_path: target_path.to_string(),
        target_media_id: None,
        link_type: LinkType::MarkdownLink,
        link_text: Some(text.to_string()),
        line_number: Some(line_number),
        context: Some(context),
        created_at: chrono::Utc::now(),
      });
    }
  }

  links
}

/// Extract surrounding context for a link.
///
/// * `content` - the whole document the link was found in.
/// * `line_num` - 0-indexed line (as produced by `str::lines`) holding the
///   link.
/// * `start` - byte offset of the link within that line.
/// * `_end` - byte offset just past the link; currently unused.
///
/// Returns an empty string when `line_num` is past the last line. Otherwise:
/// - a line shorter than 30 bytes that is not the first line is joined with
///   its trimmed neighbours and capped at
///   `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20` characters;
/// - a line of at most `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER` bytes is
///   returned unchanged;
/// - a longer line is cut to a window of at most
///   `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER` characters that begins at
///   most `CONTEXT_CHARS_BEFORE` characters before the link, so the link is
///   part of its own context even when it sits far into the line.
fn extract_context(
  content: &str,
  line_num: usize,
  start: usize,
  _end: usize,
) -> String {
  // Walk straight to the lines of interest instead of collecting every line
  // of the document for each link.
  let mut lines = content.lines().skip(line_num.saturating_sub(1));
  let prev = if line_num > 0 { lines.next() } else { None };
  let Some(line) = lines.next() else {
    return String::new();
  };

  // Get surrounding lines for context if the current line is short. `prev`
  // is only set when this is not the first line.
  if let Some(prev) = prev.filter(|_| line.len() < 30) {
    let next = lines.next().unwrap_or("");
    return format!("{} {} {}", prev.trim(), line.trim(), next.trim())
      .chars()
      .take(CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20)
      .collect();
  }

  if line.len() <= CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER {
    return line.to_string();
  }

  // Truncate long lines around the link. An offset that is out of range or
  // not on a character boundary falls back to the start of the line.
  let chars_before_link =
    line.get(..start).map_or(0, |head| head.chars().count());

  line
    .chars()
    .skip(chars_before_link.saturating_sub(CONTEXT_CHARS_BEFORE))
    .take(CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER)
    .collect()
}

/// Link resolution strategies for finding target media items.
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ResolutionStrategy { /// Direct path match DirectPath, /// Relative to source directory RelativeToSource, /// Filename with .md extension added FilenameWithMd, /// Filename-only search (Obsidian-style) FilenameOnly, } /// Resolve a link target to possible file paths. /// /// Returns a list of candidate paths to check, in order of preference. #[must_use] pub fn resolve_link_candidates( target: &str, source_path: &Path, root_dirs: &[std::path::PathBuf], ) -> Vec { let mut candidates = Vec::new(); // Clean up the target path let target = target.trim(); // 1. Direct path - if it looks like a path if target.contains('/') || target.contains('\\') { let direct = std::path::PathBuf::from(target); if direct.is_absolute() { candidates.push(direct); } else { // Relative to each root dir for root in root_dirs { candidates.push(root.join(&direct)); } } } // 2. Relative to source file's directory if let Some(source_dir) = source_path.parent() { let relative = source_dir.join(target); candidates.push(relative.clone()); // Also try with .md extension if !target.to_ascii_lowercase().ends_with(".md") { candidates.push(relative.with_extension("md")); let mut with_md = relative.clone(); with_md.set_file_name(format!( "{}.md", relative.file_name().unwrap_or_default().to_string_lossy() )); candidates.push(with_md); } } // 3. Filename with .md extension in root dirs let target_with_md = if target.to_ascii_lowercase().ends_with(".md") { target.to_string() } else { format!("{target}.md") }; for root in root_dirs { candidates.push(root.join(&target_with_md)); } // 4. Remove duplicates while preserving order let mut seen = std::collections::HashSet::new(); candidates.retain(|p| seen.insert(p.clone())); candidates } /// Extract frontmatter aliases from markdown content. /// /// Obsidian uses the `aliases` field in frontmatter to define alternative names /// for a note that can be used in wikilinks. 
#[must_use] pub fn extract_aliases(content: &str) -> Vec { let Ok(parsed) = gray_matter::Matter::::new().parse(content) else { return Vec::new(); }; let Some(data) = parsed.data else { return Vec::new(); }; let gray_matter::Pod::Hash(map) = data else { return Vec::new(); }; let Some(aliases) = map.get("aliases") else { return Vec::new(); }; match aliases { gray_matter::Pod::Array(arr) => { arr .iter() .filter_map(|a| { if let gray_matter::Pod::String(s) = a { Some(s.clone()) } else { None } }) .collect() }, gray_matter::Pod::String(s) => { // Single alias as string vec![s.clone()] }, _ => Vec::new(), } } #[cfg(test)] mod tests { use super::*; fn test_media_id() -> MediaId { MediaId(Uuid::nil()) } #[test] fn test_extract_simple_wikilink() { let content = "This is a [[simple link]] in text."; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 1); assert_eq!(links[0].target_path, "simple link"); assert_eq!(links[0].link_type, LinkType::Wikilink); assert_eq!(links[0].link_text, Some("simple link".to_string())); } #[test] fn test_extract_wikilink_with_display() { let content = "Check out [[target note|this article]] for more."; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 1); assert_eq!(links[0].target_path, "target note"); assert_eq!(links[0].link_text, Some("this article".to_string())); } #[test] fn test_extract_embed() { let content = "Here is an image: ![[image.png]]"; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 1); assert_eq!(links[0].target_path, "image.png"); assert_eq!(links[0].link_type, LinkType::Embed); } #[test] fn test_extract_markdown_link() { let content = "Read [the documentation](docs/README.md) for details."; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 1); assert_eq!(links[0].target_path, "docs/README.md"); assert_eq!(links[0].link_type, LinkType::MarkdownLink); assert_eq!(links[0].link_text, Some("the 
documentation".to_string())); } #[test] fn test_skip_external_links() { let content = "Visit [our site](https://example.com) or [email \ us](mailto:test@test.com)."; let links = extract_links(test_media_id(), content); assert!(links.is_empty()); } #[test] fn test_multiple_links() { let content = r" # My Note This links to [[Note A]] and also [[Note B|Note B Title]]. We also have a markdown link to [config](./config.md). And an embedded image: ![[diagram.png]] "; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 4); let types: Vec<_> = links.iter().map(|l| l.link_type).collect(); assert!(types.contains(&LinkType::Wikilink)); assert!(types.contains(&LinkType::Embed)); assert!(types.contains(&LinkType::MarkdownLink)); } #[test] fn test_line_numbers() { let content = "Line 1\n[[link on line 2]]\nLine 3"; let links = extract_links(test_media_id(), content); assert_eq!(links.len(), 1); assert_eq!(links[0].line_number, Some(2)); } #[test] fn test_resolve_candidates() { let source_path = std::path::Path::new("/notes/projects/readme.md"); let root_dirs = vec![std::path::PathBuf::from("/notes")]; let candidates = resolve_link_candidates("My Note", source_path, &root_dirs); // Should include relative path and .md variations assert!(!candidates.is_empty()); assert!( candidates .iter() .any(|p| p.to_string_lossy().contains("My Note.md")) ); } #[test] fn test_extract_aliases() { let content = r"--- title: My Note aliases: - Alternative Name - Another Alias --- # Content here "; let aliases = extract_aliases(content); assert_eq!(aliases, vec!["Alternative Name", "Another Alias"]); } #[test] fn test_extract_single_alias() { let content = r"--- title: My Note aliases: Single Alias --- # Content "; let aliases = extract_aliases(content); assert_eq!(aliases, vec!["Single Alias"]); } #[test] fn test_wikilink_not_matching_embed() { let content = "A wikilink [[note]] and an embed ![[image.png]]"; let links = extract_links(test_media_id(), content); 
assert_eq!(links.len(), 2); let wikilinks: Vec<_> = links .iter() .filter(|l| l.link_type == LinkType::Wikilink) .collect(); let embeds: Vec<_> = links .iter() .filter(|l| l.link_type == LinkType::Embed) .collect(); assert_eq!(wikilinks.len(), 1); assert_eq!(embeds.len(), 1); assert_eq!(wikilinks[0].target_path, "note"); assert_eq!(embeds[0].target_path, "image.png"); } #[test] fn test_exclude_markdown_images() { // Test that markdown images ![alt](image.png) are NOT extracted as links let content = r" # My Note Here's a regular link: [documentation](docs/guide.md) Here's an image: ![Screenshot](images/screenshot.png) Another link: [config](config.toml) Multiple images: ![Logo](logo.png) and ![Banner](banner.jpg) Mixed: [link](file.md) then ![image](pic.png) then [another](other.md) "; let links = extract_links(test_media_id(), content); // Should only extract the 4 markdown links, not the 4 images assert_eq!( links.len(), 4, "Should extract 4 links, not images. Got: {links:#?}" ); // Verify all extracted items are MarkdownLink type (not images) for link in &links { assert_eq!( link.link_type, LinkType::MarkdownLink, "Link '{}' should be MarkdownLink type", link.target_path ); } // Verify correct targets were extracted (links, not images) let targets: Vec<&str> = links.iter().map(|l| l.target_path.as_str()).collect(); assert!( targets.contains(&"docs/guide.md"), "Should contain docs/guide.md" ); assert!( targets.contains(&"config.toml"), "Should contain config.toml" ); assert!(targets.contains(&"file.md"), "Should contain file.md"); assert!(targets.contains(&"other.md"), "Should contain other.md"); // Verify images were NOT extracted assert!( !targets.contains(&"images/screenshot.png"), "Should NOT contain screenshot.png (it's an image)" ); assert!( !targets.contains(&"logo.png"), "Should NOT contain logo.png (it's an image)" ); assert!( !targets.contains(&"banner.jpg"), "Should NOT contain banner.jpg (it's an image)" ); assert!( !targets.contains(&"pic.png"), 
"Should NOT contain pic.png (it's an image)" ); } #[test] fn test_edge_case_image_at_line_start() { // Test edge case: image at the very start of a line let content = "![Image at start](start.png)\n[Link](file.md)"; let links = extract_links(test_media_id(), content); assert_eq!( links.len(), 1, "Should only extract the link, not the image" ); assert_eq!(links[0].target_path, "file.md"); assert_eq!(links[0].link_type, LinkType::MarkdownLink); } }