various: markdown improvements

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I81fda8247814da19eed1e76dbe97bd5b6a6a6964
2026-02-05 15:39:05 +03:00 · 2026-02-05 15:39:05 +03:00 · 80a8b5c7ca
commit 80a8b5c7ca
parent 875bdf5ebc
23 changed files with 3458 additions and 30 deletions
--- a/crates/pinakes-core/src/links.rs
+++ b/crates/pinakes-core/src/links.rs
@@ -0,0 +1,456 @@
+//! Markdown link extraction and management for Obsidian-style bidirectional links.
+//!
+//! This module provides:
+//! - Wikilink extraction (`[[target]]` and `[[target|display]]`)
+//! - Embed extraction (`![[target]]`)
+//! - Markdown link extraction (`[text](path)` for internal links)
+//! - Link resolution strategies
+//! - Context extraction for backlink previews
+
+use std::path::Path;
+
+use regex::Regex;
+use uuid::Uuid;
+
+use crate::error::Result;
+use crate::model::{LinkType, MarkdownLink, MediaId};
+
+/// Character budget nominally reserved for the text preceding a link.
+///
+/// `extract_context` adds this to `CONTEXT_CHARS_AFTER` to cap the length of a context snippet.
+const CONTEXT_CHARS_BEFORE: usize = 50;
+/// Character budget nominally reserved for the text following a link.
+///
+/// `extract_context` adds this to `CONTEXT_CHARS_BEFORE` to cap the length of a context snippet.
+const CONTEXT_CHARS_AFTER: usize = 50;
+
+/// Collect every link found in a markdown document.
+///
+/// The result is the concatenation of three passes over `content`, in this order:
+/// 1. Wikilinks: `[[target]]` and `[[target|display text]]`
+/// 2. Embeds: `![[target]]`
+/// 3. Markdown links: `[text](path)` (internal paths only, no http/https)
+///
+/// `source_media_id` is handed to each pass and recorded as the source of every link.
+pub fn extract_links(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
+    // Seed the result with the wikilink pass, then append the other two passes in order.
+    let mut found = extract_wikilinks(source_media_id, content);
+    found.extend(extract_embeds(source_media_id, content));
+    found.extend(extract_markdown_links(source_media_id, content));
+    found
+}
+
+/// Extract wikilinks from content.
+///
+/// Matches `[[target]]` and `[[target|display text]]`, but NOT `![[...]]` (embeds).
+/// The search runs one line at a time, so a link broken across lines is not found.
+///
+/// For every link:
+/// - `target_path` is the trimmed target; a link whose target is blank after trimming
+///   (e.g. `[[ ]]`) is skipped because it cannot refer to anything.
+/// - `link_text` is the trimmed display text when present, otherwise the target.
+/// - `line_number` is 1-indexed, or `None` if the line number does not fit in an `i32`.
+/// - `target_media_id` is left as `None`; resolution happens later.
+fn extract_wikilinks(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
+    // Matches every [[...]]; embeds (preceded by `!`) are filtered out manually below.
+    let re = Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").unwrap();
+    let mut links = Vec::new();
+
+    for (line_num, line) in content.lines().enumerate() {
+        for cap in re.captures_iter(line) {
+            let full_match = cap.get(0).unwrap();
+
+            // A `!` directly before the brackets makes this an embed, not a wikilink.
+            if line[..full_match.start()].ends_with('!') {
+                continue;
+            }
+
+            let target = cap.get(1).unwrap().as_str().trim();
+            // The pattern accepts whitespace-only targets; those name nothing.
+            if target.is_empty() {
+                continue;
+            }
+            let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
+
+            let context = extract_context(content, line_num, full_match.start(), full_match.end());
+
+            links.push(MarkdownLink {
+                id: Uuid::now_v7(),
+                source_media_id,
+                target_path: target.to_string(),
+                target_media_id: None, // Will be resolved later
+                link_type: LinkType::Wikilink,
+                link_text: display_text.or_else(|| Some(target.to_string())),
+                // 1-indexed; checked conversion instead of a silently truncating cast
+                line_number: i32::try_from(line_num + 1).ok(),
+                context: Some(context),
+                created_at: chrono::Utc::now(),
+            });
+        }
+    }
+
+    links
+}
+
+/// Extract embeds from content.
+///
+/// Matches `![[target]]` and `![[target|display text]]`, one line at a time.
+///
+/// For every embed:
+/// - `target_path` is the trimmed target; an embed whose target is blank after trimming
+///   (e.g. `![[ ]]`) is skipped because it cannot refer to anything.
+/// - `link_text` is the trimmed display text when present, otherwise the target.
+/// - `line_number` is 1-indexed, or `None` if the line number does not fit in an `i32`.
+/// - `target_media_id` is left as `None`.
+fn extract_embeds(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
+    let re = Regex::new(r"!\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").unwrap();
+    let mut links = Vec::new();
+
+    for (line_num, line) in content.lines().enumerate() {
+        for cap in re.captures_iter(line) {
+            let full_match = cap.get(0).unwrap();
+            let target = cap.get(1).unwrap().as_str().trim();
+            // The pattern accepts whitespace-only targets; those name nothing.
+            if target.is_empty() {
+                continue;
+            }
+            let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
+
+            let context = extract_context(content, line_num, full_match.start(), full_match.end());
+
+            links.push(MarkdownLink {
+                id: Uuid::now_v7(),
+                source_media_id,
+                target_path: target.to_string(),
+                target_media_id: None,
+                link_type: LinkType::Embed,
+                link_text: display_text.or_else(|| Some(target.to_string())),
+                // 1-indexed; checked conversion instead of a silently truncating cast
+                line_number: i32::try_from(line_num + 1).ok(),
+                context: Some(context),
+                created_at: chrono::Utc::now(),
+            });
+        }
+    }
+
+    links
+}
+
+/// Extract markdown links from content.
+///
+/// Matches `[text](path)`, one line at a time, and keeps only internal paths:
+/// - paths starting with `http://`, `https://`, `mailto:` or `ftp://` are skipped;
+/// - anchor-only paths (`#section`) are skipped;
+/// - paths that are blank after trimming (e.g. `[text]( )`) are skipped.
+///
+/// For every remaining link, `target_path` is the path with any `#anchor` suffix removed,
+/// `link_text` is the trimmed bracket text, and `line_number` is 1-indexed (`None` if it
+/// does not fit in an `i32`). `target_media_id` is left as `None`.
+fn extract_markdown_links(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
+    // Matches any [text](path); external and anchor-only paths are filtered out below.
+    let re = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap();
+    let mut links = Vec::new();
+
+    for (line_num, line) in content.lines().enumerate() {
+        for cap in re.captures_iter(line) {
+            let full_match = cap.get(0).unwrap();
+            let text = cap.get(1).unwrap().as_str().trim();
+            let path = cap.get(2).unwrap().as_str().trim();
+
+            // The pattern accepts whitespace-only paths; those name nothing.
+            if path.is_empty() {
+                continue;
+            }
+
+            // Skip external links
+            if path.starts_with("http://")
+                || path.starts_with("https://")
+                || path.starts_with("mailto:")
+                || path.starts_with("ftp://")
+            {
+                continue;
+            }
+
+            // Skip anchor-only links
+            if path.starts_with('#') {
+                continue;
+            }
+
+            // Remove any anchor from the path for resolution
+            let target_path = path.split('#').next().unwrap_or(path);
+
+            let context = extract_context(content, line_num, full_match.start(), full_match.end());
+
+            links.push(MarkdownLink {
+                id: Uuid::now_v7(),
+                source_media_id,
+                target_path: target_path.to_string(),
+                target_media_id: None,
+                link_type: LinkType::MarkdownLink,
+                link_text: Some(text.to_string()),
+                // 1-indexed; checked conversion instead of a silently truncating cast
+                line_number: i32::try_from(line_num + 1).ok(),
+                context: Some(context),
+                created_at: chrono::Utc::now(),
+            });
+        }
+    }
+
+    links
+}
+
+/// Build the context snippet stored alongside a link.
+///
+/// * `content` - the full document text.
+/// * `line_num` - 0-indexed line (as yielded by `content.lines()`) that contains the link.
+/// * `start` - byte offset of the link within that line.
+/// * `_end` - byte offset just past the link; currently unused.
+///
+/// Behaviour:
+/// - Returns an empty string when `line_num` is past the last line.
+/// - For a short line (under 30 bytes) that is not the first line, returns the trimmed
+///   previous, current and next lines joined by single spaces, capped at
+///   `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20` characters.
+/// - Otherwise returns the whole line when it has at most
+///   `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER` characters. A longer line is cut to that
+///   many characters, beginning at most `CONTEXT_CHARS_BEFORE` characters ahead of the
+///   link, so the start of the link is always part of the snippet.
+fn extract_context(content: &str, line_num: usize, start: usize, _end: usize) -> String {
+    let max_chars = CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER;
+
+    // Walk straight to the neighbourhood of the link instead of collecting every line.
+    let mut nearby = content.lines().skip(line_num.saturating_sub(1));
+    let prev = if line_num > 0 {
+        nearby.next().unwrap_or("")
+    } else {
+        ""
+    };
+    let line = match nearby.next() {
+        Some(line) => line,
+        None => return String::new(),
+    };
+
+    // Get surrounding lines for context if the current line is short
+    if line.len() < 30 && line_num > 0 {
+        let next = nearby.next().unwrap_or("");
+        return format!("{} {} {}", prev.trim(), line.trim(), next.trim())
+            .chars()
+            .take(max_chars + 20)
+            .collect();
+    }
+
+    // The whole line fits: keep it untouched.
+    if line.chars().count() <= max_chars {
+        return line.to_string();
+    }
+
+    // Long line: slide the window so it opens shortly before the link rather than
+    // always showing the beginning of the line. Clamp the offset and back up to a
+    // character boundary so slicing can never panic on an unexpected offset.
+    let mut link_start = start.min(line.len());
+    while !line.is_char_boundary(link_start) {
+        link_start -= 1;
+    }
+    let chars_before_link = line[..link_start].chars().count();
+    let window_start = chars_before_link.saturating_sub(CONTEXT_CHARS_BEFORE);
+
+    line.chars().skip(window_start).take(max_chars).collect()
+}
+
+/// Link resolution strategies for finding target media items.
+///
+/// A plain, copyable tag type: it derives `Debug`, `Clone`, `Copy`, `PartialEq` and `Eq`
+/// and carries no data.
+///
+/// NOTE(review): nothing in this part of the file constructs or matches on this enum;
+/// `resolve_link_candidates` builds its candidate list without referring to it. Confirm
+/// where it is consumed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ResolutionStrategy {
+    /// Direct path match
+    DirectPath,
+    /// Relative to source directory
+    RelativeToSource,
+    /// Filename with .md extension added
+    FilenameWithMd,
+    /// Filename-only search (Obsidian-style)
+    FilenameOnly,
+}
+
+/// Resolve a link target to possible file paths.
+///
+/// Returns a list of candidate paths to check, in order of preference, with duplicates
+/// removed (the first occurrence wins). No filesystem access happens here; callers decide
+/// which candidate actually exists.
+///
+/// * `target` - the link target as written in the document; surrounding whitespace is
+///   ignored. A blank target yields no candidates.
+/// * `source_path` - path of the file containing the link.
+/// * `root_dirs` - library roots to resolve against.
+///
+/// Candidates are produced in this order:
+/// 1. If the target contains a path separator: the target itself when absolute,
+///    otherwise the target joined onto each root.
+/// 2. The target relative to the source file's directory, followed by the same path with
+///    `.md` appended to its file name (unless the target already ends in `.md`).
+/// 3. The target with `.md` appended (if not already present) joined onto each root.
+///
+/// `.md` is always appended, never substituted for an existing dotted suffix, so a target
+/// such as `v1.2` produces `v1.2.md` and not `v1.md`.
+///
+/// Paths are not normalised: a target containing `..` can yield candidates outside
+/// `root_dirs`, so callers that need confinement must check for it themselves.
+pub fn resolve_link_candidates(
+    target: &str,
+    source_path: &Path,
+    root_dirs: &[std::path::PathBuf],
+) -> Vec<std::path::PathBuf> {
+    // Clean up the target path; a blank target cannot name any file.
+    let target = target.trim();
+    if target.is_empty() {
+        return Vec::new();
+    }
+
+    let mut candidates = Vec::new();
+
+    // 1. Direct path - if it looks like a path
+    if target.contains('/') || target.contains('\\') {
+        let direct = Path::new(target);
+        if direct.is_absolute() {
+            candidates.push(direct.to_path_buf());
+        } else {
+            // Relative to each root dir
+            candidates.extend(root_dirs.iter().map(|root| root.join(direct)));
+        }
+    }
+
+    // 2. Relative to source file's directory
+    if let Some(source_dir) = source_path.parent() {
+        let relative = source_dir.join(target);
+
+        // Also try with `.md` appended to the file name. Targets without a file name
+        // (for example ones ending in `..`) get no such variant.
+        let with_md = if target.ends_with(".md") {
+            None
+        } else {
+            relative.file_name().map(|name| {
+                let mut name = name.to_os_string();
+                name.push(".md");
+                relative.with_file_name(name)
+            })
+        };
+
+        candidates.push(relative);
+        candidates.extend(with_md);
+    }
+
+    // 3. Filename with .md extension in root dirs
+    let target_with_md = if target.ends_with(".md") {
+        target.to_string()
+    } else {
+        format!("{}.md", target)
+    };
+    candidates.extend(root_dirs.iter().map(|root| root.join(&target_with_md)));
+
+    // 4. Remove duplicates while preserving order
+    let mut seen = std::collections::HashSet::new();
+    candidates.retain(|p| seen.insert(p.clone()));
+
+    candidates
+}
+
+/// Read the `aliases` entry from a note's YAML frontmatter.
+///
+/// Obsidian uses the `aliases` field in frontmatter to define alternative names
+/// for a note that can be used in wikilinks.
+///
+/// Accepted shapes:
+/// - a list: every string element is returned, non-string elements are dropped;
+/// - a single string: returned as a one-element list.
+///
+/// Anything else yields an empty list: frontmatter that fails to parse or is missing,
+/// frontmatter that is not a mapping, no `aliases` key, or an `aliases` value of another
+/// type. The function never returns `Err` as written.
+pub fn extract_aliases(content: &str) -> Result<Vec<String>> {
+    let matter = gray_matter::Matter::<gray_matter::engine::YAML>::new();
+    // Parse failures are deliberately folded into "no frontmatter".
+    let front_matter = matter.parse(content).ok().and_then(|doc| doc.data);
+
+    let aliases = match front_matter {
+        Some(gray_matter::Pod::Hash(fields)) => match fields.get("aliases") {
+            Some(gray_matter::Pod::Array(items)) => items
+                .iter()
+                .filter_map(|item| match item {
+                    gray_matter::Pod::String(alias) => Some(alias.clone()),
+                    _ => None,
+                })
+                .collect(),
+            // Single alias as string
+            Some(gray_matter::Pod::String(alias)) => vec![alias.clone()],
+            _ => Vec::new(),
+        },
+        _ => Vec::new(),
+    };
+
+    Ok(aliases)
+}
+
+// Unit tests for link extraction, candidate resolution and alias parsing.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Fixed source id (nil UUID) shared by all extraction tests.
+    fn test_media_id() -> MediaId {
+        MediaId(Uuid::nil())
+    }
+
+    // A bare `[[target]]` yields one wikilink whose text falls back to the target.
+    #[test]
+    fn test_extract_simple_wikilink() {
+        let content = "This is a [[simple link]] in text.";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 1);
+        assert_eq!(links[0].target_path, "simple link");
+        assert_eq!(links[0].link_type, LinkType::Wikilink);
+        assert_eq!(links[0].link_text, Some("simple link".to_string()));
+    }
+
+    // `[[target|display]]` splits into target path and display text.
+    #[test]
+    fn test_extract_wikilink_with_display() {
+        let content = "Check out [[target note|this article]] for more.";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 1);
+        assert_eq!(links[0].target_path, "target note");
+        assert_eq!(links[0].link_text, Some("this article".to_string()));
+    }
+
+    // `![[target]]` is reported once, as an embed and not also as a wikilink.
+    #[test]
+    fn test_extract_embed() {
+        let content = "Here is an image: ![[image.png]]";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 1);
+        assert_eq!(links[0].target_path, "image.png");
+        assert_eq!(links[0].link_type, LinkType::Embed);
+    }
+
+    // `[text](path)` with an internal path yields a markdown link.
+    #[test]
+    fn test_extract_markdown_link() {
+        let content = "Read [the documentation](docs/README.md) for details.";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 1);
+        assert_eq!(links[0].target_path, "docs/README.md");
+        assert_eq!(links[0].link_type, LinkType::MarkdownLink);
+        assert_eq!(links[0].link_text, Some("the documentation".to_string()));
+    }
+
+    // `https://` and `mailto:` targets are ignored.
+    #[test]
+    fn test_skip_external_links() {
+        let content = "Visit [our site](https://example.com) or [email us](mailto:test@test.com).";
+        let links = extract_links(test_media_id(), content);
+
+        assert!(links.is_empty());
+    }
+
+    // A document mixing all three link kinds: two wikilinks, one markdown link, one embed.
+    #[test]
+    fn test_multiple_links() {
+        let content = r#"
+# My Note
+
+This links to [[Note A]] and also [[Note B|Note B Title]].
+
+We also have a markdown link to [config](./config.md).
+
+And an embedded image: ![[diagram.png]]
+"#;
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 4);
+
+        let types: Vec<_> = links.iter().map(|l| l.link_type).collect();
+        assert!(types.contains(&LinkType::Wikilink));
+        assert!(types.contains(&LinkType::Embed));
+        assert!(types.contains(&LinkType::MarkdownLink));
+    }
+
+    // Line numbers are reported 1-indexed.
+    #[test]
+    fn test_line_numbers() {
+        let content = "Line 1\n[[link on line 2]]\nLine 3";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 1);
+        assert_eq!(links[0].line_number, Some(2));
+    }
+
+    // A bare note name produces at least one candidate with `.md` appended.
+    #[test]
+    fn test_resolve_candidates() {
+        let source_path = std::path::Path::new("/notes/projects/readme.md");
+        let root_dirs = vec![std::path::PathBuf::from("/notes")];
+
+        let candidates = resolve_link_candidates("My Note", source_path, &root_dirs);
+
+        // Should include relative path and .md variations
+        assert!(!candidates.is_empty());
+        assert!(candidates
+            .iter()
+            .any(|p| p.to_string_lossy().contains("My Note.md")));
+    }
+
+    // `aliases` given as a YAML list returns every entry in order.
+    #[test]
+    fn test_extract_aliases() {
+        let content = r#"---
+title: My Note
+aliases:
+  - Alternative Name
+  - Another Alias
+---
+
+# Content here
+"#;
+        let aliases = extract_aliases(content).unwrap();
+        assert_eq!(aliases, vec!["Alternative Name", "Another Alias"]);
+    }
+
+    // `aliases` given as a single string returns a one-element list.
+    #[test]
+    fn test_extract_single_alias() {
+        let content = r#"---
+title: My Note
+aliases: Single Alias
+---
+
+# Content
+"#;
+        let aliases = extract_aliases(content).unwrap();
+        assert_eq!(aliases, vec!["Single Alias"]);
+    }
+
+    // A wikilink and an embed on the same line are each counted once, with the right type.
+    #[test]
+    fn test_wikilink_not_matching_embed() {
+        let content = "A wikilink [[note]] and an embed ![[image.png]]";
+        let links = extract_links(test_media_id(), content);
+
+        assert_eq!(links.len(), 2);
+        let wikilinks: Vec<_> = links
+            .iter()
+            .filter(|l| l.link_type == LinkType::Wikilink)
+            .collect();
+        let embeds: Vec<_> = links
+            .iter()
+            .filter(|l| l.link_type == LinkType::Embed)
+            .collect();
+
+        assert_eq!(wikilinks.len(), 1);
+        assert_eq!(embeds.len(), 1);
+        assert_eq!(wikilinks[0].target_path, "note");
+        assert_eq!(embeds[0].target_path, "image.png");
+    }
+}