Signed-off-by: NotAShelf <raf@notashelf.dev> Change-Id: I9e0ff5ea33a5cf697473423e88f167ce6a6a6964
644 lines
17 KiB
Rust
644 lines
17 KiB
Rust
//! Markdown link extraction and management for Obsidian-style bidirectional
|
|
//! links.
|
|
//!
|
|
//! This module provides:
|
|
//! - Wikilink extraction (`[[target]]` and `[[target|display]]`)
|
|
//! - Embed extraction (`![[target]]`)
|
|
//! - Markdown link extraction (`[text](path)` for internal links)
|
|
//! - Link resolution strategies
|
|
//! - Context extraction for backlink previews
|
|
|
|
use std::{path::Path, sync::LazyLock};
|
|
|
|
use regex::Regex;
|
|
use uuid::Uuid;
|
|
|
|
use crate::model::{LinkType, MarkdownLink, MediaId};
|
|
|
|
// Compile regexes once at startup to avoid recompilation on every call.
|
|
// Stored as Option so that initialization failure is handled gracefully
|
|
// rather than panicking.
|
|
static WIKILINK_RE: LazyLock<Option<Regex>> =
|
|
LazyLock::new(|| Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok());
|
|
|
|
static EMBED_RE: LazyLock<Option<Regex>> =
|
|
LazyLock::new(|| Regex::new(r"!\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").ok());
|
|
|
|
static MARKDOWN_LINK_RE: LazyLock<Option<Regex>> =
|
|
LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").ok());
|
|
|
|
/// Context-extraction setting: character budget for text before a link.
const CONTEXT_CHARS_BEFORE: usize = 50;

/// Context-extraction setting: character budget for text after a link.
const CONTEXT_CHARS_AFTER: usize = 50;
|
|
|
|
/// Extract all markdown links from file content.
|
|
///
|
|
/// This extracts:
|
|
/// - Wikilinks: `[[target]]` and `[[target|display text]]`
|
|
/// - Embeds: `![[target]]`
|
|
/// - Markdown links: `[text](path)` (internal paths only, no http/https)
|
|
#[must_use]
|
|
pub fn extract_links(
|
|
source_media_id: MediaId,
|
|
content: &str,
|
|
) -> Vec<MarkdownLink> {
|
|
let mut links = Vec::new();
|
|
|
|
// Extract wikilinks: [[target]] or [[target|display]]
|
|
links.extend(extract_wikilinks(source_media_id, content));
|
|
|
|
// Extract embeds: ![[target]]
|
|
links.extend(extract_embeds(source_media_id, content));
|
|
|
|
// Extract markdown links: [text](path)
|
|
links.extend(extract_markdown_links(source_media_id, content));
|
|
|
|
links
|
|
}
|
|
|
|
/// Extract wikilinks from content.
|
|
/// Matches: `[[target]]` or `[[target|display text]]` but NOT `![[...]]`
|
|
/// (embeds)
|
|
fn extract_wikilinks(
|
|
source_media_id: MediaId,
|
|
content: &str,
|
|
) -> Vec<MarkdownLink> {
|
|
let Some(re) = WIKILINK_RE.as_ref() else {
|
|
return Vec::new();
|
|
};
|
|
let mut links = Vec::new();
|
|
|
|
for (line_num, line) in content.lines().enumerate() {
|
|
for cap in re.captures_iter(line) {
|
|
let Some(full_match) = cap.get(0) else {
|
|
continue;
|
|
};
|
|
let match_start = full_match.start();
|
|
|
|
// Check if preceded by ! (which would make it an embed, not a wikilink)
|
|
if match_start > 0 {
|
|
let bytes = line.as_bytes();
|
|
if bytes.get(match_start - 1) == Some(&b'!') {
|
|
continue; // Skip embeds
|
|
}
|
|
}
|
|
|
|
let Some(target_match) = cap.get(1) else {
|
|
continue;
|
|
};
|
|
let target = target_match.as_str().trim();
|
|
let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
|
|
|
|
let context = extract_context(
|
|
content,
|
|
line_num,
|
|
full_match.start(),
|
|
full_match.end(),
|
|
);
|
|
|
|
links.push(MarkdownLink {
|
|
id: Uuid::now_v7(),
|
|
source_media_id,
|
|
target_path: target.to_string(),
|
|
target_media_id: None, // Will be resolved later
|
|
link_type: LinkType::Wikilink,
|
|
link_text: display_text.or_else(|| Some(target.to_string())),
|
|
line_number: Some(
|
|
i32::try_from(line_num)
|
|
.unwrap_or(i32::MAX)
|
|
.saturating_add(1),
|
|
), // 1-indexed
|
|
context: Some(context),
|
|
created_at: chrono::Utc::now(),
|
|
});
|
|
}
|
|
}
|
|
|
|
links
|
|
}
|
|
|
|
/// Extract embeds from content.
|
|
/// Matches: `![[target]]`
|
|
fn extract_embeds(
|
|
source_media_id: MediaId,
|
|
content: &str,
|
|
) -> Vec<MarkdownLink> {
|
|
let Some(re) = EMBED_RE.as_ref() else {
|
|
return Vec::new();
|
|
};
|
|
let mut links = Vec::new();
|
|
|
|
for (line_num, line) in content.lines().enumerate() {
|
|
for cap in re.captures_iter(line) {
|
|
let Some(full_match) = cap.get(0) else {
|
|
continue;
|
|
};
|
|
let Some(target_match) = cap.get(1) else {
|
|
continue;
|
|
};
|
|
let target = target_match.as_str().trim();
|
|
let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
|
|
|
|
let context = extract_context(
|
|
content,
|
|
line_num,
|
|
full_match.start(),
|
|
full_match.end(),
|
|
);
|
|
|
|
links.push(MarkdownLink {
|
|
id: Uuid::now_v7(),
|
|
source_media_id,
|
|
target_path: target.to_string(),
|
|
target_media_id: None,
|
|
link_type: LinkType::Embed,
|
|
link_text: display_text.or_else(|| Some(target.to_string())),
|
|
line_number: Some(
|
|
i32::try_from(line_num)
|
|
.unwrap_or(i32::MAX)
|
|
.saturating_add(1),
|
|
),
|
|
context: Some(context),
|
|
created_at: chrono::Utc::now(),
|
|
});
|
|
}
|
|
}
|
|
|
|
links
|
|
}
|
|
|
|
/// Extract markdown links from content.
|
|
/// Matches: `[text](path)` but only for internal paths (no http/https)
|
|
fn extract_markdown_links(
|
|
source_media_id: MediaId,
|
|
content: &str,
|
|
) -> Vec<MarkdownLink> {
|
|
let Some(re) = MARKDOWN_LINK_RE.as_ref() else {
|
|
return Vec::new();
|
|
};
|
|
let mut links = Vec::new();
|
|
|
|
for (line_num, line) in content.lines().enumerate() {
|
|
for cap in re.captures_iter(line) {
|
|
let Some(full_match) = cap.get(0) else {
|
|
continue;
|
|
};
|
|
let match_start = full_match.start();
|
|
|
|
// Skip markdown images: 
|
|
// Check if the character immediately before '[' is '!'
|
|
if match_start > 0 && line.as_bytes().get(match_start - 1) == Some(&b'!')
|
|
{
|
|
continue;
|
|
}
|
|
|
|
let Some(text_match) = cap.get(1) else {
|
|
continue;
|
|
};
|
|
let Some(path_match) = cap.get(2) else {
|
|
continue;
|
|
};
|
|
let text = text_match.as_str().trim();
|
|
let path = path_match.as_str().trim();
|
|
|
|
// Skip external links
|
|
if path.starts_with("http://")
|
|
|| path.starts_with("https://")
|
|
|| path.starts_with("mailto:")
|
|
|| path.starts_with("ftp://")
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// Skip anchor-only links
|
|
if path.starts_with('#') {
|
|
continue;
|
|
}
|
|
|
|
// Remove any anchor from the path for resolution
|
|
let target_path = path.split('#').next().unwrap_or(path);
|
|
|
|
let context = extract_context(
|
|
content,
|
|
line_num,
|
|
full_match.start(),
|
|
full_match.end(),
|
|
);
|
|
|
|
links.push(MarkdownLink {
|
|
id: Uuid::now_v7(),
|
|
source_media_id,
|
|
target_path: target_path.to_string(),
|
|
target_media_id: None,
|
|
link_type: LinkType::MarkdownLink,
|
|
link_text: Some(text.to_string()),
|
|
line_number: Some(
|
|
i32::try_from(line_num)
|
|
.unwrap_or(i32::MAX)
|
|
.saturating_add(1),
|
|
),
|
|
context: Some(context),
|
|
created_at: chrono::Utc::now(),
|
|
});
|
|
}
|
|
}
|
|
|
|
links
|
|
}
|
|
|
|
/// Extract surrounding context for a link.
|
|
fn extract_context(
|
|
content: &str,
|
|
line_num: usize,
|
|
_start: usize,
|
|
_end: usize,
|
|
) -> String {
|
|
let lines: Vec<&str> = content.lines().collect();
|
|
if line_num >= lines.len() {
|
|
return String::new();
|
|
}
|
|
|
|
let line = lines[line_num];
|
|
let line_len = line.len();
|
|
|
|
// Get surrounding lines for context if the current line is short
|
|
if line_len < 30 && line_num > 0 {
|
|
// Include previous line
|
|
let prev = lines.get(line_num.saturating_sub(1)).unwrap_or(&"");
|
|
let next = lines.get(line_num + 1).unwrap_or(&"");
|
|
return format!("{} {} {}", prev.trim(), line.trim(), next.trim())
|
|
.chars()
|
|
.take(CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20)
|
|
.collect();
|
|
}
|
|
|
|
// Truncate long lines
|
|
if line_len > CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER {
|
|
line
|
|
.chars()
|
|
.take(CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER)
|
|
.collect()
|
|
} else {
|
|
line.to_string()
|
|
}
|
|
}
|
|
|
|
/// The ways a link target can be matched to a media item.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResolutionStrategy {
    /// The target is used as a path directly.
    DirectPath,
    /// The target is interpreted relative to the source file's directory.
    RelativeToSource,
    /// The target is a filename to which a `.md` extension is added.
    FilenameWithMd,
    /// The target is searched for by filename alone (Obsidian-style).
    FilenameOnly,
}
|
|
|
|
/// Resolve a link target to possible file paths.
///
/// Returns a list of candidate paths to check, in order of preference, with
/// duplicates removed (the first occurrence wins):
///
/// 1. If the target contains a path separator (`/` or `\`): the target itself
///    when absolute, otherwise the target joined onto each root directory.
/// 2. The target relative to the source file's directory, followed (when the
///    target does not already end in `.md`, case-insensitively) by the same
///    path with its extension replaced by `md` and with `.md` appended.
/// 3. The target with `.md` appended if needed, joined onto each root
///    directory.
///
/// The target is trimmed first. An empty or whitespace-only target names
/// nothing, so it yields no candidates at all.
#[must_use]
pub fn resolve_link_candidates(
    target: &str,
    source_path: &Path,
    root_dirs: &[std::path::PathBuf],
) -> Vec<std::path::PathBuf> {
    // Clean up the target path.
    let target = target.trim();

    // Without this guard an empty target would produce bogus candidates such
    // as the source directory itself or "<root>/.md".
    if target.is_empty() {
        return Vec::new();
    }

    // Case-insensitive ".md" suffix check without allocating. `get` returns
    // `None` if the cut would split a character, which cannot be a ".md" tail.
    let has_md_extension = target
        .get(target.len().saturating_sub(3)..)
        .is_some_and(|tail| tail.eq_ignore_ascii_case(".md"));

    let mut candidates = Vec::new();

    // 1. Direct path - if it looks like a path.
    if target.contains('/') || target.contains('\\') {
        let direct = std::path::PathBuf::from(target);
        if direct.is_absolute() {
            candidates.push(direct);
        } else {
            // Relative to each root dir.
            candidates.extend(root_dirs.iter().map(|root| root.join(&direct)));
        }
    }

    // 2. Relative to the source file's directory.
    if let Some(source_dir) = source_path.parent() {
        let relative = source_dir.join(target);
        if has_md_extension {
            candidates.push(relative);
        } else {
            // Two ".md" variants: one replaces an existing extension, the
            // other appends ".md" to the full file name (they differ for
            // names such as "v1.2 notes").
            let replaced = relative.with_extension("md");
            let mut appended = relative.clone();
            appended.set_file_name(format!(
                "{}.md",
                relative.file_name().unwrap_or_default().to_string_lossy()
            ));
            candidates.extend([relative, replaced, appended]);
        }
    }

    // 3. Filename with .md extension in root dirs.
    let target_with_md = if has_md_extension {
        target.to_string()
    } else {
        format!("{target}.md")
    };
    candidates.extend(root_dirs.iter().map(|root| root.join(&target_with_md)));

    // 4. Remove duplicates while preserving order.
    let mut seen = std::collections::HashSet::new();
    candidates.retain(|path| seen.insert(path.clone()));

    candidates
}
|
|
|
|
/// Extract frontmatter aliases from markdown content.
|
|
///
|
|
/// Obsidian uses the `aliases` field in frontmatter to define alternative names
|
|
/// for a note that can be used in wikilinks.
|
|
#[must_use]
|
|
pub fn extract_aliases(content: &str) -> Vec<String> {
|
|
let Ok(parsed) =
|
|
gray_matter::Matter::<gray_matter::engine::YAML>::new().parse(content)
|
|
else {
|
|
return Vec::new();
|
|
};
|
|
|
|
let Some(data) = parsed.data else {
|
|
return Vec::new();
|
|
};
|
|
|
|
let gray_matter::Pod::Hash(map) = data else {
|
|
return Vec::new();
|
|
};
|
|
|
|
let Some(aliases) = map.get("aliases") else {
|
|
return Vec::new();
|
|
};
|
|
|
|
match aliases {
|
|
gray_matter::Pod::Array(arr) => {
|
|
arr
|
|
.iter()
|
|
.filter_map(|a| {
|
|
if let gray_matter::Pod::String(s) = a {
|
|
Some(s.clone())
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
.collect()
|
|
},
|
|
gray_matter::Pod::String(s) => {
|
|
// Single alias as string
|
|
vec![s.clone()]
|
|
},
|
|
_ => Vec::new(),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed media id shared by all tests.
    fn test_media_id() -> MediaId {
        MediaId(Uuid::nil())
    }

    #[test]
    fn test_extract_simple_wikilink() {
        let content = "This is a [[simple link]] in text.";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target_path, "simple link");
        assert_eq!(links[0].link_type, LinkType::Wikilink);
        assert_eq!(links[0].link_text, Some("simple link".to_string()));
    }

    #[test]
    fn test_extract_wikilink_with_display() {
        let content = "Check out [[target note|this article]] for more.";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target_path, "target note");
        assert_eq!(links[0].link_text, Some("this article".to_string()));
    }

    #[test]
    fn test_extract_embed() {
        let content = "Here is an image: ![[image.png]]";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target_path, "image.png");
        assert_eq!(links[0].link_type, LinkType::Embed);
    }

    #[test]
    fn test_extract_markdown_link() {
        let content = "Read [the documentation](docs/README.md) for details.";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].target_path, "docs/README.md");
        assert_eq!(links[0].link_type, LinkType::MarkdownLink);
        assert_eq!(links[0].link_text, Some("the documentation".to_string()));
    }

    #[test]
    fn test_skip_external_links() {
        let content = "Visit [our site](https://example.com) or [email \
                       us](mailto:test@test.com).";
        let links = extract_links(test_media_id(), content);

        assert!(links.is_empty());
    }

    #[test]
    fn test_multiple_links() {
        let content = r"
# My Note

This links to [[Note A]] and also [[Note B|Note B Title]].

We also have a markdown link to [config](./config.md).

And an embedded image: ![[diagram.png]]
";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 4);

        let types: Vec<_> = links.iter().map(|l| l.link_type).collect();
        assert!(types.contains(&LinkType::Wikilink));
        assert!(types.contains(&LinkType::Embed));
        assert!(types.contains(&LinkType::MarkdownLink));
    }

    #[test]
    fn test_line_numbers() {
        let content = "Line 1\n[[link on line 2]]\nLine 3";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 1);
        assert_eq!(links[0].line_number, Some(2));
    }

    #[test]
    fn test_resolve_candidates() {
        let source_path = std::path::Path::new("/notes/projects/readme.md");
        let root_dirs = vec![std::path::PathBuf::from("/notes")];

        let candidates =
            resolve_link_candidates("My Note", source_path, &root_dirs);

        // Should include relative path and .md variations
        assert!(!candidates.is_empty());
        assert!(
            candidates
                .iter()
                .any(|p| p.to_string_lossy().contains("My Note.md"))
        );
    }

    #[test]
    fn test_extract_aliases() {
        let content = r"---
title: My Note
aliases:
  - Alternative Name
  - Another Alias
---

# Content here
";
        let aliases = extract_aliases(content);
        assert_eq!(aliases, vec!["Alternative Name", "Another Alias"]);
    }

    #[test]
    fn test_extract_single_alias() {
        let content = r"---
title: My Note
aliases: Single Alias
---

# Content
";
        let aliases = extract_aliases(content);
        assert_eq!(aliases, vec!["Single Alias"]);
    }

    #[test]
    fn test_wikilink_not_matching_embed() {
        let content = "A wikilink [[note]] and an embed ![[image.png]]";
        let links = extract_links(test_media_id(), content);

        assert_eq!(links.len(), 2);
        let wikilinks: Vec<_> = links
            .iter()
            .filter(|l| l.link_type == LinkType::Wikilink)
            .collect();
        let embeds: Vec<_> = links
            .iter()
            .filter(|l| l.link_type == LinkType::Embed)
            .collect();

        assert_eq!(wikilinks.len(), 1);
        assert_eq!(embeds.len(), 1);
        assert_eq!(wikilinks[0].target_path, "note");
        assert_eq!(embeds[0].target_path, "image.png");
    }

    #[test]
    fn test_exclude_markdown_images() {
        // Test that markdown images ![alt](path) are NOT extracted as links.
        // The fixture must actually contain the four images named in the
        // negative assertions below, otherwise this test proves nothing.
        let content = r"
# My Note

Here's a regular link: [documentation](docs/guide.md)

Here's an image: ![Screenshot](images/screenshot.png)

Another link: [config](config.toml)

Multiple images:
![Logo](logo.png) and ![Banner](banner.jpg)

Mixed: [link](file.md) then ![pic](pic.png) then [another](other.md)
";
        let links = extract_links(test_media_id(), content);

        // Should only extract the 4 markdown links, not the 4 images
        assert_eq!(
            links.len(),
            4,
            "Should extract 4 links, not images. Got: {links:#?}"
        );

        // Verify all extracted items are MarkdownLink type (not images)
        for link in &links {
            assert_eq!(
                link.link_type,
                LinkType::MarkdownLink,
                "Link '{}' should be MarkdownLink type",
                link.target_path
            );
        }

        // Verify correct targets were extracted (links, not images)
        let targets: Vec<&str> =
            links.iter().map(|l| l.target_path.as_str()).collect();
        assert!(
            targets.contains(&"docs/guide.md"),
            "Should contain docs/guide.md"
        );
        assert!(
            targets.contains(&"config.toml"),
            "Should contain config.toml"
        );
        assert!(targets.contains(&"file.md"), "Should contain file.md");
        assert!(targets.contains(&"other.md"), "Should contain other.md");

        // Verify images were NOT extracted
        assert!(
            !targets.contains(&"images/screenshot.png"),
            "Should NOT contain screenshot.png (it's an image)"
        );
        assert!(
            !targets.contains(&"logo.png"),
            "Should NOT contain logo.png (it's an image)"
        );
        assert!(
            !targets.contains(&"banner.jpg"),
            "Should NOT contain banner.jpg (it's an image)"
        );
        assert!(
            !targets.contains(&"pic.png"),
            "Should NOT contain pic.png (it's an image)"
        );
    }

    #[test]
    fn test_edge_case_image_at_line_start() {
        // Test edge case: image at the very start of a line
        let content = "![Image](image.png)\n[Link](file.md)";
        let links = extract_links(test_media_id(), content);

        assert_eq!(
            links.len(),
            1,
            "Should only extract the link, not the image"
        );
        assert_eq!(links[0].target_path, "file.md");
        assert_eq!(links[0].link_type, LinkType::MarkdownLink);
    }
}
|