various: markdown improvements

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I81fda8247814da19eed1e76dbe97bd5b6a6a6964
This commit is contained in:
raf 2026-02-05 15:39:05 +03:00
commit 80a8b5c7ca
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
23 changed files with 3458 additions and 30 deletions

View file

@ -0,0 +1,456 @@
//! Markdown link extraction and management for Obsidian-style bidirectional links.
//!
//! This module provides:
//! - Wikilink extraction (`[[target]]` and `[[target|display]]`)
//! - Embed extraction (`![[target]]`)
//! - Markdown link extraction (`[text](path)` for internal links)
//! - Link resolution strategies
//! - Context extraction for backlink previews
use std::path::Path;
use regex::Regex;
use uuid::Uuid;
use crate::error::Result;
use crate::model::{LinkType, MarkdownLink, MediaId};
/// Character budget for the text that precedes a link in a context snippet.
///
/// Read by `extract_context` when deciding how much of a line to keep.
const CONTEXT_CHARS_BEFORE: usize = 50;
/// Character budget for the text that follows a link in a context snippet.
///
/// Read by `extract_context` when deciding how much of a line to keep.
const CONTEXT_CHARS_AFTER: usize = 50;
/// Collect every link found in `content`.
///
/// Three passes run over the text and their results are concatenated in this
/// order:
/// 1. wikilinks: `[[target]]` and `[[target|display text]]`
/// 2. embeds: `![[target]]`
/// 3. markdown links: `[text](path)` (internal paths only, no http/https)
///
/// Every returned link carries `source_media_id` as its source.
pub fn extract_links(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
    let wikilinks = extract_wikilinks(source_media_id, content);
    let embeds = extract_embeds(source_media_id, content);
    let markdown_links = extract_markdown_links(source_media_id, content);

    wikilinks
        .into_iter()
        .chain(embeds)
        .chain(markdown_links)
        .collect()
}
/// Extract wikilinks from content.
/// Matches: `[[target]]` or `[[target|display text]]` but NOT `![[...]]` (embeds)
fn extract_wikilinks(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
// Match [[...]] - we'll manually filter out embeds that are preceded by !
let re = Regex::new(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").unwrap();
let mut links = Vec::new();
for (line_num, line) in content.lines().enumerate() {
for cap in re.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let match_start = full_match.start();
// Check if preceded by ! (which would make it an embed, not a wikilink)
if match_start > 0 {
let bytes = line.as_bytes();
if bytes.get(match_start - 1) == Some(&b'!') {
continue; // Skip embeds
}
}
let target = cap.get(1).unwrap().as_str().trim();
let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
let context = extract_context(content, line_num, full_match.start(), full_match.end());
links.push(MarkdownLink {
id: Uuid::now_v7(),
source_media_id,
target_path: target.to_string(),
target_media_id: None, // Will be resolved later
link_type: LinkType::Wikilink,
link_text: display_text.or_else(|| Some(target.to_string())),
line_number: Some(line_num as i32 + 1), // 1-indexed
context: Some(context),
created_at: chrono::Utc::now(),
});
}
}
links
}
/// Extract embeds from content.
/// Matches: `![[target]]`
fn extract_embeds(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
let re = Regex::new(r"!\[\[([^\]|]+)(?:\|([^\]]+))?\]\]").unwrap();
let mut links = Vec::new();
for (line_num, line) in content.lines().enumerate() {
for cap in re.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let target = cap.get(1).unwrap().as_str().trim();
let display_text = cap.get(2).map(|m| m.as_str().trim().to_string());
let context = extract_context(content, line_num, full_match.start(), full_match.end());
links.push(MarkdownLink {
id: Uuid::now_v7(),
source_media_id,
target_path: target.to_string(),
target_media_id: None,
link_type: LinkType::Embed,
link_text: display_text.or_else(|| Some(target.to_string())),
line_number: Some(line_num as i32 + 1),
context: Some(context),
created_at: chrono::Utc::now(),
});
}
}
links
}
/// Extract markdown links from content.
/// Matches: `[text](path)` but only for internal paths (no http/https)
fn extract_markdown_links(source_media_id: MediaId, content: &str) -> Vec<MarkdownLink> {
// Match [text](path) where path doesn't start with http:// or https://
let re = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap();
let mut links = Vec::new();
for (line_num, line) in content.lines().enumerate() {
for cap in re.captures_iter(line) {
let full_match = cap.get(0).unwrap();
let text = cap.get(1).unwrap().as_str().trim();
let path = cap.get(2).unwrap().as_str().trim();
// Skip external links
if path.starts_with("http://")
|| path.starts_with("https://")
|| path.starts_with("mailto:")
|| path.starts_with("ftp://")
{
continue;
}
// Skip anchor-only links
if path.starts_with('#') {
continue;
}
// Remove any anchor from the path for resolution
let target_path = path.split('#').next().unwrap_or(path);
let context = extract_context(content, line_num, full_match.start(), full_match.end());
links.push(MarkdownLink {
id: Uuid::now_v7(),
source_media_id,
target_path: target_path.to_string(),
target_media_id: None,
link_type: LinkType::MarkdownLink,
link_text: Some(text.to_string()),
line_number: Some(line_num as i32 + 1),
context: Some(context),
created_at: chrono::Utc::now(),
});
}
}
links
}
/// Extract surrounding context for a link.
///
/// * `content`: the full document text.
/// * `line_num`: 0-indexed line that holds the link.
/// * `start`, `end`: byte offsets of the link within that line. Offsets past
///   the end of the line, or not on a character boundary, are clamped rather
///   than causing a panic.
///
/// Returns:
/// * an empty string when `line_num` is past the last line;
/// * for a line shorter than 30 bytes that is not the first line: the
///   previous, current and next lines, each trimmed and joined by single
///   spaces, capped at `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20`
///   characters;
/// * for a line longer than `CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER`
///   bytes: the link itself plus up to `CONTEXT_CHARS_BEFORE` characters
///   before it and up to `CONTEXT_CHARS_AFTER` characters after it;
/// * otherwise the line unchanged.
fn extract_context(content: &str, line_num: usize, start: usize, end: usize) -> String {
    // Walk straight to the lines of interest instead of collecting every line
    // of the document once per link.
    let mut lines = content.lines().skip(line_num.saturating_sub(1));
    let prev = if line_num > 0 { lines.next() } else { None };
    let line = match lines.next() {
        Some(line) => line,
        None => return String::new(),
    };

    // Short line (and not the first one): widen the context to the neighbours.
    if line.len() < 30 {
        if let Some(prev) = prev {
            let next = lines.next().unwrap_or("");
            return format!("{} {} {}", prev.trim(), line.trim(), next.trim())
                .chars()
                .take(CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER + 20)
                .collect();
        }
    }

    if line.len() <= CONTEXT_CHARS_BEFORE + CONTEXT_CHARS_AFTER {
        return line.to_string();
    }

    // Long line: keep a window around the link. Truncating from the start of
    // the line could drop the link itself from the preview.
    let mut start = start.min(line.len());
    while !line.is_char_boundary(start) {
        start -= 1;
    }
    let mut end = end.max(start).min(line.len());
    while !line.is_char_boundary(end) {
        end += 1;
    }

    // Byte index of the earliest of the last CONTEXT_CHARS_BEFORE characters
    // preceding the link (or the link start when there are none).
    let window_start = line[..start]
        .char_indices()
        .rev()
        .take(CONTEXT_CHARS_BEFORE)
        .last()
        .map_or(start, |(idx, _)| idx);
    // Byte index just past the first CONTEXT_CHARS_AFTER characters following
    // the link (or the end of the line when fewer remain).
    let window_end = line[end..]
        .char_indices()
        .nth(CONTEXT_CHARS_AFTER)
        .map_or(line.len(), |(idx, _)| end + idx);

    line[window_start..window_end].to_string()
}
/// Link resolution strategies for finding target media items.
///
/// NOTE(review): nothing in the visible part of this file constructs or
/// matches on this enum; the variants look like labels for the kinds of
/// lookup a resolver may attempt. Confirm against the callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResolutionStrategy {
/// Direct path match
DirectPath,
/// Relative to source directory
RelativeToSource,
/// Filename with .md extension added
FilenameWithMd,
/// Filename-only search (Obsidian-style)
FilenameOnly,
}
/// Resolve a link target to possible file paths.
///
/// Returns candidate paths to check, in order of preference, with duplicates
/// removed (the first occurrence wins). Paths are only constructed here; the
/// filesystem is not consulted.
///
/// Candidates are produced in this order:
/// 1. If `target` contains `/` or `\`: the target itself when it is absolute,
///    otherwise the target joined onto each of `root_dirs`.
/// 2. The target joined onto the directory of `source_path`, followed by the
///    same path with `.md` appended when the target does not already end in
///    `.md`.
/// 3. The target (with `.md` appended if missing) joined onto each of
///    `root_dirs`.
///
/// A target that is empty after trimming yields no candidates.
pub fn resolve_link_candidates(
    target: &str,
    source_path: &Path,
    root_dirs: &[std::path::PathBuf],
) -> Vec<std::path::PathBuf> {
    // Clean up the target path
    let target = target.trim();

    // An empty target names nothing. Joining it would otherwise produce the
    // source directory itself, `<parent>.md` and `<root>/.md` as candidates.
    if target.is_empty() {
        return Vec::new();
    }

    let has_md_suffix = target.ends_with(".md");
    let mut candidates = Vec::new();

    // 1. Direct path - if it looks like a path
    if target.contains('/') || target.contains('\\') {
        let direct = std::path::PathBuf::from(target);
        if direct.is_absolute() {
            candidates.push(direct);
        } else {
            // Relative to each root dir
            for root in root_dirs {
                candidates.push(root.join(&direct));
            }
        }
    }

    // 2. Relative to source file's directory
    if let Some(source_dir) = source_path.parent() {
        let relative = source_dir.join(target);
        candidates.push(relative.clone());
        if !has_md_suffix {
            // Append `.md` to the final component. `with_extension("md")` is
            // deliberately not used: it replaces whatever follows the last
            // dot, turning `v1.2 notes` into `v1.md` and `image.png` into
            // `image.md`, which point at unrelated files.
            let mut with_md = relative.clone();
            with_md.set_file_name(format!(
                "{}.md",
                relative.file_name().unwrap_or_default().to_string_lossy()
            ));
            candidates.push(with_md);
        }
    }

    // 3. Filename with .md extension in root dirs
    let target_with_md = if has_md_suffix {
        target.to_string()
    } else {
        format!("{}.md", target)
    };
    for root in root_dirs {
        candidates.push(root.join(&target_with_md));
    }

    // 4. Remove duplicates while preserving order
    let mut seen = std::collections::HashSet::new();
    candidates.retain(|p| seen.insert(p.clone()));
    candidates
}
/// Read the `aliases` entry from a note's YAML frontmatter.
///
/// Obsidian lets a note declare alternative names under `aliases`; those names
/// can then be used as wikilink targets.
///
/// Accepted shapes of the `aliases` value:
/// - a list: every string entry is returned, non-string entries are skipped;
/// - a single string: returned as a one-element list.
///
/// Any other situation (frontmatter that fails to parse, no frontmatter data,
/// frontmatter that is not a mapping, no `aliases` key, or a value of another
/// type) produces an empty list. This function currently always returns `Ok`.
pub fn extract_aliases(content: &str) -> Result<Vec<String>> {
    let parsed = gray_matter::Matter::<gray_matter::engine::YAML>::new().parse(content);

    // Only a frontmatter mapping can carry an `aliases` key.
    let fields = match parsed.ok().and_then(|p| p.data) {
        Some(gray_matter::Pod::Hash(fields)) => fields,
        _ => return Ok(Vec::new()),
    };

    let aliases = match fields.get("aliases") {
        Some(gray_matter::Pod::Array(entries)) => entries
            .iter()
            .filter_map(|entry| match entry {
                gray_matter::Pod::String(alias) => Some(alias.clone()),
                _ => None,
            })
            .collect(),
        // Single alias as string
        Some(gray_matter::Pod::String(alias)) => vec![alias.clone()],
        _ => Vec::new(),
    };

    Ok(aliases)
}
// Unit tests for link extraction, candidate resolution and alias parsing.
#[cfg(test)]
mod tests {
use super::*;
// All tests share the nil UUID as the source media id; its value is never
// asserted on.
fn test_media_id() -> MediaId {
MediaId(Uuid::nil())
}
// A bare `[[target]]` yields one wikilink whose text falls back to the target.
#[test]
fn test_extract_simple_wikilink() {
let content = "This is a [[simple link]] in text.";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 1);
assert_eq!(links[0].target_path, "simple link");
assert_eq!(links[0].link_type, LinkType::Wikilink);
assert_eq!(links[0].link_text, Some("simple link".to_string()));
}
// `[[target|display]]` splits into target path and display text.
#[test]
fn test_extract_wikilink_with_display() {
let content = "Check out [[target note|this article]] for more.";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 1);
assert_eq!(links[0].target_path, "target note");
assert_eq!(links[0].link_text, Some("this article".to_string()));
}
// `![[target]]` is reported once, as an embed (not also as a wikilink).
#[test]
fn test_extract_embed() {
let content = "Here is an image: ![[image.png]]";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 1);
assert_eq!(links[0].target_path, "image.png");
assert_eq!(links[0].link_type, LinkType::Embed);
}
// `[text](path)` with an internal path yields a markdown link.
#[test]
fn test_extract_markdown_link() {
let content = "Read [the documentation](docs/README.md) for details.";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 1);
assert_eq!(links[0].target_path, "docs/README.md");
assert_eq!(links[0].link_type, LinkType::MarkdownLink);
assert_eq!(links[0].link_text, Some("the documentation".to_string()));
}
// https and mailto targets are treated as external and dropped.
#[test]
fn test_skip_external_links() {
let content = "Visit [our site](https://example.com) or [email us](mailto:test@test.com).";
let links = extract_links(test_media_id(), content);
assert!(links.is_empty());
}
// A document mixing all three syntaxes: two wikilinks, one markdown link and
// one embed, four links in total.
#[test]
fn test_multiple_links() {
let content = r#"
# My Note
This links to [[Note A]] and also [[Note B|Note B Title]].
We also have a markdown link to [config](./config.md).
And an embedded image: ![[diagram.png]]
"#;
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 4);
let types: Vec<_> = links.iter().map(|l| l.link_type).collect();
assert!(types.contains(&LinkType::Wikilink));
assert!(types.contains(&LinkType::Embed));
assert!(types.contains(&LinkType::MarkdownLink));
}
// Line numbers are 1-indexed.
#[test]
fn test_line_numbers() {
let content = "Line 1\n[[link on line 2]]\nLine 3";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 1);
assert_eq!(links[0].line_number, Some(2));
}
// A bare note name must produce at least one candidate ending in `.md`.
#[test]
fn test_resolve_candidates() {
let source_path = std::path::Path::new("/notes/projects/readme.md");
let root_dirs = vec![std::path::PathBuf::from("/notes")];
let candidates = resolve_link_candidates("My Note", source_path, &root_dirs);
// Should include relative path and .md variations
assert!(!candidates.is_empty());
assert!(candidates
.iter()
.any(|p| p.to_string_lossy().contains("My Note.md")));
}
// `aliases` given as a YAML list returns every entry in order.
#[test]
fn test_extract_aliases() {
let content = r#"---
title: My Note
aliases:
- Alternative Name
- Another Alias
---
# Content here
"#;
let aliases = extract_aliases(content).unwrap();
assert_eq!(aliases, vec!["Alternative Name", "Another Alias"]);
}
// `aliases` given as a single scalar returns a one-element list.
#[test]
fn test_extract_single_alias() {
let content = r#"---
title: My Note
aliases: Single Alias
---
# Content
"#;
let aliases = extract_aliases(content).unwrap();
assert_eq!(aliases, vec!["Single Alias"]);
}
// A wikilink and an embed on the same line are each reported exactly once,
// under their own link type.
#[test]
fn test_wikilink_not_matching_embed() {
let content = "A wikilink [[note]] and an embed ![[image.png]]";
let links = extract_links(test_media_id(), content);
assert_eq!(links.len(), 2);
let wikilinks: Vec<_> = links
.iter()
.filter(|l| l.link_type == LinkType::Wikilink)
.collect();
let embeds: Vec<_> = links
.iter()
.filter(|l| l.link_type == LinkType::Embed)
.collect();
assert_eq!(wikilinks.len(), 1);
assert_eq!(embeds.len(), 1);
assert_eq!(wikilinks[0].target_path, "note");
assert_eq!(embeds[0].target_path, "image.png");
}
}